diff --git a/.ansible-lint b/.ansible-lint
new file mode 100644
index 000000000..2a8fdbfe6
--- /dev/null
+++ b/.ansible-lint
@@ -0,0 +1,2 @@
+skip_list:
+  - var-naming[no-role-prefix]
diff --git a/.metadata/omnia_version b/.metadata/omnia_version
index d7fa96b58..de19e429c 100644
--- a/.metadata/omnia_version
+++ b/.metadata/omnia_version
@@ -1 +1,2 @@
-omnia_version: 1.6.1
\ No newline at end of file
+omnia_version: 1.7
+omnia_installation_path: ""
diff --git a/accelerator/accelerator.yml b/accelerator/accelerator.yml
index 302ec43df..2812067fa 100644
--- a/accelerator/accelerator.yml
+++ b/accelerator/accelerator.yml
@@ -1,4 +1,4 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,34 +13,34 @@
 # limitations under the License.
 ---
+- name: Check if virtual environment is active
+  ansible.builtin.import_playbook: ../utils/check_venv.yml
+  when: not ( check_venv_executed | default(false) | bool )
+
+- name: Update Inventory with ansible_host information
+  ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml
+
 - name: Validate accelerator inputs
   hosts: localhost
   gather_facts: true
   connection: local
   roles:
     - accelerator_validation
-  tags: amd, nvidia
+  tags: amd, nvidia, intel
 
 - name: Update Repositories/Registries on nodes
   ansible.builtin.import_playbook: ../utils/update_user_repo.yml
   when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool )
 
-# - name: Validate repo file and subscription
-#   hosts: all
-#   gather_facts: true
-#   roles:
-#     - repo_validation
-#   tags: amd, nvidia
-
 - name: Gather Cluster Facts
-  hosts: all
+  hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
   gather_facts: true
   roles:
     - common
-  tags: amd, nvidia
+  tags: amd, nvidia, intel
 
 - name: Perform GPU driver and ROCm installation for AMD Accelerators
-  hosts: all
+  hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
   gather_facts: true
   any_errors_fatal: true
   roles:
@@ -66,3 +66,14 @@
 #     - name: Reboot node
 #       ansible.builtin.reboot:
 #   tags: nvidia
+
+- name: Install Intel Gaudi drivers on nodes
+  hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
+  gather_facts: true
+  any_errors_fatal: true
+  roles:
+    - intel
+  tags: intel
+
+- name: Import playbook to set performance profile on nodes
+  ansible.builtin.import_playbook: "../utils/performance_profile/performance_profile.yml"
diff --git a/accelerator/ansible.cfg b/accelerator/ansible.cfg
index 7c8bb3413..59a7e2596 100644
--- a/accelerator/ansible.cfg
+++ b/accelerator/ansible.cfg
@@ -3,6 +3,8 @@ log_path = /var/log/omnia/accelerator.log
 host_key_checking = false
 forks = 5
 timeout = 180
+collections_path = $VIRTUAL_ENV
+executable = /bin/bash
 
 [persistent_connection]
 command_timeout = 180
@@ -10,4 +12,4 @@ connect_timeout = 180
 
 [ssh_connection]
 retries = 3
-ssh_args = -o ControlMaster=auto -o ControlPersist=180
\ No newline at end of file
+ssh_args = -o ControlMaster=auto -o ControlPersist=180
diff --git a/accelerator/roles/accelerator_validation/tasks/main.yml b/accelerator/roles/accelerator_validation/tasks/main.yml
index c4abce679..3e86d9214 100644
--- a/accelerator/roles/accelerator_validation/tasks/main.yml
+++ b/accelerator/roles/accelerator_validation/tasks/main.yml
@@ -13,12 +13,33 @@
 # limitations under the License.
 ---
-- name: Saving distribution of os
+- name: Saving distribution and version of OS
   ansible.builtin.set_fact:
-    control_plane_os: "{{ ansible_distribution | lower }}"
+    oim_os: "{{ ansible_distribution | lower }}"
+    oim_os_version: "{{ ansible_distribution_version | lower }}"
 
 - name: Include local_repo variables
   ansible.builtin.include_tasks: include_local_repo_config.yml
 
 - name: Check xcat installation status
   ansible.builtin.include_tasks: validate_amd.yml
+
+- name: Check Intel Gaudi configuration status
+  ansible.builtin.include_tasks: validate_intel_gaudi.yml
+
+- name: Debug intel_gaudi_config_status
+  ansible.builtin.debug:
+    msg: "intel_gaudi_config_status is {{ hostvars['localhost']['intel_gaudi_config_status'] }}"
+
+- name: Debug amdgpu_config_status
+  ansible.builtin.debug:
+    msg: "amdgpu_config_status is {{ hostvars['localhost']['amdgpu_config_status'] }}"
+
+- name: Check if both intel_gaudi_config_status and amdgpu_config_status are false
+  ansible.builtin.fail:
+    msg: "{{ driver_not_found_msg }}"
+  when: >
+    hostvars['localhost']['intel_gaudi_config_status'] is defined and
+    hostvars['localhost']['amdgpu_config_status'] is defined and
+    not hostvars['localhost']['intel_gaudi_config_status'] | bool and
+    not hostvars['localhost']['amdgpu_config_status'] | bool
diff --git a/accelerator/roles/accelerator_validation/tasks/validate_amd.yml b/accelerator/roles/accelerator_validation/tasks/validate_amd.yml
index da883c189..3003d2978 100644
--- a/accelerator/roles/accelerator_validation/tasks/validate_amd.yml
+++ b/accelerator/roles/accelerator_validation/tasks/validate_amd.yml
@@ -25,8 +25,8 @@
     file: "{{ software_config_json_file }}"
     name: user_config
 
-- name: Include vars for {{ control_plane_os }}
-  ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml"
+- name: Include vars for {{ oim_os }}
+  ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml"
 
 - name: Get amdgpu status
   ansible.builtin.set_fact:
@@ -47,10 +47,11 @@
   loop_control:
     loop_var: item
 
-- name: Failed, AMDGPU ROCm software stack not present in software_config.json
-  ansible.builtin.fail:
-    msg: "{{ amdgpu_input_fail_msg }}"
-  when: not amdgpu_input_status
+- name: Check if the rocm offline repo exists
+  ansible.builtin.stat:
+    path: "{{ offline_rocm_directory }}/rocm/"
+  register: check_rocm_repo
+  when: rocm_input_status
 
 - name: Set amdgpu_config_status
   when: amdgpu_input_status
@@ -63,28 +64,28 @@
     ansible.builtin.set_fact:
       amdgpu_directory: "{{ offline_rocm_directory }}/amdgpu/{{ amdgpu_version }}/"
 
-    - name: Check amdgpu_version exists or not
+    - name: Check amdgpu version directory exists or not
       ansible.builtin.stat:
         path: "{{ amdgpu_directory }}"
       register: check_amdgpu_dir
-      failed_when: not check_amdgpu_dir.stat.exists
 
-    - name: Set amdgpu_config_status to true
+    - name: Set amdgpu_config_status based on directory existence
       ansible.builtin.set_fact:
-        amdgpu_config_status: true
-      when: check_amdgpu_dir.stat.exists
+        amdgpu_config_status: "{{ check_amdgpu_dir.stat.exists }}"
 
   rescue:
-    - name: Failed, amdgpu directory repo not found
-      ansible.builtin.fail:
-        msg: "{{ amdgpu_repo_fail_msg }}"
-      when: not check_amdgpu_dir.stat.exists
+    - name: Log an error message
+      ansible.builtin.debug:
+        msg: "{{ amdgpu_fail_msg }}"
 
-    - name: Failed, amdgpu version not found
-      ansible.builtin.fail:
-        msg: "{{ amdgpu_version_fail_msg }}"
+    - name: Set amdgpu_config_status to false
+      ansible.builtin.set_fact:
+        amdgpu_config_status: false
 
 - name: Set rocm_config_status
-  when: rocm_input_status
+  when:
+    - rocm_input_status
+    - user_config.repo_config == 'always' or user_config.repo_config == 'partial'
+    - check_rocm_repo.stat.exists
   block:
     - name: Fetch rocm_version
       ansible.builtin.set_fact:
@@ -98,18 +99,38 @@
       ansible.builtin.stat:
         path: "{{ rocm_directory }}"
       register: check_rocm_dir
-      failed_when: not check_rocm_dir.stat.exists
+
+    - name: Set rocm_config_status based on directory existence
+      ansible.builtin.set_fact:
+        rocm_config_status: "{{ check_rocm_dir.stat.exists }}"
+
+  rescue:
+    - name: Log an error message
+      ansible.builtin.debug:
+        msg: "{{ amdgpu_fail_msg }}"
+
+    - name: Set rocm_config_status to false
+      ansible.builtin.set_fact:
+        rocm_config_status: false
+
+- name: Set rocm_config_status
+  when:
+    - rocm_input_status
+    - user_config.repo_config == 'never' or user_config.repo_config == 'partial'
+    - not check_rocm_repo.stat.exists
+  block:
+    - name: Fetch rocm_version
+      ansible.builtin.set_fact:
+        rocm_version: "{{ user_config.amdgpu | selectattr('name', 'equalto', 'rocm') | map(attribute='version') | first }}"
 
     - name: Set rocm_config_status to true
       ansible.builtin.set_fact:
         rocm_config_status: true
-      when: check_rocm_dir.stat.exists
 
   rescue:
-    - name: Failed, rocm directory repo not found
-      ansible.builtin.fail:
-        msg: "{{ rocm_repo_fail_msg }}"
-      when: not check_rocm_dir.stat.exists
-
-    - name: Failed, rocm version not found
-      ansible.builtin.fail:
-        msg: "{{ rocm_version_fail_msg }}"
+    - name: Log an error message
+      ansible.builtin.debug:
+        msg: "{{ amdgpu_fail_msg }}"
+
+    - name: Set rocm_config_status to false
+      ansible.builtin.set_fact:
+        rocm_config_status: false
diff --git a/accelerator/roles/accelerator_validation/tasks/validate_intel_gaudi.yml b/accelerator/roles/accelerator_validation/tasks/validate_intel_gaudi.yml
new file mode 100644
index 000000000..e57ab6fc8
--- /dev/null
+++ b/accelerator/roles/accelerator_validation/tasks/validate_intel_gaudi.yml
@@ -0,0 +1,99 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
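+#
+# Overview: this file mirrors validate_amd.yml for Intel Gaudi. It reads
+# software_config.json for an 'intelgaudi' entry, checks that the offline
+# repository created by local_repo.yml is present, and records the outcome in
+# intel_gaudi_config_status / habana_config_status so that main.yml can fail
+# early when no GPU driver stack is configured.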
+---
+
+- name: Set default intel gaudi status
+  ansible.builtin.set_fact:
+    habana_config_status: false
+    habana_input_status: false
+    intel_gaudi_config_status: false
+    intel_gaudi_input_status: false
+
+- name: Load software_config.json
+  ansible.builtin.include_vars:
+    file: "{{ software_config_json_file }}"
+    name: user_config
+
+- name: Include vars for {{ oim_os }}
+  ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml"
+
+- name: Get Intel Gaudi status
+  ansible.builtin.set_fact:
+    intel_gaudi_input_status: true
+  loop: "{{ user_config.softwares | default([]) }}"
+  when:
+    - "'intelgaudi' in item.name"
+  loop_control:
+    loop_var: item
+
+- name: Get habana status only when intelgaudi is present in software_config.json
+  ansible.builtin.set_fact:
+    habana_input_status: true
+  loop: "{{ user_config.gaudi | default([]) }}"
+  when:
+    - intel_gaudi_input_status
+    - "'intel' in item.name"
+  loop_control:
+    loop_var: item
+
+- name: Set intel_gaudi_config_status
+  when: intel_gaudi_input_status
+  block:
+    - name: Fetch intelgaudi_version
+      ansible.builtin.set_fact:
+        intelgaudi_version: "{{ user_config.softwares | selectattr('name', 'equalto', 'intelgaudi') | map(attribute='version') | first }}"
+
+    - name: Set intelgaudi_directory
+      ansible.builtin.set_fact:
+        intelgaudi_directory: "{{ offline_intelgaudi_directory }}/intelgaudi/{{ intelgaudi_version }}/"
+
+    - name: Set gaudi_directory
+      ansible.builtin.set_fact:
+        gaudi_directory: "{{ intelgaudi_directory }}"
+
+    - name: Check gaudi_directory exists or not
+      ansible.builtin.stat:
+        path: "{{ gaudi_directory }}"
+      register: check_gaudi_dir
+
+    - name: Set intel_gaudi_config_status to true
+      ansible.builtin.set_fact:
+        intel_gaudi_config_status: true
+      when: check_gaudi_dir.stat.exists
+
+  rescue:
+    - name: Intel Gaudi not found
+      ansible.builtin.debug:
+        msg: "{{ intel_gaudi_repo_fail_msg }}"
+      when: not check_gaudi_dir.stat.exists
+
+- name: Set habana_config_status
+  when: habana_input_status
+  block:
+
+    - name: Check driver packages inside offline_gaudi_directory
+      ansible.builtin.find:
+        paths: "{{ offline_gaudi_directory }}"
+        patterns: "{{ gaudi_search_pattern }}"
+      register: check_driver_packages
+
+    - name: Set habana_config_status to true
+      ansible.builtin.set_fact:
+        habana_config_status: true
+      when: check_driver_packages.matched > 0
+  rescue:
+    - name: Intel Gaudi driver packages not found
+      ansible.builtin.debug:
+        msg: "{{ intel_gaudi_repo_fail_msg }}"
+      when: check_driver_packages.matched == 0
diff --git a/accelerator/roles/accelerator_validation/vars/main.yml b/accelerator/roles/accelerator_validation/vars/main.yml
index f7be83d64..9d6cb034a 100644
--- a/accelerator/roles/accelerator_validation/vars/main.yml
+++ b/accelerator/roles/accelerator_validation/vars/main.yml
@@ -22,7 +22,16 @@ amdgpu_version_fail_msg: "Failed, software_config.json does not have the version
 amdgpu_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading AMDGPU packages."
 rocm_version_fail_msg: "Failed, software_config.json does not have the version for ROCM."
 rocm_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading ROCM packages."
+amdgpu_fail_msg: "An error occurred while validating the AMD GPU/ROCm offline repository."
 
 # Usage: include_local_repo_config.yml
 local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml"
 local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again."
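+
+# Illustrative software_config.json entries consumed by validate_intel_gaudi.yml
+# (the version value is a placeholder, not a recommendation):
+#   "softwares": [ { "name": "intelgaudi", "version": "<driver version>" } ],
+#   "gaudi": [ { "name": "intel" } ]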
+
+# Usage: validate_intel_gaudi.yml
+intel_gaudi_input_fail_msg: "Failed, software_config.json does not have the intelgaudi software stack."
+intel_gaudi_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading Intel Gaudi driver packages."
+
+# Usage: main.yml
+driver_not_found_msg: >
+  Please ensure that either 'intelgaudi' or 'amdgpu' is included in
+  'software_config.json' and then run 'accelerator.yml' to install GPU drivers.
diff --git a/accelerator/roles/accelerator_validation/vars/ubuntu.yml b/accelerator/roles/accelerator_validation/vars/ubuntu.yml
index 4c44815ac..bb7759fc5 100644
--- a/accelerator/roles/accelerator_validation/vars/ubuntu.yml
+++ b/accelerator/roles/accelerator_validation/vars/ubuntu.yml
@@ -15,3 +15,8 @@
 
 # Usage: validate_amd.yml
 offline_rocm_directory: "{{ repo_store_path }}/cluster/apt"
+
+# Usage: validate_intel_gaudi.yml
+offline_intelgaudi_directory: "{{ repo_store_path }}/cluster/apt"
+offline_gaudi_directory: "{{ repo_store_path }}/cluster/{{ oim_os }}/{{ oim_os_version }}/deb"
+gaudi_search_pattern: "habanalabs*.deb"
diff --git a/accelerator/roles/amd/tasks/amd_rhel.yml b/accelerator/roles/amd/tasks/amd_rhel.yml
index c2a79f490..5a3e99735 100644
--- a/accelerator/roles/amd/tasks/amd_rhel.yml
+++ b/accelerator/roles/amd/tasks/amd_rhel.yml
@@ -1,4 +1,4 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,34 +17,53 @@
   ansible.builtin.package:
     name: amdgpu-dkms
     enablerepo: amd_gpu
-    state: present
+    state: latest # noqa package-latest
 
 - name: Reboot after installing GPU drivers
   ansible.builtin.reboot:
 
-- name: Install ROCm packages
-  ansible.builtin.package:
-    name: "{{ rocm_packages }}"
-    enablerepo: ROCm
-    state: present
-
-- name: Check if environment variables are set
-  ansible.builtin.command: echo $PATH
-  changed_when: false
-  failed_when: false
-  register: path_output
-
-- name: Perform Post Installation steps
-  when: "'rocm' not in path_output.stdout"
+- name: Verify Repo and Install ROCm packages
   block:
-    - name: Check current environment variables
-      ansible.builtin.shell: echo $PATH
+    - name: Install ROCm packages
+      ansible.builtin.package:
+        name: "{{ rocm_packages }}"
+        enablerepo: ROCm
+        state: latest # noqa package-latest
+
+    - name: Check if environment variables are set
+      ansible.builtin.command: echo $PATH
       changed_when: false
-      register: environment_output
-
-    - name: Replace PATH variable
-      ansible.builtin.lineinfile:
-        path: /etc/bashrc
-        regexp: '^PATH=*'
-        insertafter: EOF
-        line: 'PATH={{ environment_output.stdout }}:/opt/rocm/bin/'
+      failed_when: false
+      register: path_output
+
+    - name: Perform Post Installation steps
+      when: "'rocm' not in path_output.stdout"
+      block:
+        - name: Check current environment variables
+          ansible.builtin.shell: echo $PATH
+          changed_when: false
+          register: environment_output
+
+        - name: Replace PATH variable
+          ansible.builtin.lineinfile:
+            path: /etc/bashrc
+            regexp: '^PATH=*'
+            insertafter: EOF
+            line: 'PATH={{ environment_output.stdout }}:/opt/rocm/bin/'
+
+    - name: Ensure rocm.conf exists and add library paths
+      ansible.builtin.copy:
+        dest: "{{ linker_dest_path }}"
+        content: |
+          /opt/rocm/lib
+          /opt/rocm/lib64
+        mode: "{{ file_permission }}"
+
+    - name: Run ldconfig to update dynamic linker bindings
+      ansible.builtin.command: ldconfig
+      changed_when: false
+
+  rescue:
+    - name: Warning, rocm repo not configured
+      ansible.builtin.pause:
+        prompt: "{{ rocm_warning_msg }}"
+        seconds: "{{ warning_time }}"
diff --git a/accelerator/roles/amd/tasks/amd_ubuntu.yml b/accelerator/roles/amd/tasks/amd_ubuntu.yml
index 68537d921..c24104029 100644
--- a/accelerator/roles/amd/tasks/amd_ubuntu.yml
+++ b/accelerator/roles/amd/tasks/amd_ubuntu.yml
@@ -1,4 +1,4 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,33 +16,68 @@
 - name: Install kernel mode driver
   ansible.builtin.package:
     name: amdgpu-dkms
-    state: present
+    state: latest # noqa package-latest
 
 - name: Reboot after installing GPU drivers
   ansible.builtin.reboot:
 
-- name: Install ROCm packages
-  ansible.builtin.package:
-    name: "{{ rocm_packages }}"
-    state: present
+- name: Verify Repo and Install ROCm packages
+  block:
+    - name: Load local_repo_access.yml file
+      ansible.builtin.include_vars: "{{ local_repo_access_path }}"
 
-- name: Check if environment variables are set
-  ansible.builtin.command: echo $PATH
-  changed_when: false
-  failed_when: false
-  register: path_output
+    - name: Check if the ROCm preference file exists
+      ansible.builtin.stat:
+        path: "{{ rocm_preference_dst }}"
+      register: rocm_preference_src_stat
 
-- name: Perform Post Installation steps
-  when: "'rocm' not in path_output.stdout"
-  block:
-    - name: Check current environment variables
-      ansible.builtin.shell: echo $PATH
+    - name: Create ROCm preference file
+      ansible.builtin.template:
+        src: "{{ rocm_preference_src }}"
+        dest: "{{ rocm_preference_dst }}"
+        mode: "{{ preference_file_mode }}"
+      when:
+        - not rocm_preference_src_stat.stat.exists
+
+    - name: Install ROCm packages
+      ansible.builtin.package:
+        name: "{{ rocm_packages }}"
+        state: latest # noqa package-latest
+
+    - name: Check if environment variables are set
+      ansible.builtin.command: echo $PATH
       changed_when: false
-      register: environment_output
-
-    - name: Replace PATH variable
-      ansible.builtin.lineinfile:
-        path: /root/.bashrc
-        regexp: '^PATH=*'
-        insertafter: EOF
-        line: 'PATH={{ environment_output.stdout }}:/opt/rocm/bin/'
+      failed_when: false
+      register: path_output
+
+    - name: Perform Post Installation steps
+      when: "'rocm' not in path_output.stdout"
+      block:
+        - name: Check current environment variables
+          ansible.builtin.shell: echo $PATH
+          changed_when: false
+          register: environment_output
+
+        - name: Replace PATH variable
+          ansible.builtin.lineinfile:
+            path: /root/.bashrc
+            regexp: '^PATH=*'
+            insertafter: EOF
+            line: 'PATH={{ environment_output.stdout }}:/opt/rocm/bin/'
+
+    - name: Ensure rocm.conf exists and add library paths
+      ansible.builtin.copy:
+        dest: "{{ linker_dest_path }}"
+        content: |
+          /opt/rocm/lib
+          /opt/rocm/lib64
+        mode: "{{ file_permission }}"
+
+    - name: Run ldconfig to update dynamic linker bindings
+      ansible.builtin.command: ldconfig
+      changed_when: false
+  rescue:
+    - name: Warning, rocm repo not configured
+      ansible.builtin.pause:
+        prompt: "{{ rocm_warning_msg }}"
+        seconds: "{{ warning_time }}"
diff --git a/accelerator/roles/amd/templates/rocm_preferences_ubuntu.j2 b/accelerator/roles/amd/templates/rocm_preferences_ubuntu.j2
new file mode 100644
index 000000000..1baa52c47
--- /dev/null
+++ b/accelerator/roles/amd/templates/rocm_preferences_ubuntu.j2
@@ -0,0 +1,3 @@
+Package: rocm*
+Pin: origin {{ admin_nic_ip }}
+Pin-Priority: 600
\ No newline at end of file
diff --git a/accelerator/roles/amd/vars/main.yml b/accelerator/roles/amd/vars/main.yml
index dff3a452c..4b4e53f28 100644
--- a/accelerator/roles/amd/vars/main.yml
+++ b/accelerator/roles/amd/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,4 +22,15 @@ os_ubuntu: "ubuntu"
 
 # Used: amd_rhel.yml
 rocm_packages:
-  - "rocm-hip-sdk{{ hostvars['127.0.0.1']['rocm_version'] }}*"
+  - "rocm"
+warning_time: 10
+rocm_warning_msg: "Unable to install ROCm on {{ ansible_host }} node. ROCm repository not configured on the node.
+Run local_repo.yml with the rocm software stack in software_config.json, or provide a ROCm repo via user_repo_url."
+file_permission: "0644"
+linker_dest_path: "/etc/ld.so.conf.d/rocm.conf"
+
+# Used: amd_ubuntu.yml
+local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml"
+rocm_preference_dst: "/etc/apt/preferences.d/rocm-pin-600"
+rocm_preference_src: "rocm_preferences_ubuntu.j2"
+preference_file_mode: '0644'
diff --git a/accelerator/roles/intel/tasks/install_ubuntu.yml b/accelerator/roles/intel/tasks/install_ubuntu.yml
new file mode 100644
index 000000000..a13fb6ce0
--- /dev/null
+++ b/accelerator/roles/intel/tasks/install_ubuntu.yml
@@ -0,0 +1,67 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
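+#
+# Installs the Intel Gaudi (habanalabs) driver stack on Ubuntu 22.04 nodes:
+# detects an accelerator via lspci, pins the habanalabs packages to the
+# intelgaudi version from software_config.json, rebuilds the DKMS modules for
+# the running kernel, and schedules a reboot-time cron job that brings the
+# Gaudi scale-out ports up.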
+---
+
+- name: Check if accelerator is present on node
+  ansible.builtin.include_tasks: verify_has_accelerators.yml
+
+- name: Install drivers on Gaudi nodes
+  when: node_has_accelerator
+  block:
+    - name: Gather package facts
+      ansible.builtin.package_facts:
+        manager: auto
+
+    - name: Set intelgaudi version
+      ansible.builtin.set_fact:
+        intelgaudi_version: "{{ item.version }}"
+      loop: "{{ software_config.softwares | default([]) }}"
+      when: "'intelgaudi' in item.name"
+      loop_control:
+        loop_var: item
+
+    - name: Check if kernel supported
+      ansible.builtin.fail:
+        msg: "Kernel {{ ansible_kernel }} is not supported; 5.4.0 or later is required."
+      when: ansible_kernel is version('5.4.0', '<')
+
+    - name: Base dependencies
+      ansible.builtin.apt:
+        name: "{{ intel_apt_base_packages | list }}"
+        state: latest # noqa package-latest
+        update_cache: true
+
+    - name: Update apt and install habanalabs dependencies
+      ansible.builtin.apt:
+        name: "{{ intel_habana_packages | list }}"
+        update_cache: true
+
+    - name: Remove (old) Gaudi kernel modules if present
+      community.general.modprobe:
+        name: "{{ item }}"
+        state: absent
+      loop: "{{ intel_gaudi_kernel_module_to_load }}"
+
+    - name: Make sure dkms module is built for the current kernel
+      ansible.builtin.command: /usr/lib/dkms/dkms_autoinstaller start
+      changed_when: true
+
+    - name: Add Gaudi kernel modules
+      community.general.modprobe:
+        name: "{{ item }}"
+        state: present
+      loop: "{{ intel_gaudi_kernel_module_to_load }}"
+
+    - name: Set cron job for scale out interfaces
+      ansible.builtin.include_tasks: make_sure_scale_out_interfaces_up.yml
diff --git a/upgrade/roles/telemetry_uninstall/tasks/main.yml b/accelerator/roles/intel/tasks/main.yml
similarity index 74%
rename from upgrade/roles/telemetry_uninstall/tasks/main.yml
rename to accelerator/roles/intel/tasks/main.yml
index 20da8f314..3b2e7cf68 100644
--- a/upgrade/roles/telemetry_uninstall/tasks/main.yml
+++ b/accelerator/roles/intel/tasks/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Intel Corporation.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@
 # limitations under the License.
 ---
-- name: Uninstall telemetry
-  ansible.builtin.import_tasks: telemetry_uninstall.yml
-  tags: telemetry
+- name: Install Intel Gaudi drivers for Ubuntu
+  ansible.builtin.include_tasks: install_ubuntu.yml
+  when:
+    - compute_os == "ubuntu"
+    - compute_os_version == "22.04"
diff --git a/accelerator/roles/intel/tasks/make_sure_scale_out_interfaces_up.yml b/accelerator/roles/intel/tasks/make_sure_scale_out_interfaces_up.yml
new file mode 100644
index 000000000..fb6cda55c
--- /dev/null
+++ b/accelerator/roles/intel/tasks/make_sure_scale_out_interfaces_up.yml
@@ -0,0 +1,79 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
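+#
+# Detects whether the node carries Gaudi2 or Gaudi3 devices, generates a
+# bring_up_ports.sh wrapper around manage_network_ifs.sh --up, and registers
+# it as an @reboot cron job so the Gaudi scale-out NICs come up after every
+# boot.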
+---
+
+- name: Check bring_up_ports.sh exists or not
+  ansible.builtin.stat:
+    path: "{{ intel_bring_up_ports_script_path }}"
+  register: check_bring_up_script
+
+- name: Init gaudi_ver variable
+  ansible.builtin.set_fact:
+    gaudi_ver: "gaudi2"
+
+- name: Check node has Gaudi3 or not
+  ansible.builtin.shell: |
+    set -o pipefail
+    lspci -n -d {{ gaudi3_pci_vendor_device_class }}
+  register: lspci_output
+  changed_when: false
+  failed_when: false
+  args:
+    executable: /bin/bash
+
+- name: Set gaudi_ver variable
+  ansible.builtin.set_fact:
+    gaudi_ver: "gaudi3"
+  when: lspci_output.stdout | length > 0
+
+- name: Create bring_up_ports.sh
+  when: not check_bring_up_script.stat.exists
+  ansible.builtin.blockinfile:
+    path: "{{ intel_bring_up_ports_script_path }}"
+    create: true
+    mode: "{{ file_permissions }}"
+    block: |
+      #!/bin/bash
+      /opt/habanalabs/qual/{{ gaudi_ver }}/bin/manage_network_ifs.sh --up
+      RET_CODE=$?
+      if [ "${RET_CODE}" -eq "1" ]; then
+        echo "One or more Gaudi ports are down." >> /dev/stderr
+        exit 1
+      fi
+
+- name: Change permission on bring_up_ports.sh file
+  ansible.builtin.file:
+    path: "{{ intel_bring_up_ports_script_path }}"
+    state: file
+    owner: root
+    group: root
+    mode: "{{ file_permissions }}"
+
+- name: Check for existing cron job
+  ansible.builtin.shell: |
+    set -o pipefail
+    crontab -l | grep -q "{{ intel_bring_up_ports_script_path }}"
+  args:
+    executable: /bin/bash
+  register: cron_job_check
+  failed_when: false
+  changed_when: false
+
+- name: Create cronjob for bringing up Gaudi ports
+  when: cron_job_check.rc != 0
+  ansible.builtin.cron:
+    name: "Bring up Gaudi ports"
+    special_time: reboot
+    job: "{{ intel_bring_up_ports_script_path }}"
+
diff --git a/accelerator/roles/intel/tasks/verify_has_accelerators.yml b/accelerator/roles/intel/tasks/verify_has_accelerators.yml
new file mode 100644
index 000000000..ecb24f2f8
--- /dev/null
+++ b/accelerator/roles/intel/tasks/verify_has_accelerators.yml
@@ -0,0 +1,33 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Initialise node accelerator status
+  ansible.builtin.set_fact:
+    node_has_accelerator: false
+
+- name: Check node accelerator status
+  ansible.builtin.shell: |
+    set -o pipefail
+    lspci | grep -i "{{ intel_gaudi_device_pattern }}"
+  register: lspci_output
+  changed_when: false
+  failed_when: false
+  args:
+    executable: /bin/bash
+
+- name: Update node accelerator status
+  ansible.builtin.set_fact:
+    node_has_accelerator: true
+  when: lspci_output.stdout | length > 0
diff --git a/accelerator/roles/intel/vars/main.yml b/accelerator/roles/intel/vars/main.yml
new file mode 100644
index 000000000..cfce992bd
--- /dev/null
+++ b/accelerator/roles/intel/vars/main.yml
@@ -0,0 +1,58 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+intel_gaudi_device_pattern: "Processing accelerators: Habana Labs Ltd."
+
+intel_gaudi_kernel_module_to_load:
+  - habanalabs
+  - habanalabs_cn
+  - habanalabs_en
+
+# TODO: move to a central config file
+intel_habana_packages:
+  - "habanalabs-dkms={{ intelgaudi_version }}"
+  - "habanalabs-firmware={{ intelgaudi_version }}"
+  - "habanalabs-firmware-tools={{ intelgaudi_version }}"
+  - "habanalabs-graph={{ intelgaudi_version }}"
+  - "habanalabs-qual={{ intelgaudi_version }}"
+  - "habanalabs-rdma-core={{ intelgaudi_version }}"
+  - "habanalabs-thunk={{ intelgaudi_version }}"
+  - "habanatools={{ intelgaudi_version }}"
+
+# TODO: move to a central config file
+intel_apt_base_packages:
+  - cmake
+  - curl
+  - dkms
+  - ethtool
+  - gcc
+  - iproute2
+  - libbz2-dev
+  - libelf-dev
+  - libibverbs-dev
+  - liblzma-dev
+  - librdmacm-dev
+  - linux-headers-{{ ansible_kernel }}
+  - linux-modules-extra-{{ ansible_kernel }}
+  - lsof
+  - moreutils
+  - numactl
+  - unzip
+  - wget
+  - libopenmpi3
+
+intel_bring_up_ports_script_path: "/opt/omnia/cronjobs/bring_up_ports.sh"
+file_permissions: "0755"
+gaudi3_pci_vendor_device_class: "1da3:1060:1200"
diff --git a/accelerator/roles/nvidia/tasks/initiate_nfs_server.yml b/accelerator/roles/nvidia/tasks/initiate_nfs_server.yml
index cc473f6ca..b72e78303 100644
--- a/accelerator/roles/nvidia/tasks/initiate_nfs_server.yml
+++ b/accelerator/roles/nvidia/tasks/initiate_nfs_server.yml
@@ -14,6 +14,7 @@
 ---
 
 - name: Setup NFS Server when cuda offline path is given
+  when: local_installer
   block:
     - name: Include vars file of inventory role
@@ -39,6 +40,7 @@
 
     # Setup nfs server when nfs_cuda status is set to true
     - name: Setup NFS Server with cuda toolkit
+      when: nfs_cuda is true
      block:
         - name: Setup nfs server on localhost
           ansible.builtin.include_tasks: setup_nfs_server.yml
@@ -48,7 +50,3 @@
             src: "{{ cuda_toolkit_path }}"
             dest: "{{ cuda_filepath }}"
             mode: "{{ cuda_nfs_permissions }}"
-
-      when: nfs_cuda is true
-
-  when: local_installer
diff --git a/accelerator/roles/nvidia/tasks/install_cuda_redhat.yml b/accelerator/roles/nvidia/tasks/install_cuda_redhat.yml
index 39738b5b4..f08f1bb0f 100644
--- a/accelerator/roles/nvidia/tasks/install_cuda_redhat.yml
+++ b/accelerator/roles/nvidia/tasks/install_cuda_redhat.yml
@@ -18,6 +18,7 @@
   when:
     - os_supported_rhel in ansible_facts['distribution'] | lower
     - not hostvars['127.0.0.1']['xcat_installation_status']
+  changed_when: false
 
 - name: Download development packages
   ansible.builtin.package:
@@ -35,6 +36,7 @@
     disable_gpg_check: true
 
 - name: Install cuda toolkit using offline path
+  when: hostvars['localhost']['local_installer']
   block:
     - name: Install packages from cuda rpm file
       ansible.builtin.yum:
@@ -42,9 +44,8 @@
         state: present
         disable_gpg_check: true
 
-  when: hostvars['localhost']['local_installer']
-
 - name: Install latest cuda toolkit using network way
+  when: not hostvars['localhost']['local_installer']
   block:
     - name: Set Redhat distro
       ansible.builtin.set_fact:
@@ -54,8 +55,6 @@
       ansible.builtin.command: dnf config-manager --add-repo "{{ cuda_repo_url }}"
       changed_when: false
 
-  when: not hostvars['localhost']['local_installer']
-
 - name: Delete xorg.conf file if present
   ansible.builtin.file:
     path: "/etc/X11/xorg.conf"
@@ -85,6 +84,7 @@
   register: path_output
 
 - name: Perform Post Installation steps
+  when: "'cuda' not in path_output.stdout"
   block:
     - name: Check current environment variables
       ansible.builtin.shell: echo $PATH
@@ -98,8 +98,6 @@
         insertafter: EOF
         line: 'PATH={{ environment_output.stdout }}:/usr/local/cuda/bin'
 
-  when: "'cuda' not in path_output.stdout"
-
 - name: Enable nvidia-persistenced systemd service
   ansible.builtin.service:
     name: nvidia-persistenced
diff --git a/accelerator/roles/nvidia/tasks/main.yml b/accelerator/roles/nvidia/tasks/main.yml
index 0d78f7baf..77ee74093 100644
--- a/accelerator/roles/nvidia/tasks/main.yml
+++ b/accelerator/roles/nvidia/tasks/main.yml
@@ -14,6 +14,8 @@
 ---
 
 - name: Install cuda toolkit when node has nvidia gpu accelerator installed
+  when:
+    - cuda_node_status
   block:
     - name: Include vars file of accelerator role
       ansible.builtin.include_vars: "{{ role_path }}/../../../input/accelerator_config.yml"
@@ -38,6 +40,3 @@
     - name: Install cuda on leap nodes
       ansible.builtin.include_tasks: install_cuda_leap.yml
       when: os_supported_leap in ansible_facts['distribution'] | lower
-
-  when:
-    - cuda_node_status
diff --git a/accelerator/roles/nvidia/tasks/setup_nfs_client.yml b/accelerator/roles/nvidia/tasks/setup_nfs_client.yml
index d7a62685c..839192bab 100644
--- a/accelerator/roles/nvidia/tasks/setup_nfs_client.yml
+++ b/accelerator/roles/nvidia/tasks/setup_nfs_client.yml
@@ -37,6 +37,5 @@
 
 - name: Mount NFS client
-  ansible.builtin.command: "mount -o {{ client_mount_options }} -t nfs {{ server_ip }}:{{ cuda_nfs_path }} {{ cuda_nfs_path }}"
+  ansible.builtin.command: "mount -o {{ client_mount_options }} -t nfs {{ server_ip }}:{{ cuda_nfs_path }} {{ cuda_nfs_path }}" # noqa command-instead-of-module
   changed_when: true
-  args:
-    warn: false
   when: cuda_nfs_path not in mounted_share.stdout
diff --git a/accelerator/roles/nvidia/tasks/setup_nfs_server.yml b/accelerator/roles/nvidia/tasks/setup_nfs_server.yml
index 2157199d0..f68c33146 100644
--- a/accelerator/roles/nvidia/tasks/setup_nfs_server.yml
+++ b/accelerator/roles/nvidia/tasks/setup_nfs_server.yml
@@ -66,7 +66,7 @@
   ansible.builtin.set_fact:
     ib_inventory_start_octets: "{{ groups['all'][0].split('.')[0:2] | join('.') }}"
 
-# NFS Server IP (control plane IP) should be in same range as that of NFS Clients(compute nodes IP)
+# NFS Server IP (Omnia Infrastructure Management (OIM) node IP) should be in the same range as the NFS clients (compute node IPs)
 - name: Find server IP in the range matching with node_inventory
   ansible.builtin.set_fact:
     server_ip: "{{ item }}"
diff --git a/accelerator/roles/nvidia/tasks/validations.yml b/accelerator/roles/nvidia/tasks/validations.yml
index 8908ef932..46a58dae6 100644
--- a/accelerator/roles/nvidia/tasks/validations.yml
+++ b/accelerator/roles/nvidia/tasks/validations.yml
@@ -33,6 +33,7 @@
   when: not local_installer
 
 - name: Check if correct cuda_toolkit file exists
+  when: local_installer
   block:
     - name: Verify if cuda_toolkit offline path is given
       ansible.builtin.assert:
@@ -56,5 +57,3 @@
 
     - name: Install nfs server setup
       ansible.builtin.include_tasks: initiate_nfs_server.yml
-
-  when: local_installer
diff --git a/accelerator/roles/repo_validation/tasks/main.yml b/accelerator/roles/repo_validation/tasks/main.yml
deleted file mode 100644
index 341a79e8a..000000000
--- a/accelerator/roles/repo_validation/tasks/main.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Saving distribution of os
-  ansible.builtin.set_fact:
-    compute_os: "{{ ansible_facts['distribution'] | lower }}"
-
-- name: Check repos configured when distribution os is rhel
-  when: compute_os == os_supported_redhat
-  block:
-    - name: Check the repos configured on the cluster nodes
-      ansible.builtin.include_tasks: repo_check.yml
-      when:
-        - hostvars['127.0.0.1']['xcat_installation_status']
-
-    - name: Check the repo files configured after update node is applied
-      ansible.builtin.include_tasks: repo_file_check.yml
-      when:
-        - hostvars['127.0.0.1']['xcat_installation_status']
-        - not repo_list_status
-
-    - name: Validate subscription when xcat is not installed
-      when:
-        - not hostvars['127.0.0.1']['xcat_installation_status']
-      block:
-        - name: Validate redhat subscription status
-          ansible.builtin.include_tasks: validate_rhsm.yml
diff --git a/accelerator/roles/repo_validation/tasks/repo_check.yml b/accelerator/roles/repo_validation/tasks/repo_check.yml
deleted file mode 100644
index 0a4421460..000000000
--- a/accelerator/roles/repo_validation/tasks/repo_check.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Initiallize variable
-  ansible.builtin.set_fact:
-    repo_list_status: false
-
-- name: Fetch xcat enabled repo list
-  ansible.builtin.command: dnf repolist
-  changed_when: false
-  register: dnf_repo_list
-
-- name: Check if repo list is valid
-  ansible.builtin.set_fact:
-    repo_list_status: true
-  when: 'repo_search_string in dnf_repo_list.stdout'
-
-- name: Check enabled repos using repo list
-  when: repo_list_status
-  block:
-    - name: Check xcat enabled repos
-      ansible.builtin.assert:
-        that:
-          - "'{{ item }}'| lower in dnf_repo_list.stdout | lower"
-        success_msg: "{{ xcat_repo_success_msg }}"
-        fail_msg: "{{ xcat_repo_fail_msg }}"
-      with_items:
-        - "{{ search_string }}"
diff --git a/accelerator/roles/repo_validation/tasks/repo_file_check.yml b/accelerator/roles/repo_validation/tasks/repo_file_check.yml
deleted file mode 100644
index 69d734a5d..000000000
--- a/accelerator/roles/repo_validation/tasks/repo_file_check.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Check for AppStream
-  ansible.builtin.command: grep -r "{{ search_string[0] }}" "{{ repo_path }}"
-  register: check_appstream
-  changed_when: false
-
-- name: Repo validation for AppStream
-  ansible.builtin.assert:
-    that: search_string[0] in check_appstream.stdout
-    success_msg: "{{ appstream_success_msg }}"
-    fail_msg: "{{ appstream_fail_msg }}"
-
-- name: Check for BaseOS
-  ansible.builtin.command: grep -r "{{ search_string[1] }}" "{{ repo_path }}"
-  register: check_baseos
-  changed_when: false
-
-- name: Repo validation for BaseOS
-  ansible.builtin.assert:
-    that: search_string[1] in check_baseos.stdout
-    success_msg: "{{ baseos_success_msg }}"
-    fail_msg: "{{ baseos_fail_msg }}"
diff --git a/accelerator/roles/repo_validation/tasks/validate_rhsm.yml b/accelerator/roles/repo_validation/tasks/validate_rhsm.yml
deleted file mode 100644
index 6e5b3ff7b..000000000
--- a/accelerator/roles/repo_validation/tasks/validate_rhsm.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Validate rhsm status
-  block:
-    - name: Fetch redhat subscription status
-      ansible.builtin.command: subscription-manager list
-      changed_when: false
-      register: rhsm_status
-
-    - name: Check redhat subscription status
-      ansible.builtin.assert:
-        that: "'Subscribed' in rhsm_status.stdout"
-
-    - name: Fetch redhat enabled repo list
-      ansible.builtin.command: subscription-manager repos --list-enabled
-      changed_when: false
-      register: rhsm_repo_list
-
-    - name: Check redhat enabled repos
-      ansible.builtin.assert:
-        that:
-          - "'appstream' in rhsm_repo_list.stdout"
-          - "'baseos' in rhsm_repo_list.stdout"
-  rescue:
-    - name: Redhat subscription is not enabled
-      ansible.builtin.fail:
-        msg:
-          - "Execution skipped for this host because of one of the following reasons:"
-          - "{{ compute_inactive_msg1 }}"
-          - "{{ compute_inactive_msg2 }}"
diff --git a/accelerator/roles/repo_validation/vars/main.yml b/accelerator/roles/repo_validation/vars/main.yml
deleted file mode 100644
index c6fd762fa..000000000
--- a/accelerator/roles/repo_validation/vars/main.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-# Usage: repo_check.yml, repo_file_check.yml
-search_string: ['AppStream', 'BaseOS']
-
-# Usage: repo_check.yml
-xcat_repo_success_msg: "AppStream and BaseOS repos validated successfully."
-xcat_repo_fail_msg: "Failed. AppStream and BaseOS repos not enabled."
-repo_search_string: "local-rhel"
-
-# Usage: validate_rhsm.yml
-os_supported_redhat: "redhat"
-manager_inactive_msg1: "1. Manager node does not have an active RedHat subscription."
-manager_inactive_msg2: "2. Manager node does not have baseos and appstream repos enabled."
-compute_inactive_msg1: "1. Host {{ ansible_ssh_host }} in group {{ group_names }} is not subscribed to redhat."
-compute_inactive_msg2: "2. Baseos and appstream repos are not enabled on host {{ ansible_ssh_host }}."
-
-# Usage: repo_file_check.yml
-appstream_success_msg: "AppStream repo is enabled"
-appstream_fail_msg: "Failed, AppStream repo not available"
-baseos_success_msg: "BaseOS repo is enabled"
-baseos_fail_msg: "Failed, BaseOS repo is not available"
-repo_path: "/etc/yum.repos.d/"
diff --git a/accelerator/tests/test_CUDA.yml b/accelerator/tests/test_CUDA.yml
index 690eaa40d..52aff89f2 100644
--- a/accelerator/tests/test_CUDA.yml
+++ b/accelerator/tests/test_CUDA.yml
@@ -30,6 +30,7 @@
 
     - name: Validate CUDA Version
       ansible.builtin.command: "ansible-playbook test_CUDA_validation.yml -i {{ cuda_inventory }}"
+      changed_when: false
      tags: TC_001, TC_002, TC_003, TC_004
 
 # Testcase OMNIA_1.4_CUDA_TC_005
@@ -49,4 +50,5 @@
 
     - name: Validate CUDA Version
       ansible.builtin.command: "ansible-playbook test_CUDA_validation.yml -i {{ inventory }}"
+      changed_when: false
       tags: TC_005
diff --git a/accelerator/tests/test_CUDA_validation.yml b/accelerator/tests/test_CUDA_validation.yml
index ef43251bd..fea99e38b 100644
--- a/accelerator/tests/test_CUDA_validation.yml
+++ b/accelerator/tests/test_CUDA_validation.yml
@@ -103,4 +103,4 @@
         that:
           - "'Success' in cuda_verify.stdout"
         success_msg: "{{ installation_pass }}"
-        fail_msg: "{{ installation_fail }}"
\ No newline at end of file
+        fail_msg: "{{ installation_fail }}"
diff --git a/accelerator/tests/test_Gaudi2.yml b/accelerator/tests/test_Gaudi2.yml
new file mode 100644
index 000000000..44bc747ed
--- /dev/null
+++ b/accelerator/tests/test_Gaudi2.yml
@@ -0,0 +1,68 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
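+#
+# Wrapper playbook for the Gaudi2 validation suites: each play shells out to
+# ansible-playbook so the driver, hl-qual, and HCCL checks can be run
+# individually via tags.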
+---
+
+# Verify Gaudi installation
+
+- name: OMNIA_Gaudi_TC_001
+  tags: TC_001, TC_003, TC_004
+  hosts: localhost
+  connection: local
+  vars_files:
+    - 'test_vars/test_Gaudi_vars.yml'
+  tasks:
+    - name: Validate Gaudi Driver Version
+      ansible.builtin.command:
+        argv:
+          - "ansible-playbook"
+          - "test_Gaudi_validation.yml"
+          - "-i"
+          - "{{ inventory }}"
+          - "-t"
+          - "TC_001,TC_002,TC_003,TC_004"
+      changed_when: false
+# Verify hl-qual
+
+- name: OMNIA_Gaudi_TC_007
+  tags: TC_007
+  hosts: localhost
+  connection: local
+  vars_files:
+    - 'test_vars/test_Gaudi_vars.yml'
+  tasks:
+    - name: Verify hl-qual
+      ansible.builtin.command:
+        argv:
+          - "ansible-playbook"
+          - "test_Gaudi2_hlqual_validation.yml"
+          - "-i"
+          - "{{ inventory }}"
+      changed_when: false
+# Verify HCCL
+
+- name: OMNIA_Gaudi_TC_008
+  tags: TC_008
+  hosts: localhost
+  connection: local
+  vars_files:
+    - 'test_vars/test_Gaudi_vars.yml'
+  tasks:
+    - name: Verify HCCL
+      ansible.builtin.command:
+        argv:
+          - "ansible-playbook"
+          - "test_Gaudi2_hccl_validation.yml"
+          - "-i"
+          - "{{ inventory }}"
+      changed_when: false
diff --git a/accelerator/tests/test_Gaudi2_hccl_validation.yml b/accelerator/tests/test_Gaudi2_hccl_validation.yml
new file mode 100644
index 000000000..fa3b1a653
--- /dev/null
+++ b/accelerator/tests/test_Gaudi2_hccl_validation.yml
@@ -0,0 +1,138 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
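+#
+# Runs the HabanaAI hccl_demo collectives (all_reduce, all_gather,
+# reduce_scatter, all2all) on a single 8-HPU node and fails when the reported
+# NW/Algo bandwidth drops below the expected Gaudi2 thresholds.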
+---
+
+- name: Verify HCCL
+  hosts: GPU_node
+  vars:
+    ansible_become: true
+    ansible_become_user: root
+
+    habana_tests:
+      log_level_all: "4"
+      enable_console: "true"
+      habana_logs: "/var/log/habana_logs"
+      gc_kernel_path: "/usr/lib/habanalabs/libtpc_kernels.so"
+      habana_scal_bin_path: "/opt/habanalabs/engines_fw"
+      habana_plugins_lib_path: "/opt/habanalabs/habana_plugins"
+      data_loader_aeon_lib_path: "/usr/lib/habanalabs/libaeon.so"
+      rdma_core_root: "/opt/habanalabs/rdma-core/src"
+      rdma_core_lib: "/opt/habanalabs/rdma-core/src/build/lib"
+
+    habana_extra:
+      hccl_comm_id: "127.0.0.1:5555"
+  tasks:
+    - name: Checking for the right amount of HPU devices
+      ansible.builtin.shell: |
+        set -o pipefail
+        lspci | grep 'accelerators.*Habana' | wc -l
+      register: hpu_dev
+      failed_when: (hpu_dev.stdout != "8")
+      changed_when: false
+
+    - name: Setting python version used for the test runs
+      ansible.legacy.set_fact:
+        pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}"
+
+    - name: Create test folder
+      ansible.builtin.tempfile:
+        state: directory
+        suffix: omnia_gaudi_hccl_test
+      register: test_folder
+
+    - name: Git clone hccl_demo
+      ansible.legacy.git:
+        repo: 'https://github.com/HabanaAI/hccl_demo.git'
+        dest: "{{ test_folder.path }}"
+
+    - name: Clean hccl_demo build tree before the test runs
+      ansible.builtin.shell: "cd {{ test_folder.path }} && make clean"
+      changed_when: false
+
+    - name: Hccl_demo all_reduce single node test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+        HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}"
+      ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test all_reduce --nranks 8 --loop 1000 --node_id 0 --size 256m --ranks_per_node 8" # noqa: yaml[line-length]
+      register: hccl_demo_all_reduce_single_node_test_result
+      failed_when: ((hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 127000) or (hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 73000)) # noqa: yaml[line-length]
+      changed_when: false
+
+    - name: Hccl_demo all_gather single node test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+        HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}"
+      ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test all_gather --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8" # noqa: yaml[line-length]
+      register: hccl_demo_all_gather_single_node_test_result
+      failed_when: ((hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 127000) or (hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 18000)) # noqa: yaml[line-length]
+      changed_when: false
+
+    - name: Hccl_demo reduce_scatter single node test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+        HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}"
+      ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test reduce_scatter --nranks 8 --loop 1000 --node_id 0 --size 64m" # noqa: yaml[line-length]
+      register: hccl_demo_reduce_scatter_single_node_test_result
+      failed_when: ((hccl_demo_reduce_scatter_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 127000) or (hccl_demo_reduce_scatter_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 142000)) # noqa: yaml[line-length]
+      changed_when: false
+
+    - name: Hccl_demo all2all single node test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+        HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}"
+      ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test all2all --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8" # noqa: yaml[line-length]
+      register: hccl_demo_all2all_single_node_test_result
+      failed_when: ((hccl_demo_all2all_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 127000) or (hccl_demo_all2all_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 71000)) # noqa: yaml[line-length]
+      changed_when: false
+
+    - name: Remove hccl_demo directory
+      ansible.builtin.file:
+        state: absent
+        path: "{{ test_folder.path }}"
diff --git a/accelerator/tests/test_Gaudi2_hlqual_validation.yml b/accelerator/tests/test_Gaudi2_hlqual_validation.yml
new file mode 100644
index 000000000..3a710dd13
--- /dev/null
+++ b/accelerator/tests/test_Gaudi2_hlqual_validation.yml
@@ -0,0 +1,224 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Verify hl-qual
+  hosts: GPU_node
+  vars:
+    ansible_become: true
+    ansible_become_user: root
+
+    habana_tests:
+      log_level_all: "4"
+      enable_console: "true"
+      habana_logs: "/var/log/habana_logs"
+      gc_kernel_path: "/usr/lib/habanalabs/libtpc_kernels.so"
+      habana_scal_bin_path: "/opt/habanalabs/engines_fw"
+      habana_plugins_lib_path: "/opt/habanalabs/habana_plugins"
+      data_loader_aeon_lib_path: "/usr/lib/habanalabs/libaeon.so"
+      rdma_core_root: "/opt/habanalabs/rdma-core/src"
+      rdma_core_lib: "/opt/habanalabs/rdma-core/src/build/lib"
+  tasks:
+    - name: Setting python version used for the test runs
+      ansible.legacy.set_fact:
+        pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}"
+
+    - name: Recursively change ownership of habana_logs directory
+      ansible.builtin.file:
+        path: "{{ habana_tests['habana_logs'] }}"
+        state: directory
+        recurse: true
+        mode: '0777'
+
+    - name: Hl_qual hardware sanity check test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -f2 -l extreme -t 60 -dis_mon"
+      register: sanity_test_result
+      failed_when: "'FAILED' in sanity_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual memory bandwidth test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -c all -rmod parallel -mb -memOnly -gaudi2 -dis_mon"
+      register: memory_bandwidth_test_result
+      failed_when: "'FAILED' in memory_bandwidth_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual pci bandwidth test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -c all -rmod serial -mb -b -pciOnly -sramOnly -gaudi2 -dis_mon"
+      register: pci_bandwidth_test_result
+      failed_when: "'FAILED' in pci_bandwidth_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual serdes base test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -i 50 -nic_base -test_type pairs -dis_mon"
+      register: serdes_base_test_result
+      failed_when: "'FAILED' in serdes_base_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual serdes base allreduce test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -i 50 -ep 100 -nic_base -test_type allreduce -dis_mon" # noqa: yaml[line-length]
+      register: serdes_base_allreduce_test_result
+      failed_when: "'FAILED' in serdes_base_allreduce_test_result.stdout"
+      changed_when: false
+
+    - name: Unload habanalabs kernel module
+      community.general.modprobe:
+        name: habanalabs
+        state: absent
+
+    - name: Load habanalabs kernel module with timeout_locked param
+      community.general.modprobe:
+        name: habanalabs
+        state: present
+        params: 'timeout_locked=0'
+
+    - name: Bring DOWN all Gaudi2 NICs
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./manage_network_ifs.sh --down"
+      changed_when: false
+
+    - name: Bring UP all Gaudi2 NICs
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./manage_network_ifs.sh --up"
+      changed_when: false
+
+    - name: Retry until HPU NICs are ready
+      ansible.builtin.shell: |
+        set -o pipefail
+        cd /opt/habanalabs/qual/gaudi2/bin && ./manage_network_ifs.sh --status | grep down | wc -l
+      register: result
+      until: (result.stdout == "0")
+      retries: 5
+      delay: 5
+      changed_when: false
+
+    - name: Hl_qual HBM DMA stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -i 1 -hbm_dma_stress -dis_mon"
+      register: hbm_dma_stress_test_result
+      failed_when: "'FAILED' in hbm_dma_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual HBM TPC stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -i 1 -hbm_tpc_stress -dis_mon"
+      register: hbm_tpc_stress_test_result
+      failed_when: "'FAILED' in hbm_tpc_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual power stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -s -t 120"
+      register: power_stress_test_result
+      failed_when: "'FAILED' in power_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual EDP stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -t 40 -e -Tw 3 -Ts 1"
+      register: edp_stress_test_result
+      failed_when: "'FAILED' in edp_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual e2e concurrency test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -t 30 -dis_mon -e2e_concurrency -disable_ports 8,22,23 -enable_ports_check int" # noqa: yaml[line-length]
+      register: e2e_concurrency_test_result
+      failed_when: "'FAILED' in e2e_concurrency_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual SER test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi2/bin && ./hl_qual -gaudi2 -c all -rmod parallel -dis_mon -ser"
+      register: ser_test_result
+      failed_when: "'FAILED' in ser_test_result.stdout"
+      changed_when: false
diff --git a/accelerator/tests/test_Gaudi3.yml b/accelerator/tests/test_Gaudi3.yml
new file mode 100644
index 000000000..314196eff
--- /dev/null
+++ b/accelerator/tests/test_Gaudi3.yml
@@ -0,0 +1,67 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Verify Gaudi installation
+
+- name: OMNIA_Gaudi_TC_001
+  tags: TC_001, TC_003, TC_004
+  hosts: localhost
+  connection: local
+  vars_files:
+    - 'test_vars/test_Gaudi_vars.yml'
+  tasks:
+    - name: Validate Gaudi Driver Version
+      ansible.builtin.command:
+        argv:
+          - "ansible-playbook"
+          - "test_Gaudi_validation.yml"
+          - "-i"
+          - "{{ inventory }}"
+          - "-t"
+          - "TC_001,TC_002,TC_003,TC_004"
+      changed_when: false
+
+# Verify hl-qual
+
+- name: OMNIA_Gaudi_TC_007
+  tags: TC_007
+  hosts: localhost
+  connection: local
+  vars_files:
+    - 'test_vars/test_Gaudi_vars.yml'
+  tasks:
+    - name: Verify hl-qual
+      ansible.builtin.command:
+        argv:
+          - "ansible-playbook"
+          - "test_Gaudi3_hlqual_validation.yml"
+          - "-i"
+          - "{{ inventory }}"
+      changed_when: false
+
+# Verify HCCL
+
+- name: OMNIA_Gaudi_TC_008
+  tags: TC_008
+  hosts: localhost
+  connection: local
+  vars_files:
+    - 'test_vars/test_Gaudi_vars.yml'
+  tasks:
+    - name: Verify HCCL
+      ansible.builtin.command:
+        argv:
+          - "ansible-playbook"
+          - "test_Gaudi3_hccl_validation.yml"
+          - "-i"
+          - "{{ inventory }}"
+      changed_when: false
diff --git a/accelerator/tests/test_Gaudi3_hccl_validation.yml b/accelerator/tests/test_Gaudi3_hccl_validation.yml
new file mode 100644
index 000000000..403bc7118
--- /dev/null
+++ b/accelerator/tests/test_Gaudi3_hccl_validation.yml
@@ -0,0 +1,138 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
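+#
+# Reading note on the bandwidth gates in this playbook: each hccl_demo run
+# prints summary figures that, judging by the regexes in the failed_when
+# expressions below, look roughly like
+#   [BENCHMARK]  NW Bandwidth   : 261734.10 MB/s
+#   [BENCHMARK]  Algo Bandwidth : 150288.45 MB/s
+# (illustrative lines, not captured from a real run). Each test extracts the
+# number from stdout with a filter chain of the form
+#   stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int
+# and fails the task when either figure is at or below the per-collective
+# floor for Gaudi 3 (e.g. 255000 MB/s NW bandwidth for all_reduce).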
+--- + +- name: Verify HCCL + hosts: GPU_node + vars: + ansible_become: true + ansible_become_user: root + + habana_tests: + log_level_all: "4" + enable_console: "true" + habana_logs: "/var/log/habana_logs" + gc_kernel_path: "/usr/lib/habanalabs/libtpc_kernels.so" + habana_scal_bin_path: "/opt/habanalabs/engines_fw" + habana_plugins_lib_path: "/opt/habanalabs/habana_plugins" + data_loader_aeon_lib_path: "/usr/lib/habanalabs/libaeon.so" + rdma_core_root: "/opt/habanalabs/rdma-core/src" + rdma_core_lib: "/opt/habanalabs/rdma-core/src/build/lib" + + habana_extra: + hccl_comm_id: "127.0.0.1:5555" + tasks: + - name: Checking for the right amount of HPU devices + ansible.builtin.shell: | + set -o pipefail + lspci | grep 'accelerators.*Habana' | wc -l + register: hpu_dev + failed_when: (hpu_dev.stdout != "8") + changed_when: false + + - name: Setting python version used for the test runs + ansible.legacy.set_fact: + pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}" + + - name: Create test folder + ansible.builtin.tempfile: + state: directory + suffix: omnia_gaudi_hccl_test + register: test_folder + + - name: Git clone hccl_demo + ansible.legacy.git: + repo: 'https://github.com/HabanaAI/hccl_demo.git' + dest: "{{ test_folder.path }}" + + - name: Build hccl_demo + ansible.builtin.shell: "cd {{ test_folder.path }} && make clean" + changed_when: false + + - name: Hccl_demo all_reduce single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test all_reduce --nranks 8 --loop 1000 --node_id 0 --size 256m --ranks_per_node 8" # noqa: yaml[line-length] + register: hccl_demo_all_reduce_single_node_test_result + failed_when: ((hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 255000) or (hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 146000)) # noqa: yaml[line-length] + changed_when: false + + - name: Hccl_demo all_gather single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test all_gather --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8" # noqa: 
yaml[line-length] + register: hccl_demo_all_gather_single_node_test_result + failed_when: ((hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 254000) or (hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 36000)) # noqa: yaml[line-length] + changed_when: false + + - name: Hccl_demo reduce_scatter single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test reduce_scatter --nranks 8 --loop 1000 --node_id 0 --size 64m" # noqa: yaml[line-length] + register: hccl_demo_reduce_scatter_single_node_test_result + failed_when: ((hccl_demo_reduce_scatter_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 253000) or (hccl_demo_reduce_scatter_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 285000)) # noqa: yaml[line-length] + changed_when: false + + - name: Hccl_demo all2all single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: "cd {{ test_folder.path }} && python3 run_hccl_demo.py -clean --test all2all --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8" # noqa: yaml[line-length] + register: hccl_demo_all2all_single_node_test_result + failed_when: ((hccl_demo_all2all_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 253000) or (hccl_demo_all2all_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)MB\/s', '\\1') | first | int <= 285000)) # noqa: yaml[line-length] + changed_when: false + + - name: Remove hccl_demo directory + ansible.builtin.file: + state: absent + path: "{{ test_folder.path }}" diff --git a/accelerator/tests/test_Gaudi3_hlqual_validation.yml b/accelerator/tests/test_Gaudi3_hlqual_validation.yml new file mode 100644 index 000000000..b4e1c5a91 --- /dev/null +++ b/accelerator/tests/test_Gaudi3_hlqual_validation.yml @@ -0,0 +1,208 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Verify hl-qual
+  hosts: GPU_node
+  vars:
+    ansible_become: true
+    ansible_become_user: root
+
+    habana_tests:
+      log_level_all: "4"
+      enable_console: "true"
+      habana_logs: "/var/log/habana_logs"
+      gc_kernel_path: "/usr/lib/habanalabs/libtpc_kernels.so"
+      habana_scal_bin_path: "/opt/habanalabs/engines_fw"
+      habana_plugins_lib_path: "/opt/habanalabs/habana_plugins"
+      data_loader_aeon_lib_path: "/usr/lib/habanalabs/libaeon.so"
+      rdma_core_root: "/opt/habanalabs/rdma-core/src"
+      rdma_core_lib: "/opt/habanalabs/rdma-core/src/build/lib"
+  tasks:
+    - name: Setting python version used for the test runs
+      ansible.legacy.set_fact:
+        pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}"
+
+    - name: Recursively change ownership of habana_logs directory
+      ansible.builtin.file:
+        path: "{{ habana_tests['habana_logs'] }}"
+        state: directory
+        recurse: true
+        mode: '0777'
+
+    - name: Hl_qual hardware sanity check test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -f2 -l extreme -t 60 -dis_mon"
+      register: sanity_test_result
+      failed_when: "'FAILED' in sanity_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual memory bandwidth test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -c all -rmod parallel -mb -memOnly -gaudi3 -dis_mon"
+      register: memory_bandwidth_test_result
+      failed_when: "'FAILED' in memory_bandwidth_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual pci bandwidth test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -c all -rmod serial -mb -b -pciOnly -sramOnly -gaudi3 -dis_mon"
+      register: pci_bandwidth_test_result
+      failed_when: "'FAILED' in pci_bandwidth_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual serdes base test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -i 50 -nic_base -test_type pairs -dis_mon"
+      register: serdes_base_test_result
+      failed_when: "'FAILED' in serdes_base_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual serdes base allreduce test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -i 50 -ep 100 -nic_base -test_type allreduce -dis_mon" # noqa: yaml[line-length]
+      register: serdes_base_allreduce_test_result
+      failed_when: "'FAILED' in serdes_base_allreduce_test_result.stdout"
+      changed_when: false
+
+    - name: Unload habanalabs kernel module
+      community.general.modprobe:
+        name: habanalabs
+        state: absent
+
+    - name: Load habanalabs kernel module with timeout_locked param
+      community.general.modprobe:
+        name: habanalabs
+        state: present
+        params: 'timeout_locked=0'
+
+    - name: Bring DOWN all Gaudi3 NICs
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./manage_network_ifs.sh --down"
+      changed_when: false
+
+    - name: Bring UP all Gaudi3 NICs
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./manage_network_ifs.sh --up"
+      changed_when: false
+
+    - name: Retry until HPUs NICs are ready
+      ansible.builtin.shell: |
+        set -o pipefail
+        cd /opt/habanalabs/qual/gaudi3/bin && ./manage_network_ifs.sh --status | grep down | wc -l
+      register: result
+      until: (result.stdout == "0")
+      retries: 5
+      delay: 5
+      changed_when: false
+
+    - name: Hl_qual HBM DMA stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -i 1 -hbm_dma_stress -dis_mon"
+      register: hbm_dma_stress_test_result
+      failed_when: "'FAILED' in hbm_dma_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual HBM TPC stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -i 1 -hbm_tpc_stress -dis_mon"
+      register: hbm_tpc_stress_test_result
+      failed_when: "'FAILED' in hbm_tpc_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual power stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -s -t 120"
+      register: power_stress_test_result
+      failed_when: "'FAILED' in power_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual EDP stress test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -t 40 -e -Tw 3 -Ts 1"
+      register: edp_stress_test_result
+      failed_when: "'FAILED' in edp_stress_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual e2e concurrency test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -t 30 -dis_mon -e2e_concurrency -disable_ports 8,22,23 -enable_ports_check int" # noqa: yaml[line-length]
+      register: e2e_concurrency_test_result
+      failed_when: "'FAILED' in e2e_concurrency_test_result.stdout"
+      changed_when: false
+
+    - name: Hl_qual SER test
+      environment:
+        __python_cmd: "python{{ pver }}"
+        LOG_LEVEL_ALL: "{{ habana_tests['log_level_all'] }}"
+        ENABLE_CONSOLE: "{{ habana_tests['enable_console'] }}"
+        HABANA_LOGS: "{{ habana_tests['habana_logs'] }}"
+        GC_KERNEL_PATH: "{{ habana_tests['gc_kernel_path'] }}"
+        HABANA_SCAL_BIN_PATH: "{{ habana_tests['habana_scal_bin_path'] }}"
+        HABANA_PLUGINS_LIB_PATH: "{{ habana_tests['habana_plugins_lib_path'] }}"
+        DATA_LOADER_AEON_LIB_PATH: "{{ habana_tests['data_loader_aeon_lib_path'] }}"
+        RDMA_CORE_ROOT: "{{ habana_tests['rdma_core_root'] }}"
+        RDMA_CORE_LIB: "{{ habana_tests['rdma_core_lib'] }}"
+      ansible.builtin.shell: "cd /opt/habanalabs/qual/gaudi3/bin && ./hl_qual -gaudi3 -c all -rmod parallel -dis_mon -ser"
+      register: ser_test_result
+      failed_when: "'FAILED' in ser_test_result.stdout"
+      changed_when: false
diff --git a/accelerator/tests/test_Gaudi_validation.yml b/accelerator/tests/test_Gaudi_validation.yml
new file mode 100644
index 000000000..0993e218c
--- /dev/null
+++ b/accelerator/tests/test_Gaudi_validation.yml
@@ -0,0 +1,57 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
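+#
+# The checks below rely on a substring match: `hl-smi -v` prints its driver
+# version to stdout, and the assert only requires that Gaudi_Default_version
+# from test_vars/test_Gaudi_vars.yml (e.g. "1.18.0") appears somewhere in
+# that output; an illustrative line (the exact format may differ) is
+#   HL-SMI Version:   hl-1.18.0-...
+# On nodes without a Gaudi device the same command is expected to fail with
+# a non-zero return code, which is what the no_GPU_node play asserts.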
+--- + +# Validating for Nodes having GPU + +- name: Validating the Gaudi Driver installation on nodes having GPU + hosts: GPU_node + vars_files: 'test_vars/test_Gaudi_vars.yml' + tasks: + - name: Validate the Gaudi Driver Version + tags: TC_001 + block: + - name: Getting the version + ansible.builtin.command: hl-smi -v + register: gaudi_installed_version + changed_when: false + + - name: Checking Version + ansible.builtin.assert: + that: + - "Gaudi_Default_version in gaudi_installed_version.stdout" + success_msg: "{{ version_pass }}" + fail_msg: "{{ version_fail }}" + +# Validating for Nodes with NO GPU + +- name: Validating the Gaudi Driver installation on nodes without GPU + tags: TC_003, TC_004 + hosts: no_GPU_node + vars_files: 'test_vars/test_Gaudi_vars.yml' + tasks: + - name: Validate Gaudi Driver Version + block: + - name: Getting the version + ansible.builtin.command: hl-smi -v + register: gaudi_installed_version + changed_when: false + ignore_errors: true + + - name: Verify Gaudi Driver Installation is not successful + ansible.builtin.assert: + that: + - "gaudi_installed_version.rc != 0" + success_msg: "{{ Not_installed_pass }}" + fail_msg: "{{ Not_installed_fail }}" diff --git a/accelerator/tests/test_ROCm.yml b/accelerator/tests/test_ROCm.yml index bb4bdfa96..d4cb64991 100644 --- a/accelerator/tests/test_ROCm.yml +++ b/accelerator/tests/test_ROCm.yml @@ -30,6 +30,7 @@ - name: Validate ROCm Version ansible.builtin.command: "ansible-playbook test_ROCm_validation.yml -i {{ rocm_inventory }}" + changed_when: false tags: TC_001, TC_002, TC_003, TC_004, TC_006, TC_007 # Testcase OMNIA_1.4_ROCm_TC_005 @@ -49,4 +50,5 @@ - name: Validate ROCm ansible.builtin.command: "ansible-playbook test_ROCm_validation.yml -i {{ inventory }}" + changed_when: false tags: TC_005 diff --git a/accelerator/tests/test_ROCm_validation.yml b/accelerator/tests/test_ROCm_validation.yml index ef40b373a..2bb4f29d3 100644 --- a/accelerator/tests/test_ROCm_validation.yml +++ b/accelerator/tests/test_ROCm_validation.yml @@ -21,7 +21,7 @@ - name: ROCm Version block: - name: Execute command to get the version - ansible.builtin.command: yum info rocm-libs + ansible.builtin.command: yum info rocm-libs # noqa: command-instead-of-module register: rocm_installed_version changed_when: false tags: TC_001 @@ -54,7 +54,7 @@ - name: Get ROCm Version block: - name: Execute command to get the version - ansible.builtin.command: yum info rocm-libs + ansible.builtin.command: yum info rocm-libs # noqa: command-instead-of-module register: rocm_installed_version changed_when: false tags: TC_003, TC_004, TC_006 @@ -114,7 +114,7 @@ - name: Validate ROCm block: - name: Execute command to get the version - ansible.builtin.command: yum info rocm-libs + ansible.builtin.command: yum info rocm-libs # noqa: command-instead-of-module register: rocm_installed_version changed_when: false tags: TC_007 @@ -124,4 +124,4 @@ that: - "{{ ROCm_Default_version }} in rocm_installed_version.stdout" success_msg: "{{ version_pass }}" - fail_msg: "{{ version_fail }}" \ No newline at end of file + fail_msg: "{{ version_fail }}" diff --git a/accelerator/tests/test_accelerator.yml b/accelerator/tests/test_accelerator.yml index 872e39232..5a9c389fe 100644 --- a/accelerator/tests/test_accelerator.yml +++ b/accelerator/tests/test_accelerator.yml @@ -1,33 +1,34 @@ - # Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved. 
- # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - --- +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- - # Testcase OMNIA_1.4_Accelerator_TC_001 - # Verify the presence of Instinct MI100 Accelerator on cluster nodes +# Testcase OMNIA_1.4_Accelerator_TC_001 +# Verify the presence of Instinct MI100 Accelerator on cluster nodes - - name: OMNIA_1.4_Accelerator_TC_001 - hosts: localhost - connection: local - vars_files: - - test_vars/test_accelerator_vars.yml +- name: OMNIA_1.4_Accelerator_TC_001 + hosts: localhost + connection: local + vars_files: + - test_vars/test_accelerator_vars.yml - tasks: - - name: Execute omnia.yml with default input parameters - ansible.builtin.command: "ansible-playbook post_provision.yml -i {{ accelerator_inventory }}" - changed_when: false - tags: TC_001, TC_002 + tasks: + - name: Execute omnia.yml with default input parameters + ansible.builtin.command: "ansible-playbook post_provision.yml -i {{ accelerator_inventory }}" + changed_when: false + tags: TC_001, TC_002 - - name: Validate Accelerator - ansible.builtin.command: "ansible-playbook test_accelerator_validation.yml -i {{ accelerator_inventory }}" - tags: TC_001, TC_002 + - name: Validate Accelerator + ansible.builtin.command: "ansible-playbook test_accelerator_validation.yml -i {{ accelerator_inventory }}" + changed_when: false + tags: TC_001, TC_002 diff --git a/accelerator/tests/test_accelerator_validation.yml b/accelerator/tests/test_accelerator_validation.yml index 2f8e5dd57..25eed5920 100644 --- a/accelerator/tests/test_accelerator_validation.yml +++ b/accelerator/tests/test_accelerator_validation.yml @@ -44,4 +44,4 @@ that: - "'Success' in showtopo.stdout" success_msg: "{{ acc_pass }}" - fail_msg: "{{ acc_fail }}" \ No newline at end of file + fail_msg: "{{ acc_fail }}" diff --git a/accelerator/tests/test_vars/test_CUDA_vars.yml b/accelerator/tests/test_vars/test_CUDA_vars.yml index 53c6b7330..f8d0e8efe 100644 --- a/accelerator/tests/test_vars/test_CUDA_vars.yml +++ b/accelerator/tests/test_vars/test_CUDA_vars.yml @@ -16,9 +16,9 @@ # vars file for test_CUDA.yml file input_params_folder: "../input" -Control_plane_dir: "../" -CUDA_validation_script_path: test_CUDA_validation.yml -ROCm_vars_file_path: test_vars/test_CUDA_vars.yml +oim_dir: "../" +CUDA_validation_script_path: test_CUDA_validation.yml +ROCm_vars_file_path: test_vars/test_CUDA_vars.yml CUDA_verify_code: test_cuda_code.cu inventory: ../inventory cuda_inventory: 
../cuda_inventory
@@ -30,4 +30,4 @@ version_fail: 'CUDA version installed on the nodes does not matched with the def
 installation_pass: 'CUDA installation is successful on the nodes having GPU'
 installation_fail: 'CUDA installation is not successful on the nodes having GPU'
 Not_installed_pass: 'CUDA installation is failed on the nodes with no GPU'
-Not_installed_fail: 'CUDA installation is passed on the nodes with no GPU'
\ No newline at end of file
+Not_installed_fail: 'CUDA installation is passed on the nodes with no GPU'
diff --git a/accelerator/tests/test_vars/test_Gaudi_vars.yml b/accelerator/tests/test_vars/test_Gaudi_vars.yml
new file mode 100644
index 000000000..8c4aa11ba
--- /dev/null
+++ b/accelerator/tests/test_vars/test_Gaudi_vars.yml
@@ -0,0 +1,28 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+input_params_folder: "../input"
+oim_dir: "../"
+Gaudi_validation_script_path: test_Gaudi_validation.yml
+inventory: ../inventory.ini
+
+Gaudi_Default_version: "1.18.0"
+
+version_pass: 'Gaudi driver version installed on the nodes matches the default version'
+version_fail: 'Gaudi driver version installed on the nodes does not match the default version'
+installation_pass: 'Gaudi driver installation is successful on the nodes having GPU'
+installation_fail: 'Gaudi driver installation is not successful on the nodes having GPU'
+Not_installed_pass: 'Gaudi driver is not installed on the nodes with no GPU, as expected'
+Not_installed_fail: 'Gaudi driver is unexpectedly installed on the nodes with no GPU'
diff --git a/accelerator/tests/test_vars/test_ROCm_vars.yml b/accelerator/tests/test_vars/test_ROCm_vars.yml
index ac6b98439..ec2982a3c 100644
--- a/accelerator/tests/test_vars/test_ROCm_vars.yml
+++ b/accelerator/tests/test_vars/test_ROCm_vars.yml
@@ -16,9 +16,9 @@
 # vars file for test_ROCm.yml file
 
 input_params_folder: "../input_params"
-Control_plane_dir: "../"
-ROCm_validation_script_path: test_ROCm_validation.yml
-ROCm_vars_file_path: /test_vars/test_ROCm_vars.yml
+oim_dir: "../"
+ROCm_validation_script_path: test_ROCm_validation.yml
+ROCm_vars_file_path: /test_vars/test_ROCm_vars.yml
 ROCm_verify_code: test_ROCm_code.cpp
 inventory: ../inventory
 rocm_inventory: ../rocm_inventory
@@ -30,4 +30,4 @@ version_fail: 'ROCm version installed on the nodes does not matched with the def
 installation_pass: 'ROCm installation is successful on the nodes having GPU'
 installation_fail: 'ROCm installation is not successful on the nodes having GPU'
 Not_installed_pass: 'ROCm installation is failed on the nodes with no GPU'
-Not_installed_fail: 'ROCm installation is passed on the nodes with no GPU'
\ No newline at end of file
+Not_installed_fail: 'ROCm installation is passed on the nodes with no GPU'
diff --git a/accelerator/tests/test_vars/test_accelerator_vars.yml b/accelerator/tests/test_vars/test_accelerator_vars.yml
index 451bf2b2e..1e48e5269 100644
--- a/accelerator/tests/test_vars/test_accelerator_vars.yml
+++ 
b/accelerator/tests/test_vars/test_accelerator_vars.yml @@ -16,10 +16,10 @@ # vars file for test_accelerator.yml file input_params_folder: "../input_params" -Control_plane_dir: "../" -Accelerator_validation_script_path: test_accelerator_validation.yml +oim_dir: "../" +Accelerator_validation_script_path: test_accelerator_validation.yml Accelerator_vars_file_path: /test_vars/test_accelerator_vars.yml -inventory: ../inventory +inventory: ../inventory accelerator_inventory: ../inventory ROCm_Default_version: "5.1" diff --git a/ansible.cfg b/ansible.cfg index dca7c5e5d..03f23be62 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -5,6 +5,7 @@ forks = 5 timeout = 180 executable = /bin/bash display_skipped_hosts = false +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 diff --git a/benchmarks/amd_benchmark.yml b/benchmarks/amd_benchmark.yml index fc4fb70d6..d7f14b429 100644 --- a/benchmarks/amd_benchmark.yml +++ b/benchmarks/amd_benchmark.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Validations for AMD benchmark hosts: localhost gather_facts: true diff --git a/prepare_cp/ansible.cfg b/benchmarks/ansible.cfg similarity index 58% rename from prepare_cp/ansible.cfg rename to benchmarks/ansible.cfg index 8c7909326..3da774d09 100644 --- a/prepare_cp/ansible.cfg +++ b/benchmarks/ansible.cfg @@ -1,8 +1,9 @@ [defaults] -log_path = /var/log/omnia/prepre_cp.log +log_path = /var/log/omnia/benchmarks.log host_key_checking = false forks = 5 timeout = 180 +collections_path = $VIRTUAL_ENV executable = /bin/bash [persistent_connection] @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/benchmarks/intel_benchmark.yml b/benchmarks/intel_benchmark.yml index 7b790c380..949456123 100644 --- a/benchmarks/intel_benchmark.yml +++ b/benchmarks/intel_benchmark.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Validations for Intel benchmark hosts: localhost gather_facts: true diff --git a/discovery/ansible.cfg b/discovery/ansible.cfg index c6346842b..ebf48eacd 100644 --- a/discovery/ansible.cfg +++ b/discovery/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 0d999836b..3fbfee5a5 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -13,26 +13,42 @@ # limitations under the License. 
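+# Note on the run-once guards used below: each helper playbook records a
+# fact on localhost when it executes (the names check_venv_executed and
+# apt_lock_status are taken from the conditions in this file), and every
+# entry-point playbook re-imports the helper only while that fact is still
+# unset, e.g.
+#   when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool )
+# so a chain of imported playbooks performs each check at most once per
+# ansible-playbook invocation.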
--- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + +- name: Check if package manager is not locked + ansible.builtin.import_playbook: ../utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + - name: Validate discovery parameters hosts: localhost connection: local tasks: - - name: Check whether prepare cp has been executed + - name: Check whether prepare oim has been executed ansible.builtin.include_role: - name: "{{ playbook_dir }}/../prepare_cp/roles/pre_requisite" # noqa: role-name[path] - tasks_from: prepare_cp_status.yml + name: "{{ playbook_dir }}/../prepare_oim/roles/pre_requisite" # noqa: role-name[path] + tasks_from: prepare_oim_status.yml - name: Validate discovery parameters ansible.builtin.include_role: name: discovery_validations/common tasks_from: validation_status_check.yml # noqa: role-name[path] -- name: Discovery roles of control plane for provisioning + - name: Discovery roles of Omnia Infrastructure Manager for provisioning + ansible.builtin.include_role: + name: db_operations + + - name: Enter Omnia Infrastructure Manager details in cluster.nodeinfo table + ansible.builtin.include_role: + name: "{{ playbook_dir }}/../utils/server_spec_update/roles/network_update" # noqa: role-name[path] + tasks_from: add_nic_db.yml + when: add_network_status + +- name: Discovery roles of Omnia Infrastructure Manager for provisioning hosts: localhost connection: local roles: - - role: db_operations - - role: ../server_spec_update/roles/create_nicinfo_db # noqa: role-name[path] - role: configure_xcat/common # noqa: role-name[path] - role: configure_synclist - role: discovery_mechanism/common # noqa: role-name[path] diff --git a/discovery/roles/configure_os_image/mapping/files/nodeset_nodes.py b/discovery/roles/configure_os_image/mapping/files/nodeset_nodes.py new file mode 100644 index 000000000..22ea5dae9 --- /dev/null +++ b/discovery/roles/configure_os_image/mapping/files/nodeset_nodes.py @@ -0,0 +1,89 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module contains functions for validating and processing nodeset nodes. +""" + +import sys +import subprocess + +db_path = sys.argv[1] +sys.path.insert(0, db_path) +import omniadb_connection + +DISCOVERY_MECHANISM = "mapping" + +def validate_osimage(osimage): + """ + Validates if the given `osimage` is a string. + + Parameters: + osimage (Any): The object to be validated. + + Raises: + ValueError: If `osimage` is not a string. + + Returns: + None + """ + if not isinstance(osimage, str): + raise ValueError("osimage must be a string") + return osimage + +def nodeset_mapping_nodes(): + """ + Retrieves the list of nodes from the `cluster.nodeinfo` table in the database + and checks if each node is present in the `nodelist` table of the `xcatdb` database. 
+    If a node is present in both tables and status is NULL,
+    it is added to the `new_mapping_nodes` list.
+    The function then executes the `/opt/xcat/sbin/nodeset` command for each node in the
+    `new_mapping_nodes` list with the specified `osimage` parameter.
+
+    Parameters:
+        None
+
+    Returns:
+        None
+    """
+
+    # Establish connection with cluster.nodeinfo
+    conn = omniadb_connection.create_connection()
+    cursor = conn.cursor()
+    sql = "SELECT node FROM cluster.nodeinfo WHERE discovery_mechanism = %s"
+    cursor.execute(sql, (DISCOVERY_MECHANISM,))
+    node_name = cursor.fetchall()
+    cursor.close()
+    conn.close()
+
+    osimage = validate_osimage(sys.argv[2])
+
+    # Establish connection with xcatdb
+    conn_x = omniadb_connection.create_connection_xcatdb()
+    cursor_x = conn_x.cursor()
+    new_mapping_nodes = []
+    for node in node_name:
+        sql = "SELECT exists(SELECT node FROM nodelist WHERE node = %s AND status IS NULL)"
+        cursor_x.execute(sql, (node[0],))
+        output = cursor_x.fetchone()[0]
+        if output:
+            new_mapping_nodes.append(node[0])
+            command = ["/opt/xcat/sbin/nodeset", node[0], f"osimage={osimage}"]
+            subprocess.run(command, capture_output=True, shell=False, check=True)
+
+    print(new_mapping_nodes)
+    cursor_x.close()
+    conn_x.close()
+
+nodeset_mapping_nodes()
diff --git a/discovery/roles/configure_os_image/mapping/tasks/set_provision_image_mapping.yml b/discovery/roles/configure_os_image/mapping/tasks/set_provision_image_mapping.yml
index 3ddbf6319..830b7738c 100644
--- a/discovery/roles/configure_os_image/mapping/tasks/set_provision_image_mapping.yml
+++ b/discovery/roles/configure_os_image/mapping/tasks/set_provision_image_mapping.yml
@@ -1,4 +1,4 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,13 +16,16 @@
 - name: Task for set osimage to node object for mapping
   block:
     - name: Set osimage to node object for mapping
-      ansible.builtin.command: nodeset {{ mapping_node_group }} osimage={{ hostvars['localhost']['provision_os_image'] }}
-      changed_when: true
+      ansible.builtin.command: "{{ python_version }} {{ nodeset_nodes_py }} {{ db_operations_file_path }} {{ hostvars['localhost']['provision_os_image'] }}"
+      changed_when: false
      register: set_osimage_mapping

-    - name: Installation status for mapping
+    - name: Status for nodeset command
       ansible.builtin.debug:
-        msg: "{{ mapping_xcat_install_success_msg }}"
+        msg: "{{ set_osimage_mapping.stdout }} {{ mapping_xcat_install_success_msg }}"
+      when:
+        - set_osimage_mapping.stdout is defined
+        - set_osimage_mapping.stdout | length > 0
   rescue:
     - name: Verify set node object is successful for mapping
       ansible.builtin.debug:
diff --git a/discovery/roles/configure_os_image/mapping/vars/main.yml b/discovery/roles/configure_os_image/mapping/vars/main.yml
index 72e5643e8..f4216a9de 100644
--- a/discovery/roles/configure_os_image/mapping/vars/main.yml
+++ b/discovery/roles/configure_os_image/mapping/vars/main.yml
@@ -14,7 +14,11 @@
 ---
 # Usage: set_provision_image_mapping.yml
-mapping_xcat_install_success_msg: "Discovery tasks executed successfully for discovered nodes using mapping.
-Configure PXE and reboot the cluster nodes manually if bmc IP not given for provisioning servers."
+mapping_xcat_install_success_msg: "Nodes configured with osimage successfully for PXE boot.
+Configure PXE and reboot the cluster nodes manually if BMC IP is not given in the mapping file."
 mapping_set_osimage_warning_msg: "[WARNING] nodeset command is not successful for discoverd nodes using mapping.
 Error:"
 mapping_node_group: "mapping"
+xcat_sbin_path: /opt/xcat/sbin
+nodeset_nodes_py: "{{ role_path }}/../mapping/files/nodeset_nodes.py"
+python_version: "{{ ansible_python_interpreter }}"
+db_operations_file_path: "{{ role_path }}/../../db_operations/files"
diff --git a/discovery/roles/configure_os_image/switch_based/tasks/set_provision_image_switch_based.yml b/discovery/roles/configure_os_image/switch_based/tasks/set_provision_image_switch_based.yml
index b2d559977..5775c43d8 100644
--- a/discovery/roles/configure_os_image/switch_based/tasks/set_provision_image_switch_based.yml
+++ b/discovery/roles/configure_os_image/switch_based/tasks/set_provision_image_switch_based.yml
@@ -16,7 +16,7 @@
 - name: Task for set osimage to node object for switch_based
   block:
     - name: Set osimage to node object for switch_based
-      ansible.builtin.command: chdef {{ switch_based_node_group }} chain="runcmd=bmcsetup,osimage={{ provision_os_image }}"
+      ansible.builtin.command: "{{ xcat_path }}/chdef {{ switch_based_node_group }} chain=\"runcmd=bmcsetup,osimage={{ provision_os_image }}\""
       changed_when: true
       register: set_osimage_switch_based
diff --git a/discovery/roles/configure_os_image/switch_based/vars/main.yml b/discovery/roles/configure_os_image/switch_based/vars/main.yml
index 04dc95dcd..c329b5326 100644
--- a/discovery/roles/configure_os_image/switch_based/vars/main.yml
+++ b/discovery/roles/configure_os_image/switch_based/vars/main.yml
@@ -18,3 +18,4 @@ switch_based_node_group: "switch_based"
 switch_based_xcat_install_success_msg: "Provision tasks executed successfully for discoverd nodes using switch_based.
 Configure PXE and reboot the cluster nodes manually for provisioning servers."
 switch_based_set_osimage_warning_msg: "[WARNING] Command to set osimage to nodes is not successful for discovered nodes using switch_based.
Error:"
+xcat_path: /opt/xcat/bin
diff --git a/discovery/roles/configure_synclist/tasks/configure_synclist.yml b/discovery/roles/configure_synclist/tasks/configure_synclist.yml
index fd62d3ab7..61237689a 100644
--- a/discovery/roles/configure_synclist/tasks/configure_synclist.yml
+++ b/discovery/roles/configure_synclist/tasks/configure_synclist.yml
@@ -19,12 +19,21 @@
     state: directory
     recurse: true
 
+- name: Check if {{ syncfiles_dir }}/keyrings is not empty # noqa: name[template]
+  ansible.builtin.find:
+    paths: "{{ tmp_keyrings_path }}"
+    file_type: file
+  register: keyring_files
+  when: oim_os == "ubuntu"
+
 - name: Copy synclist in xcat folder
+  vars:
+    is_keyring_not_empty: "{{ keyring_files.matched > 0 if oim_os == 'ubuntu' else False }}"
   ansible.builtin.template:
     src: "{{ synclists_src_path }}"
     dest: "{{ synclists_dest_path }}"
     mode: "{{ file_permission }}"
 
 - name: Configure synclist to osimage
-  ansible.builtin.command: "chdef -t osimage -o {{ provision_os_image }} synclists={{ synclists_dest_path }}"
+  ansible.builtin.command: "{{ xcat_path }}/chdef -t osimage -o {{ provision_os_image }} synclists={{ synclists_dest_path }}"
   changed_when: true
diff --git a/discovery/roles/configure_synclist/tasks/create_files_local_registry.yml b/discovery/roles/configure_synclist/tasks/create_files_local_registry.yml
index af3ed382a..8c9fa0883 100644
--- a/discovery/roles/configure_synclist/tasks/create_files_local_registry.yml
+++ b/discovery/roles/configure_synclist/tasks/create_files_local_registry.yml
@@ -13,19 +13,19 @@
 # limitations under the License.
 ---
 
-- name: Read hostname of control plane
+- name: Read hostname of Omnia Infrastructure Manager
   ansible.builtin.command: hostname
   changed_when: false
   register: hostname_result
 
 - name: Remove existing temp certificate directory for default registries
   ansible.builtin.file:
-    path: "/tmp/certs.d/"
+    path: "{{ syncfiles_dir }}/certs.d/"
     state: absent
 
 - name: Create temp certificate directory for registry configurations
   ansible.builtin.file:
-    path: "/tmp/certs.d/_default"
+    path: "{{ syncfiles_dir }}/certs.d/_default"
     state: directory
     recurse: true
     mode: "{{ dir_permission }}"
@@ -33,5 +33,5 @@
 - name: Create hosts.toml file for all registry items
   ansible.builtin.template:
     src: "{{ config_omnia_registry_src_path }}"
-    dest: "/tmp/certs.d/_default/hosts.toml"
+    dest: "{{ syncfiles_dir }}/certs.d/_default/hosts.toml"
     mode: "{{ file_permission }}"
diff --git a/discovery/roles/configure_synclist/tasks/create_files_local_repo_common.yml b/discovery/roles/configure_synclist/tasks/create_files_local_repo_common.yml
index 94b97f8bf..1189636c8 100644
--- a/discovery/roles/configure_synclist/tasks/create_files_local_repo_common.yml
+++ b/discovery/roles/configure_synclist/tasks/create_files_local_repo_common.yml
@@ -25,23 +25,23 @@
     dest: "/install{{ repo_store_path }}"
     state: link
 
-- name: Remove existing /tmp/repos directory
+- name: Remove existing {{ syncfiles_dir }}/repos directory # noqa: name[template]
   ansible.builtin.file:
     path: "{{ temp_dir_repo }}"
     state: absent
 
-- name: Create /tmp/repos directory
+- name: Create {{ syncfiles_dir }}/repos directory # noqa: name[template]
   ansible.builtin.file:
     path: "{{ temp_dir_repo }}"
     state: directory
     mode: "{{ dir_permission }}"
 
-- name: Remove existing /tmp/conf directory
+- name: Remove existing {{ syncfiles_dir }}/conf directory # noqa: name[template]
   ansible.builtin.file:
     path: "{{ temp_conf_repo }}"
     state: absent
 
-- name: Create /tmp/conf directory
+- name: Create {{ syncfiles_dir }}/conf directory # noqa: 
name[template] ansible.builtin.file: path: "{{ temp_conf_repo }}" state: directory diff --git a/discovery/roles/configure_synclist/tasks/create_files_local_repo_ubuntu.yml b/discovery/roles/configure_synclist/tasks/create_files_local_repo_ubuntu.yml index 43f146f64..8c6c78c38 100644 --- a/discovery/roles/configure_synclist/tasks/create_files_local_repo_ubuntu.yml +++ b/discovery/roles/configure_synclist/tasks/create_files_local_repo_ubuntu.yml @@ -27,12 +27,12 @@ - name: Create repos for compute nodes when: repo_config == 'partial' block: - - name: Remove /tmp/keyrings directory if already exists + - name: Remove {{ syncfiles_dir }}/keyrings directory if already exists # noqa: name[template] ansible.builtin.file: path: "{{ tmp_keyrings_path }}" state: absent - - name: Create /tmp/keyrings directory + - name: Create {{ syncfiles_dir }}/keyrings directory # noqa: name[template] ansible.builtin.file: path: "{{ tmp_keyrings_path }}" state: directory @@ -75,6 +75,16 @@ path: "{{ rocm_file_path }}" register: rocm_file + - name: Check if intelgaudi repo exists + ansible.builtin.stat: + path: "{{ intelgaudi_file_path }}" + register: intelgaudi_file + + - name: Check if intel repo exists + ansible.builtin.stat: + path: "{{ intel_file_path }}" + register: intel_file + - name: Generate and copy software repository configurations ansible.builtin.template: src: "{{ repo_config_template_src }}" @@ -86,6 +96,17 @@ loop_control: loop_var: item + - name: Generate and copy software repository configurations for Intel Gaudi + ansible.builtin.template: + src: "{{ repo_config_template_intelgaudi_src }}" + dest: "{{ repo_config_file_intelgaudi }}" + mode: "{{ repo_file_permission }}" + loop: "{{ user_config.softwares + user_config.intelgaudi | default([]) }}" + when: "(intelgaudi_name in item.name and intelgaudi_file.stat.exists) + or (intel_name in item.name and intel_file.stat.exists)" + loop_control: + loop_var: item + # always - name: Create repos for compute nodes when: repo_config == 'always' @@ -111,6 +132,16 @@ path: "{{ rocm_file_path }}" register: rocm_file + - name: Check if intelgaudi repo exists + ansible.builtin.stat: + path: "{{ intelgaudi_file_path }}" + register: intelgaudi_file + + - name: Check if intel repo exists + ansible.builtin.stat: + path: "{{ intel_file_path }}" + register: intel_file + - name: Generate and copy software repository configurations ansible.builtin.template: src: "{{ repo_config_template_src }}" @@ -118,7 +149,18 @@ mode: "{{ repo_file_permission }}" loop: "{{ user_config.softwares + user_config.amdgpu | default([]) }}" when: "(beegfs_name in item.name and beegfs_file.stat.exists) or (amdgpu_name in item.name and amdgpu_file.stat.exists) - or (rocm_name in item.name and rocm_file.stat.exists)" + or (rocm_name in item.name and rocm_file.stat.exists)" + loop_control: + loop_var: item + + - name: Generate and copy software repository configurations for Intel Gaudi + ansible.builtin.template: + src: "{{ repo_config_template_intelgaudi_src }}" + dest: "{{ repo_config_file_intelgaudi }}" + mode: "{{ repo_file_permission }}" + loop: "{{ user_config.softwares + user_config.intelgaudi | default([]) }}" + when: "(intelgaudi_name in item.name and intelgaudi_file.stat.exists) + or (intel_name in item.name and intel_file.stat.exists)" loop_control: loop_var: item @@ -127,12 +169,12 @@ when: repo_config == 'never' block: - - name: Remove /tmp/keyrings directory if already exists + - name: Remove {{ syncfiles_dir }}/keyrings directory if already exists # noqa: name[template] 
ansible.builtin.file: path: "{{ tmp_keyrings_path }}" state: absent - - name: Create /tmp/keyrings directory + - name: Create {{ syncfiles_dir }}/keyrings directory # noqa: name[template] ansible.builtin.file: path: "{{ tmp_keyrings_path }}" state: directory diff --git a/discovery/roles/configure_synclist/tasks/create_local_repo_access_yml_file.yml b/discovery/roles/configure_synclist/tasks/create_local_repo_access_yml_file.yml index 922e5ea6c..532279b7a 100644 --- a/discovery/roles/configure_synclist/tasks/create_local_repo_access_yml_file.yml +++ b/discovery/roles/configure_synclist/tasks/create_local_repo_access_yml_file.yml @@ -20,11 +20,6 @@ recurse: true mode: "{{ dir_permission }}" -- name: Read hostname of control plane - ansible.builtin.command: hostname - changed_when: false - register: hostname_result - - name: Create local_repo_access.yml file ansible.builtin.template: src: "{{ local_repo_access_src_path }}" diff --git a/discovery/roles/configure_synclist/tasks/main.yml b/discovery/roles/configure_synclist/tasks/main.yml index 2418a8cc5..040fa052a 100644 --- a/discovery/roles/configure_synclist/tasks/main.yml +++ b/discovery/roles/configure_synclist/tasks/main.yml @@ -22,10 +22,17 @@ block: - name: Saving distribution of os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" - - name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml" + - name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml" + + - name: Create syncfiles directory + ansible.builtin.file: + path: "{{ syncfiles_dir }}" + state: directory + mode: "{{ dir_permission }}" + recurse: true - name: Include vars ansible.builtin.include_vars: "{{ role_path }}/vars/main.yml" @@ -37,7 +44,7 @@ ansible.builtin.include_tasks: create_files_local_repo_common.yml - name: Create files to be synced with xCAT for omnia local_repo - ansible.builtin.include_tasks: create_files_local_repo_{{ control_plane_os }}.yml + ansible.builtin.include_tasks: create_files_local_repo_{{ oim_os }}.yml - name: Create files to be synced with xCAT for omnia local_registry ansible.builtin.include_tasks: create_files_local_registry.yml diff --git a/discovery/roles/configure_synclist/templates/cluster_repo_template_redhat.j2 b/discovery/roles/configure_synclist/templates/cluster_repo_template_redhat.j2 index 0cab4df23..2c112c8b0 100644 --- a/discovery/roles/configure_synclist/templates/cluster_repo_template_redhat.j2 +++ b/discovery/roles/configure_synclist/templates/cluster_repo_template_redhat.j2 @@ -4,3 +4,6 @@ baseurl=http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/{{ pro enabled=1 gpgcheck=0 skip_if_unavailable=True +{% if proxy_status %} +proxy=_none_ +{% endif %} diff --git a/discovery/roles/configure_synclist/templates/config_omnia_registry_redhat.toml.j2 b/discovery/roles/configure_synclist/templates/config_omnia_registry.toml.j2 similarity index 100% rename from discovery/roles/configure_synclist/templates/config_omnia_registry_redhat.toml.j2 rename to discovery/roles/configure_synclist/templates/config_omnia_registry.toml.j2 diff --git a/discovery/roles/configure_synclist/templates/config_omnia_registry_ubuntu.toml.j2 b/discovery/roles/configure_synclist/templates/config_omnia_registry_ubuntu.toml.j2 deleted file mode 100644 index ceb834dc5..000000000 --- 
a/discovery/roles/configure_synclist/templates/config_omnia_registry_ubuntu.toml.j2 +++ /dev/null @@ -1,15 +0,0 @@ -{% if repo_config == "partial" or repo_config == "never" %} -{% if user_registry is defined and user_registry is not none and user_registry | length > 0 %} -{% for registry in user_registry %} -[host."{{ registry.host }}"] -capabilities = ["pull", "resolve"] -skip_verify = true -{% endfor %} -{% endif %} -{% endif %} - -{% if repo_config == "always" or repo_config == "partial" %} -[host."{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}"] -capabilities = ["pull", "resolve"] -skip_verify = true -{% endif %} diff --git a/discovery/roles/configure_synclist/templates/local_repo_access.yml.j2 b/discovery/roles/configure_synclist/templates/local_repo_access.yml.j2 index 8cfe1aac7..cc8b7d070 100644 --- a/discovery/roles/configure_synclist/templates/local_repo_access.yml.j2 +++ b/discovery/roles/configure_synclist/templates/local_repo_access.yml.j2 @@ -6,4 +6,9 @@ offline_git_path: "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/clu offline_ansible_galaxy_collection_path: "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/ansible_galaxy_collection" offline_manifest_path: "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/manifest" http_proxy: "http://{{ admin_nic_ip }}:3128" -https_proxy: "http://{{ admin_nic_ip }}:3128" \ No newline at end of file +https_proxy: "http://{{ admin_nic_ip }}:3128" +oim_hostname: "{{ oim_hostname }}" +domain_name: "{{ oim_domain_name }}" +proxy_status: {{ proxy_status | lower }} +no_proxy_input_status: {{ no_proxy_input_status | lower }} +user_no_proxy: "{{ proxy[0].no_proxy | default('',true) }}" diff --git a/discovery/roles/configure_synclist/templates/redhat.synclist.j2 b/discovery/roles/configure_synclist/templates/redhat.synclist.j2 index b184e3b37..2e8b372c9 100644 --- a/discovery/roles/configure_synclist/templates/redhat.synclist.j2 +++ b/discovery/roles/configure_synclist/templates/redhat.synclist.j2 @@ -1,10 +1,14 @@ +{{ syncfiles_dir }}/conf/pip.conf -> /etc/pip.conf +{{ syncfiles_dir }}/conf/dnf.conf -> /etc/dnf/dnf.conf +{{ syncfiles_dir }}/repos/* -> /etc/yum.repos.d/* + {% if repo_config == "always" %} -/tmp/certs.d/* -> /etc/containerd/certs.d/* +{{ syncfiles_dir }}/certs.d/* -> /etc/containerd/certs.d/* /etc/containerd/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt -> /opt/omnia/registry/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt {% endif %} {% if repo_config == "partial" %} -/tmp/certs.d/* -> /etc/containerd/certs.d/* +{{ syncfiles_dir }}/certs.d/* -> /etc/containerd/certs.d/* /etc/containerd/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt -> /opt/omnia/registry/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt {% if user_registry is defined and user_registry is not none and user_registry | length > 0 %} {% for registry in user_registry %} @@ -17,15 +21,11 @@ {% if repo_config == "never" %} {% if user_registry is defined and user_registry is not none and user_registry | length > 0 %} -/tmp/certs.d/* -> /etc/containerd/certs.d/* +{{ syncfiles_dir }}/certs.d/* -> /etc/containerd/certs.d/* {% for registry in user_registry %} {% if registry.cert_path is defined and registry.cert_path != "" %} {{ registry.cert_path }} -> /opt/omnia/registry/certs.d/{{ registry.host }}/ca.crt {% endif %} {% endfor %} {% endif %} -{% endif %} - -/tmp/conf/pip.conf -> /etc/pip.conf -/tmp/conf/dnf.conf -> 
/etc/dnf/dnf.conf -/tmp/repos/* -> /etc/yum.repos.d/* +{% endif %} \ No newline at end of file diff --git a/discovery/roles/configure_synclist/templates/repo_config_template_intelgaudi_ubuntu.j2 b/discovery/roles/configure_synclist/templates/repo_config_template_intelgaudi_ubuntu.j2 new file mode 100644 index 000000000..4c03164f5 --- /dev/null +++ b/discovery/roles/configure_synclist/templates/repo_config_template_intelgaudi_ubuntu.j2 @@ -0,0 +1 @@ +deb [trusted=yes] http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/apt/{{ item.name }}/{{ intelgaudi_version | default('') }} ./ \ No newline at end of file diff --git a/discovery/roles/configure_synclist/templates/repo_config_template_redhat.j2 b/discovery/roles/configure_synclist/templates/repo_config_template_redhat.j2 index c2f0ded09..2f8327379 100644 --- a/discovery/roles/configure_synclist/templates/repo_config_template_redhat.j2 +++ b/discovery/roles/configure_synclist/templates/repo_config_template_redhat.j2 @@ -4,3 +4,6 @@ baseurl=http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/yum/{{ enabled=1 gpgcheck=0 skip_if_unavailable=True +{% if proxy_status %} +proxy=_none_ +{% endif %} diff --git a/discovery/roles/configure_synclist/templates/ubuntu.synclist.j2 b/discovery/roles/configure_synclist/templates/ubuntu.synclist.j2 index 7803f4d72..e045fc184 100644 --- a/discovery/roles/configure_synclist/templates/ubuntu.synclist.j2 +++ b/discovery/roles/configure_synclist/templates/ubuntu.synclist.j2 @@ -1,11 +1,16 @@ +{{ syncfiles_dir }}/repos/* -> /etc/apt/sources.list.d/* +{{ syncfiles_dir }}/conf/pip.conf -> /etc/pip.conf + {% if repo_config == "always" %} -/tmp/certs.d/* -> /etc/containerd/certs.d/* +{{ syncfiles_dir }}/certs.d/* -> /etc/containerd/certs.d/* /etc/containerd/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt -> /opt/omnia/registry/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt {% endif %} {% if repo_config == "partial" %} -/tmp/keyrings/* -> /etc/apt/keyrings/* -/tmp/certs.d/* -> /etc/containerd/certs.d/* +{% if is_keyring_not_empty %} +{{ syncfiles_dir }}/keyrings/* -> /etc/apt/keyrings/* +{% endif %} +{{ syncfiles_dir }}/certs.d/* -> /etc/containerd/certs.d/* /etc/containerd/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt -> /opt/omnia/registry/certs.d/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt {% if user_registry is defined and user_registry is not none and user_registry | length > 0 %} {% for registry in user_registry %} @@ -17,16 +22,15 @@ {% endif %} {% if repo_config == "never" %} -/tmp/keyrings/* -> /etc/apt/keyrings/* +{% if is_keyring_not_empty %} +{{ syncfiles_dir }}/keyrings/* -> /etc/apt/keyrings/* +{% endif %} {% if user_registry is defined and user_registry is not none and user_registry | length > 0 %} -/tmp/certs.d/* -> /etc/containerd/certs.d/* +{{ syncfiles_dir }}/certs.d/* -> /etc/containerd/certs.d/* {% for registry in user_registry %} {% if registry.cert_path is defined and registry.cert_path != "" %} {{ registry.cert_path }} -> /opt/omnia/registry/certs.d/{{ registry.host }}/ca.crt {% endif %} {% endfor %} {% endif %} -{% endif %} - -/tmp/repos/* -> /etc/apt/sources.list.d/* -/tmp/conf/pip.conf -> /etc/pip.conf \ No newline at end of file +{% endif %} \ No newline at end of file diff --git a/discovery/roles/configure_synclist/vars/main.yml b/discovery/roles/configure_synclist/vars/main.yml index 8268abbb2..8bc3a4e8a 100644 --- a/discovery/roles/configure_synclist/vars/main.yml +++ 
b/discovery/roles/configure_synclist/vars/main.yml @@ -18,29 +18,34 @@ xcat_root_env: "/opt/xcat" xcat_path_env: "/opt/xcat/bin:/opt/xcat/sbin:/opt/xcat/share/xcat/tools" xcat_manpath_env: "/opt/xcat/share/man:$MANPATH" perl_badlang_env: 0 +syncfiles_dir: "/opt/omnia/syncfiles" # usage: create_local_repo_access_yml_file.yml dir_permission: "0755" omnia_files_folder: "/opt/omnia" -local_repo_access_src_path: "../templates/local_repo_access.yml.j2" +local_repo_access_src_path: "{{ role_path }}/templates/local_repo_access.yml.j2" local_repo_access_dest_path: "{{ omnia_files_folder }}/offline/local_repo_access.yml" file_permission: "0755" repo_file_permission: "0644" # Usage: create_files_local_repo_common.yml -temp_dir_repo: "/tmp/repos" -temp_conf_repo: "/tmp/conf" -pip_conf_src: "../templates/pip.conf.j2" -pip_conf_dest: "/tmp/conf/pip.conf" +temp_dir_repo: "{{ syncfiles_dir }}/repos" +temp_conf_repo: "{{ syncfiles_dir }}/conf" +pip_conf_src: "{{ role_path }}/templates/pip.conf.j2" +pip_conf_dest: "{{ syncfiles_dir }}/conf/pip.conf" # usage: create_files_local_registry.yml nerdctl_registry_port: 5001 +config_omnia_registry_src_path: "{{ role_path }}/templates/config_omnia_registry.toml.j2" # usage: configure_synclist.yml synclists_dir: "/install/synclists" +xcat_path: /opt/xcat/bin # Usage: create_files_local_repo_redhat.yml, create_files_local_repo_ubuntu.yml, create_files_local_repo_rocky.yml beegfs_name: "beegfs" amdgpu_name: "amdgpu" rocm_name: "rocm" +intelgaudi_name: "intelgaudi" +intel_name: "intel" software_config_default: "omnia_default" diff --git a/discovery/roles/configure_synclist/vars/redhat.yml b/discovery/roles/configure_synclist/vars/redhat.yml index 0af7c52c0..f327fb903 100644 --- a/discovery/roles/configure_synclist/vars/redhat.yml +++ b/discovery/roles/configure_synclist/vars/redhat.yml @@ -26,7 +26,7 @@ beegfs_file_path: "{{ repo_store_path }}/cluster/yum/beegfs/{{ beegfs_version }} amdgpu_file_path: "{{ repo_store_path }}/cluster/yum/amdgpu/{{ amdgpu_version }}" rocm_file_path: "{{ repo_store_path }}/cluster/yum/rocm/{{ rocm_version }}" dnf_conf_src: "{{ role_path }}/templates/dnf.conf.j2" -dnf_conf_dest: "/tmp/conf/dnf.conf" +dnf_conf_dest: "{{ syncfiles_dir }}/conf/dnf.conf" dnf_file_permission: "0644" cluster_repo_template_fail: "Failed. 
Please run local_repo/local_repo.yml and re-execute discovery/discovery.yml or discovery_provision.yml" base_url_redhat: "/install{{ repo_store_path }}/cluster/{{ provision_os }}/{{ provision_os_version }}/rpm" @@ -36,5 +36,4 @@ synclists_src_path: "{{ role_path }}/templates/redhat.synclist.j2" synclists_dest_path: "{{ synclists_dir }}/redhat.synclist" # Usage: create_files_local_registry.yml -config_omnia_registry_src_path: "{{ role_path }}/templates/config_omnia_registry_redhat.toml.j2" config_user_registry_src_path: "{{ role_path }}/templates/config_user_registry_redhat.toml.j2" diff --git a/discovery/roles/configure_synclist/vars/ubuntu.yml b/discovery/roles/configure_synclist/vars/ubuntu.yml index cdea273ac..f3c63df13 100644 --- a/discovery/roles/configure_synclist/vars/ubuntu.yml +++ b/discovery/roles/configure_synclist/vars/ubuntu.yml @@ -15,8 +15,10 @@ # Usage: create_files_local_repo.yml repo_config_template_src: "{{ role_path }}/templates/repo_config_template_ubuntu.j2" +repo_config_template_intelgaudi_src: "{{ role_path }}/templates/repo_config_template_intelgaudi_ubuntu.j2" cluster_repo_template_src: "{{ role_path }}/templates/cluster_repo_template_ubuntu.j2" repo_config_file: "{{ temp_dir_repo }}/{{ item.name }}-{{ item.version }}.list" +repo_config_file_intelgaudi: "{{ temp_dir_repo }}/{{ item.name }}-{{ intelgaudi_version }}.list" cluster_repo_config_file: "{{ temp_dir_repo }}/cluster-deb.list" file_mode: "0644" dir_mode: "0755" @@ -25,16 +27,15 @@ omnia_repo_template_src: "{{ role_path }}/templates/omnia_repo_config_ubuntu.j2" beegfs_file_path: "{{ repo_store_path }}/cluster/apt/beegfs/{{ beegfs_version }}" amdgpu_file_path: "{{ repo_store_path }}/cluster/apt/amdgpu/{{ amdgpu_version }}" rocm_file_path: "{{ repo_store_path }}/cluster/apt/rocm/{{ rocm_version }}" +intelgaudi_file_path: "{{ repo_store_path }}/cluster/apt/intelgaudi/{{ intelgaudi_version }}" +intel_file_path: "{{ repo_store_path }}/cluster/apt/intel/{{ intelgaudi_version }}" software_config_json_file: "{{ role_path }}/../../../input/software_config.json" local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml" local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again." -tmp_keyrings_path: "/tmp/keyrings" +tmp_keyrings_path: "{{ syncfiles_dir }}/keyrings" cluster_repo_template_fail: "Failed. Please run local_repo/local_repo.yml and re-execute discovery/discovery.yml or discovery_provision.yml" base_url_ubuntu: "/install{{ repo_store_path }}/cluster/{{ provision_os }}/{{ provision_os_version }}/deb" # Usage: configure_synclist.yml synclists_src_path: "{{ role_path }}/templates/ubuntu.synclist.j2" synclists_dest_path: "{{ synclists_dir }}/ubuntu.synclist" - -# Usage: create_files_local_registry.yml -config_omnia_registry_src_path: "../templates/config_omnia_registry_ubuntu.toml.j2" diff --git a/discovery/roles/configure_xcat/common/files/delete_networks.py b/discovery/roles/configure_xcat/common/files/delete_networks.py index 83826c33b..5f2d1106f 100644 --- a/discovery/roles/configure_xcat/common/files/delete_networks.py +++ b/discovery/roles/configure_xcat/common/files/delete_networks.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
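Reviewer note: the delete_networks.py hunk below is one instance of a hardening pattern this changeset applies across its Python helpers: resolve user-supplied paths with os.path.abspath at the entry point, and invoke xCAT binaries by absolute path with list-form subprocess arguments so nothing depends on the caller's $PATH or a shell. A minimal sketch of that pattern; the helper name and the spec argument are illustrative only:

```python
import os
import subprocess
import sys


def remove_network(network_name: str) -> None:
    """Delete one xCAT network definition, mirroring delete_networks.py."""
    # Absolute binary path: no reliance on the caller's $PATH.
    # List-form argv: nothing is re-parsed by a shell.
    result = subprocess.run(
        ["/opt/xcat/bin/rmdef", "-t", "network", network_name],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(result.stderr.strip())


if __name__ == "__main__":
    # Normalize the user-supplied spec path once, at the entry point.
    network_spec_path = os.path.abspath(sys.argv[1])
    print(f"network spec: {network_spec_path}")
```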
-import sys +import sys, os import yaml import subprocess @@ -20,7 +20,7 @@ sys.path.insert(0, db_path) import omniadb_connection -network_spec_path = sys.argv[2] +network_spec_path = os.path.abspath(sys.argv[2]) nw_names = [] omnia_nw_names = [] @@ -50,7 +50,7 @@ def delete_misc_networks(): omnia_networks() for i in nw_names: if i not in omnia_nw_names: - command = f"rmdef -t network {i}" + command = f"/opt/xcat/bin/rmdef -t network {i}" command_list = command.split() subprocess.run(command_list, capture_output=True) diff --git a/discovery/roles/configure_xcat/common/files/otherpkgs_updated.patch b/discovery/roles/configure_xcat/common/files/otherpkgs_updated.patch new file mode 100644 index 000000000..e9ecbbf44 --- /dev/null +++ b/discovery/roles/configure_xcat/common/files/otherpkgs_updated.patch @@ -0,0 +1,26 @@ +--- /install/postscripts/otherpkgs 2024-09-03 10:47:58.920064505 +0530 ++++ /install/postscripts/otherpkgs_updated 2024-09-03 10:45:43.521072346 +0530 +@@ -598,6 +598,7 @@ + echo "enabled=1" >> $REPOFILE + echo "gpgcheck=0" >> $REPOFILE + echo "skip_if_unavailable=True" >> $REPOFILE ++ echo "proxy=_none_" >> $REPOFILE + fi + i=$((i+1)) + done +@@ -663,6 +664,7 @@ + echo "enabled=1" >> $REPOFILE + echo "gpgcheck=0" >> $REPOFILE + echo "skip_if_unavailable=True" >> $REPOFILE ++ echo "proxy=_none_" >> $REPOFILE + + elif [ $hasapt -eq 1 ] ; then + REPOFILE="$repo_base/xCAT-otherpkgs${urlrepoindex}.list" +@@ -768,6 +770,7 @@ + echo "enabled=1" >> $REPOFILE + echo "gpgcheck=0" >> $REPOFILE + echo "skip_if_unavailable=True" >> $REPOFILE ++ echo "proxy=_none_" >> $REPOFILE + if [ $hasyum -eq 1 ]; then + yum clean all + fi diff --git a/discovery/roles/configure_xcat/common/files/yum_updated.patch b/discovery/roles/configure_xcat/common/files/yum_updated.patch new file mode 100644 index 000000000..020c04153 --- /dev/null +++ b/discovery/roles/configure_xcat/common/files/yum_updated.patch @@ -0,0 +1,10 @@ +--- /opt/xcat/lib/perl/xCAT/Yum.pm 2024-09-10 07:39:16.390507177 -0400 ++++ /opt/xcat/lib/perl/xCAT/Yum_updated.pm 2024-09-10 07:31:39.687474283 -0400 +@@ -61,6 +61,7 @@ + print $yumrepofile "name=xCAT configured yum repository for $yumurl\n"; + print $yumrepofile "baseurl=$yumurl\n"; + print $yumrepofile "enabled=1\n"; ++ print $yumrepofile "proxy=_none_\n"; + print $yumrepofile "gpgcheck=0\n\n"; + } + diff --git a/discovery/roles/configure_xcat/common/tasks/configure_basic_xcat_details.yml b/discovery/roles/configure_xcat/common/tasks/configure_basic_xcat_details.yml index 6bcff6dfc..e9009856e 100644 --- a/discovery/roles/configure_xcat/common/tasks/configure_basic_xcat_details.yml +++ b/discovery/roles/configure_xcat/common/tasks/configure_basic_xcat_details.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
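Reviewer note: the two patch files above (otherpkgs_updated.patch and yum_updated.patch) append proxy=_none_ to every repository stanza that xCAT generates, so package pulls from the local Omnia repos bypass any site-wide proxy. A sketch of the stanza they produce, with an illustrative repo name and baseurl; only the proxy=_none_ line is conditional:

```python
def render_repo_stanza(name: str, baseurl: str, proxy_status: bool) -> str:
    """Render a yum .repo stanza like the patched xCAT scripts emit."""
    lines = [
        f"[{name}]",
        f"name=xCAT configured yum repository for {baseurl}",
        f"baseurl={baseurl}",
        "enabled=1",
        "gpgcheck=0",
        "skip_if_unavailable=True",
    ]
    if proxy_status:
        # proxy=_none_ tells dnf/yum to bypass any proxy configured in
        # dnf.conf or the environment for this one repository.
        lines.append("proxy=_none_")
    return "\n".join(lines) + "\n"


# Illustrative values; the real baseurl points at the OIM's admin NIC.
print(render_repo_stanza("omnia-cluster", "http://198.51.100.10:80/install/repos", True))
```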
@@ -16,7 +16,7 @@ - name: Fetch default networks table entries block: - name: Fetch default network table entries - ansible.builtin.command: lsdef -t network + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" changed_when: false register: default_network rescue: @@ -30,7 +30,7 @@ - xcatd - name: Fetch default network table entries - ansible.builtin.command: lsdef -t network + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" changed_when: false register: default_network @@ -41,46 +41,52 @@ - name: Configure site table ansible.builtin.shell: > - chdef -t site dhcpinterfaces="{{ pxe_nic }}" master="{{ pxe_nic_ip }}" dhcplease="{{ default_lease_time }}" + {{ xcat_path }}/chdef -t site dhcpinterfaces="{{ pxe_nic }}" master="{{ pxe_nic_ip }}" dhcplease="{{ default_lease_time }}" timezone="{{ timezone }}" forwarders="" nameservers="" changed_when: true -- name: Make node object for control plane +- name: Configure forwarders in site table + ansible.builtin.shell: > + {{ xcat_path }}/chdef -t site forwarders="{{ pxe_nic_forwarders }}" + changed_when: true + +- name: Make node object for Omnia Infrastructure Manager ansible.builtin.command: >- - mkdef -t node control_plane groups=control_plane mgt=ipmi cons=ipmi ip={{ admin_nic_ip }} bmc={{ bmc_nic_ip }} primarynic=mac mac={{ pxe_mac_address }} + {{ xcat_path }}/mkdef -t node oim groups=oim mgt=ipmi cons=ipmi ip={{ admin_nic_ip }} + bmc={{ bmc_nic_ip }} primarynic=mac mac={{ pxe_mac_address }} changed_when: false failed_when: false when: network_interface_type == "lom" -- name: Make node object for control plane +- name: Make node object for Omnia Infrastructure Manager ansible.builtin.command: >- - mkdef -t node control_plane groups=control_plane mgt=ipmi cons=ipmi ip={{ admin_nic_ip }} primarynic=mac mac={{ pxe_mac_address }} + {{ xcat_path }}/mkdef -t node oim groups=oim mgt=ipmi cons=ipmi ip={{ admin_nic_ip }} primarynic=mac mac={{ pxe_mac_address }} changed_when: false failed_when: false when: network_interface_type == "dedicated" - name: Update the site table ansible.builtin.shell: > - chdef -t site excludenodes="control_plane" + {{ xcat_path }}/chdef -t site excludenodes="oim" changed_when: true - name: Configure site table - ansible.builtin.command: chdef -t site nameservers="{{ pxe_nic_ip }}" + ansible.builtin.command: "{{ xcat_path }}/chdef -t site nameservers=\"{{ pxe_nic_ip }}\"" changed_when: true - name: Configure domain in site table - ansible.builtin.command: chdef -t site domain="{{ domain_name }}" + ansible.builtin.command: "{{ xcat_path }}/chdef -t site domain=\"{{ domain_name }}\"" changed_when: true - name: Configure system password ansible.builtin.shell: > - chtab key=system passwd.username=root passwd.password=`openssl passwd -1 {{ provision_password }}` + {{ xcat_sbin_path }}/chtab key=system passwd.username=root passwd.password=`openssl passwd -1 {{ provision_password }}` changed_when: true no_log: true - name: Configure admin_network in networks table ansible.builtin.shell: > - chdef -t network -o admin_network net={{ pxe_nic_subnet }} mask={{ pxe_nic_netmask }} mgtifname={{ pxe_nic }} + {{ xcat_path }}/chdef -t network -o admin_network net={{ pxe_nic_subnet }} mask={{ pxe_nic_netmask }} mgtifname={{ pxe_nic }} gateway={{ pxe_nic_ip }} dhcpserver={{ pxe_nic_ip }} tftpserver={{ pxe_nic_ip }} staticrange="{{ pxe_nic_start_range }}-{{ pxe_nic_end_range }}" dynamicrange="{{ pxe_nic_dynamic_start_range }}-{{ pxe_nic_dynamic_end_range }}" mtu={{ network_data.admin_network.MTU }} changed_when: true @@ -88,7 
+94,7 @@ - name: Configure admin_network in networks table ansible.builtin.shell: > - chdef -t network -o admin_network net={{ pxe_nic_subnet }} mask={{ pxe_nic_netmask }} mgtifname={{ pxe_nic }} + {{ xcat_path }}/chdef -t network -o admin_network net={{ pxe_nic_subnet }} mask={{ pxe_nic_netmask }} mgtifname={{ pxe_nic }} gateway={{ pxe_nic_ip }} dhcpserver={{ pxe_nic_ip }} tftpserver={{ pxe_nic_ip }} staticrange="{{ pxe_nic_start_range }}-{{ pxe_nic_end_range }}" mtu={{ network_data.admin_network.MTU }} changed_when: true diff --git a/discovery/roles/configure_xcat/common/tasks/configure_proxy.yml b/discovery/roles/configure_xcat/common/tasks/configure_proxy.yml index 84a471bec..4944948c3 100644 --- a/discovery/roles/configure_xcat/common/tasks/configure_proxy.yml +++ b/discovery/roles/configure_xcat/common/tasks/configure_proxy.yml @@ -18,11 +18,52 @@ path: "{{ squid_proxy_conf_path }}" line: "acl localnet src {{ pxe_nic_subnet }}/{{ network_data.admin_network.netmask_bits }}" insertafter: "^acl localnet src 192.168.0.0/16" - register: configure_proxy + register: configure_proxy_admin_subnet -- name: Start and enable squid service +- name: Configure proxy details in squid.conf + when: proxy_status + block: + - name: Set proxy_host and proxy_port variables + ansible.builtin.set_fact: + proxy_host: "{{ proxy[0].http_proxy | regex_search('http://([^:]+):', '\\1') }}" + proxy_port: "{{ proxy[0].http_proxy | regex_search(':(\\d+)', '\\1') }}" + + - name: Update proxy configuration + ansible.builtin.lineinfile: + path: "{{ squid_proxy_conf_path }}" + line: "{{ item.line }}" + insertafter: "{{ item.insertafter }}" + register: configure_proxy_input + with_items: + - { line: "cache_peer {{ proxy_host[0] }} parent {{ proxy_port[0] }} 0 no-query default", insertafter: "EOF" } + - { line: "never_direct allow localnet", insertafter: "^http_access allow localnet" } + +- name: Restart squid service if config changed ansible.builtin.service: name: squid state: restarted enabled: true - when: configure_proxy.changed # noqa: no-handler + when: + - configure_proxy_admin_subnet.changed + or configure_proxy_input.changed + +- name: Gathering service facts + ansible.builtin.service_facts: + +- name: Restart squid service if not started + ansible.builtin.service: + name: squid + state: restarted + enabled: true + when: "'running' not in ansible_facts.services['squid.service'].state" + +- name: Apply xcat proxy patches + ansible.posix.patch: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + state: present + with_items: "{{ xcat_proxy_patch_file }}" + when: + - proxy_status + - oim_os in oim_os_redhat or + oim_os in oim_os_rocky diff --git a/discovery/roles/configure_xcat/common/tasks/configure_xcat_passwd.yml b/discovery/roles/configure_xcat/common/tasks/configure_xcat_passwd.yml index 648d6fea4..c0baef514 100644 --- a/discovery/roles/configure_xcat/common/tasks/configure_xcat_passwd.yml +++ b/discovery/roles/configure_xcat/common/tasks/configure_xcat_passwd.yml @@ -14,13 +14,13 @@ --- - name: Update IPMI table - ansible.builtin.command: "chtab key=ipmi passwd.username={{ bmc_username }} passwd.password={{ bmc_password }}" + ansible.builtin.command: "{{ xcat_sbin_path }}/chtab key=ipmi passwd.username={{ bmc_username }} passwd.password={{ bmc_password }}" changed_when: true - name: Check for IPMI entries in password table ansible.builtin.shell: > set -o pipefail && \ - tabdump passwd | grep ipmi | wc -l + {{ xcat_sbin_path }}/tabdump passwd | grep ipmi | wc -l register: ipmi_entries changed_when: true diff 
--git a/discovery/roles/configure_xcat/common/tasks/create_osimage.yml b/discovery/roles/configure_xcat/common/tasks/create_osimage.yml index c41de909c..6f472a4d5 100644 --- a/discovery/roles/configure_xcat/common/tasks/create_osimage.yml +++ b/discovery/roles/configure_xcat/common/tasks/create_osimage.yml @@ -20,7 +20,7 @@ - name: Check diskful osimage is created ansible.builtin.shell: > set -o pipefail && \ - lsdef -t osimage | grep "{{ osimage_search_key }}" | grep "{{ provision_os }}" | grep "{{ provision_os_version }}" + {{ xcat_path }}/lsdef -t osimage | grep "{{ osimage_search_key }}" | grep "{{ provision_os }}" | grep "{{ provision_os_version }}" changed_when: false failed_when: false register: diskful_osimage_check @@ -32,5 +32,5 @@ - provision_os in diskful_osimage_check.stdout - name: Create diskful osimage - ansible.builtin.command: copycds {{ iso_file_path }} + ansible.builtin.command: "{{ xcat_sbin_path }}/copycds {{ iso_file_path }}" changed_when: true diff --git a/discovery/roles/configure_xcat/common/tasks/fetch_osimage.yml b/discovery/roles/configure_xcat/common/tasks/fetch_osimage.yml index f4d17e00c..c5021f056 100644 --- a/discovery/roles/configure_xcat/common/tasks/fetch_osimage.yml +++ b/discovery/roles/configure_xcat/common/tasks/fetch_osimage.yml @@ -16,7 +16,7 @@ - name: Fetch xcat osimage name for provision_os ansible.builtin.shell: > set -o pipefail && \ - lsdef -t osimage | grep "{{ osimage_search_key }}" | grep "{{ provision_os }}" | grep "{{ provision_os_version }}" + {{ xcat_path }}/lsdef -t osimage | grep "{{ osimage_search_key }}" | grep "{{ provision_os }}" | grep "{{ provision_os_version }}" changed_when: false register: fetch_osimage failed_when: false @@ -29,7 +29,7 @@ - name: Retry fetching xcat osimage name for provision_os ansible.builtin.shell: > set -o pipefail && \ - lsdef -t osimage | grep "{{ osimage_search_key }}" | grep "{{ provision_os }}" + {{ xcat_path }}/lsdef -t osimage | grep "{{ osimage_search_key }}" | grep "{{ provision_os }}" changed_when: false register: retry_fetch_osimage when: fetch_osimage.rc != 0 diff --git a/discovery/roles/configure_xcat/common/tasks/main.yml b/discovery/roles/configure_xcat/common/tasks/main.yml index 032c8f448..8e039d4f3 100644 --- a/discovery/roles/configure_xcat/common/tasks/main.yml +++ b/discovery/roles/configure_xcat/common/tasks/main.yml @@ -20,12 +20,12 @@ MANPATH: "{{ xcat_manpath_env }}" PERL_BADLANG: "{{ perl_badlang_env }}" block: - - name: Set control_plane_os + - name: Set oim_os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" - - name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/../{{ control_plane_os }}/vars/main.yml" + - name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/../{{ oim_os }}/vars/main.yml" - name: Check configure_xcat role pre-requisites ansible.builtin.include_tasks: pre_requisites.yml @@ -49,5 +49,5 @@ - name: Fetch osimage name ansible.builtin.include_tasks: fetch_osimage.yml - - name: Configure xCAT on {{ control_plane_os }} - ansible.builtin.include_tasks: "{{ role_path }}/../{{ control_plane_os }}/tasks/main.yml" + - name: Configure xCAT on {{ oim_os }} + ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/main.yml" diff --git a/discovery/roles/configure_xcat/common/tasks/pre_requisites.yml b/discovery/roles/configure_xcat/common/tasks/pre_requisites.yml index a7898c505..af3b74d16 100644 --- 
a/discovery/roles/configure_xcat/common/tasks/pre_requisites.yml +++ b/discovery/roles/configure_xcat/common/tasks/pre_requisites.yml @@ -19,7 +19,7 @@ repo_validation_status: true - name: Check output of network table - ansible.builtin.command: lsdef -t network + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" changed_when: false failed_when: false register: network_table_check diff --git a/discovery/roles/configure_xcat/common/tasks/update_xcat_network.yml b/discovery/roles/configure_xcat/common/tasks/update_xcat_network.yml index 9709563dc..eb77f7761 100644 --- a/discovery/roles/configure_xcat/common/tasks/update_xcat_network.yml +++ b/discovery/roles/configure_xcat/common/tasks/update_xcat_network.yml @@ -15,7 +15,7 @@ - name: Configure bmc_network in networks table with static and dynamic ranges ansible.builtin.shell: > - chdef -t network -o bmc_network net={{ bmc_nic_subnet }} mask={{ bmc_nic_netmask }} mgtifname={{ bmc_nic }} + {{ xcat_path }}/chdef -t network -o bmc_network net={{ bmc_nic_subnet }} mask={{ bmc_nic_netmask }} mgtifname={{ bmc_nic }} gateway={{ bmc_nic_ip }} dhcpserver={{ bmc_nic_ip }} dynamicrange="{{ bmc_dynamic_start_range }}-{{ bmc_dynamic_end_range }}" staticrange="{{ bmc_static_start_range }}-{{ bmc_static_end_range }}" mtu={{ network_data.bmc_network.MTU }} changed_when: true diff --git a/discovery/roles/configure_xcat/common/vars/main.yml b/discovery/roles/configure_xcat/common/vars/main.yml index 0979cb0f3..ddb0fb1e8 100644 --- a/discovery/roles/configure_xcat/common/vars/main.yml +++ b/discovery/roles/configure_xcat/common/vars/main.yml @@ -20,12 +20,14 @@ xcat_manpath_env: "/opt/xcat/share/man:$MANPATH" perl_badlang_env: 0 local_repo_config_file: "{{ role_path }}/../../../../input/local_repo_config.yml" software_config_file: "{{ role_path }}/../../../../input/software_config.json" +xcat_path: /opt/xcat/bin +xcat_sbin_path: /opt/xcat/sbin # Usage: pre_requisites.yml xcat_connection_search_key: "connection failure" -xcat_status_fail_msg: "Failed. xCAT services are not running. Please run provision.yml or prepare_cp.yml to install xCAT." -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" +xcat_status_fail_msg: "Failed. xCAT services are not running. Please run provision.yml or prepare_oim.yml to install xCAT." +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" # Usage: create_osimage.yml osimage_search_key: install-compute @@ -42,9 +44,12 @@ fail_bmc_nic: "Failed. Please provide proper bmc subnet." 
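Reviewer note: the configure_proxy.yml change earlier derives proxy_host and proxy_port from proxy[0].http_proxy with Jinja2 regex_search filters; because the patterns carry capture groups, regex_search returns a list of captures, which is why the tasks index proxy_host[0] and proxy_port[0]. A plain-Python equivalent of that extraction, with an illustrative proxy URL:

```python
import re
from urllib.parse import urlparse

http_proxy = "http://proxy.example.com:3128"  # illustrative value

# Same patterns as the play; .group(1) plays the role of the [0] index.
proxy_host = re.search(r"http://([^:]+):", http_proxy).group(1)
proxy_port = re.search(r":(\d+)", http_proxy).group(1)

# urlparse does the same job without hand-written patterns.
parsed = urlparse(http_proxy)
assert (parsed.hostname, str(parsed.port)) == (proxy_host, proxy_port)

# The extracted values feed the squid directive the task appends:
print(f"cache_peer {proxy_host} parent {proxy_port} 0 no-query default")
```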
# Usage: configue_proxy.yml squid_proxy_conf_path: /etc/squid/squid.conf +xcat_proxy_patch_file: + - { src: "{{ role_path }}/files/otherpkgs_updated.patch", dest: "/install/postscripts/otherpkgs", mode: "755" } + - { src: "{{ role_path }}/files/yum_updated.patch", dest: "/opt/xcat/lib/perl/xCAT/Yum.pm", mode: "755" } # Usage: configure_xcat_basic_details -python_version: "3.9" +python_version: "{{ ansible_python_interpreter }}" network_spec_path: "{{ role_path }}/../../../../input/network_spec.yml" path_db: "{{ role_path }}/../../db_operations/files" delete_misc_networks: "{{ role_path }}/../common/files/delete_networks.py" diff --git a/discovery/roles/configure_xcat/redhat/tasks/configure_kickstart.yml b/discovery/roles/configure_xcat/redhat/tasks/configure_kickstart.yml index 664e21d4d..81e5fe4f3 100644 --- a/discovery/roles/configure_xcat/redhat/tasks/configure_kickstart.yml +++ b/discovery/roles/configure_xcat/redhat/tasks/configure_kickstart.yml @@ -26,14 +26,11 @@ regexp: '^lang ks_language' replace: 'lang {{ language }}' -- name: Fetch control_plane hostname - ansible.builtin.command: hostname - changed_when: false - register: cp_hostname - -- name: Update control plane IP in /etc/hosts of compute node +- name: Update Omnia Infrastructure Manager IP in /etc/hosts of compute node ansible.builtin.lineinfile: path: "{{ xcat_rhel8_post_script }}" insertafter: "EOF" state: present - line: 'echo "{{ admin_nic_ip }} {{ cp_hostname.stdout }}" >> /etc/hosts' \ No newline at end of file + line: | + echo "127.0.0.1 localhost" >> /etc/hosts + echo "{{ admin_nic_ip }} {{ oim_hostname }}" >> /etc/hosts diff --git a/discovery/roles/configure_xcat/redhat/tasks/omnia_repo_config.yml b/discovery/roles/configure_xcat/redhat/tasks/omnia_repo_config.yml index 7b1aaf86e..6925692dd 100644 --- a/discovery/roles/configure_xcat/redhat/tasks/omnia_repo_config.yml +++ b/discovery/roles/configure_xcat/redhat/tasks/omnia_repo_config.yml @@ -16,7 +16,7 @@ - name: Fetch otherpkgdir path ansible.builtin.shell: > set -o pipefail && \ - lsdef -t osimage -o {{ provision_os_image }} -i otherpkgdir | grep otherpkgdir + {{ xcat_path }}/lsdef -t osimage -o {{ provision_os_image }} -i otherpkgdir | grep otherpkgdir changed_when: false register: fetch_otherpkgdir @@ -30,43 +30,50 @@ state: directory mode: "{{ file_permission }}" -- name: Fetch racadm package - ansible.builtin.get_url: - url: "{{ racadm_url }}" - dest: "{{ racadm_file }}" - mode: "{{ file_permission }}" - register: download_racadm - until: download_racadm is not failed - retries: "{{ max_retries }}" - -- name: Create racadm directory - ansible.builtin.file: - path: "{{ racadm_path }}" - state: directory - mode: "{{ file_permission }}" - -- name: Uarchive racadm package - ansible.builtin.unarchive: - src: "{{ racadm_file }}" - dest: "{{ racadm_path }}" - - name: Create omnia repo directory ansible.builtin.file: path: "{{ omnia_common_xcat_repo }}" state: directory mode: "{{ file_permission }}" -- name: Copy racadm file to omnia repo - ansible.builtin.copy: - src: "{{ racadm_rhel8_file_path }}" - dest: "{{ omnia_common_xcat_repo }}" - mode: "{{ file_permission }}" +- name: Download and configure racadm + block: + - name: Fetch racadm package + ansible.builtin.get_url: + url: "{{ racadm_url }}" + dest: "{{ racadm_file }}" + mode: "{{ file_permission }}" + register: download_racadm + until: download_racadm is not failed + retries: "{{ max_retries }}" -- name: Copy ipmitool file to omnia repo - ansible.builtin.copy: - src: "{{ ipmitool_rhel8_file_path }}" - dest: 
"{{ omnia_common_xcat_repo }}" - mode: "{{ file_permission }}" + - name: Create racadm directory + ansible.builtin.file: + path: "{{ racadm_path }}" + state: directory + mode: "{{ file_permission }}" + + - name: Uarchive racadm package + ansible.builtin.unarchive: + src: "{{ racadm_file }}" + dest: "{{ racadm_path }}" + + - name: Copy racadm file to omnia repo + ansible.builtin.copy: + src: "{{ racadm_rhel8_file_path }}" + dest: "{{ omnia_common_xcat_repo }}" + mode: "{{ file_permission }}" + + - name: Copy ipmitool file to omnia repo + ansible.builtin.copy: + src: "{{ ipmitool_rhel8_file_path }}" + dest: "{{ omnia_common_xcat_repo }}" + mode: "{{ file_permission }}" + rescue: + - name: Warning - Failed to download racadm package + ansible.builtin.pause: + prompt: "{{ download_racadm_warning_msg }}" + seconds: "{{ warning_time }}" - name: Downloading omnia common repo packages block: @@ -101,5 +108,5 @@ mode: "{{ other_pkg_list_mode }}" - name: Configure omnia pkglist to osimage - ansible.builtin.command: chdef -t osimage -o {{ provision_os_image }} otherpkglist={{ other_pkg_list_dest }} + ansible.builtin.command: "{{ xcat_path }}/chdef -t osimage -o {{ provision_os_image }} otherpkglist={{ other_pkg_list_dest }}" changed_when: true diff --git a/discovery/roles/configure_xcat/redhat/vars/main.yml b/discovery/roles/configure_xcat/redhat/vars/main.yml index 6bd84efa8..d587b52c0 100644 --- a/discovery/roles/configure_xcat/redhat/vars/main.yml +++ b/discovery/roles/configure_xcat/redhat/vars/main.yml @@ -20,6 +20,7 @@ postgresql_service: "postgresql.service" # Usage: main.yml os_supported_rhel: "redhat" provision_os_rhel: "rhel" +xcat_path: /opt/xcat/bin # Usage: configure_kickstart.yml xcat_rhel8_path: @@ -48,7 +49,7 @@ ipmitool_rhel8_file_path: /opt/racadm/iDRACTools/ipmitool/RHEL8_x86_64/ file_permission: "755" xcat_directory: /root/xcat max_retries: 10 -download_common_package_fail_msg: "Failed. Unable to download package sshpass from the repos configured in control plane. +download_common_package_fail_msg: "Failed. Unable to download package sshpass from the repos configured in Omnia Infrastructure Manager. Enable repos which can be used to download sshpass and re-run the provision.yml to install sshpass during provisioning" other_pkg_list_dir: "/install/post/custom/{{ provision_os }}{{ provision_os_version }}" other_pkg_list_src: "{{ role_path }}/../redhat/files/omnia.pkglist" @@ -58,21 +59,23 @@ crb_repo_packages: "libaec lua-posix lua-filesystem munge-devel perl-Switch rrdt perl-File-BaseDir opencl-headers ocl-icd-devel pmix-devel" download_crb_package_fail_msg: "Failed. Unable to download required crb packages libaec, lua-posix, lua-filesystem, munge-devel, perl-Switch, rrdtool-devel, lua-devel, hwloc-devel, http-parser-devel, perl-File-BaseDir, opencl-headers, ocl-icd-devel and pmix-devel. -Make sure crb repos configured in control plane and re-run provision.yml" +Make sure crb repos configured in Omnia Infrastructure Manager and re-run provision.yml" epel_repo_packages: "slurm-slurmd slurm-pmi slurm-slurmctld slurm-slurmdbd slurm-slurmrestd man2html Lmod hdf5 libjwt lua-term slurm slurm-libs man2html-core dkms perl-URI-Encode rocm-comgr rocm-opencl rocm-opencl-devel pdsh pdsh-rcmd-ssh" -download_epel_package_fail_msg: "Failed. Unable to download packages from the epel repo configured in control plane. +download_epel_package_fail_msg: "Failed. Unable to download packages from the epel repo configured in Omnia Infrastructure Manager. 
This might be due to internet connectivity issue when accessing the epel repository. Try re-running playbook after sometime." rhel_epel_repo8: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm custom_software_repo: "{{ other_pkg_dir }}/custom_software/Packages" -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" download_apptainer_fail_msg: "Failed. Unable to download apptainer package from epel repository. This might be due to internet connectivity issue when accessing the epel repository. Try re-running playbook after sometime." apptainer_packages: "apptainer fakeroot fakeroot-libs squashfuse squashfuse-libs" telemetry_pckg_path: "{{ role_path }}/../redhat/files/telemetry_pkglist" telemetry_pkg_list_src: "{{ role_path }}/../redhat/files/telemetry.pkglist" +download_racadm_warning_msg: "[WARNING] Failed to download racadm package {{ racadm_url }}. This can be due to an internet connectivity issue. Please verify the connectivity and try again. Skipping racadm configuration in the cluster." # noqa: yaml[line-length] +warning_time: 30 # Usage: manage_subscription_manager.yml rhsm_release_file_path: "/opt/omnia/.data/rhsm_release" -conf_file_mode: "0644" \ No newline at end of file +conf_file_mode: "0644" diff --git a/discovery/roles/configure_xcat/ubuntu/tasks/configure_kickstart.yml b/discovery/roles/configure_xcat/ubuntu/tasks/configure_kickstart.yml index 1bea61c35..04d22d95d 100644 --- a/discovery/roles/configure_xcat/ubuntu/tasks/configure_kickstart.yml +++ b/discovery/roles/configure_xcat/ubuntu/tasks/configure_kickstart.yml @@ -27,7 +27,7 @@ - name: Fetch ubuntu 22 directory name ansible.builtin.shell: > set -o pipefail && \ - lsdef -t osimage -o {{ provision_os_image }} | grep osvers + {{ xcat_path }}/lsdef -t osimage -o {{ provision_os_image }} | grep osvers changed_when: false register: fetch_ubuntu22_directory @@ -39,7 +39,7 @@ ansible.builtin.file: path: "{{ ubuntu22_unwanted_package }}" state: absent - + - name: Copy ubuntu kickstart files to xcat configuration path ansible.builtin.template: src: "{{ item.src }}" diff --git a/discovery/roles/configure_xcat/ubuntu/tasks/racadm_config.yml b/discovery/roles/configure_xcat/ubuntu/tasks/racadm_config.yml index c11ac39c4..00f58e510 100644 --- a/discovery/roles/configure_xcat/ubuntu/tasks/racadm_config.yml +++ b/discovery/roles/configure_xcat/ubuntu/tasks/racadm_config.yml @@ -19,11 +19,18 @@ state: directory mode: "{{ file_permission }}" -- name: Fetch racadm package - ansible.builtin.get_url: - url: "{{ racadm_tar_url }}" - dest: "{{ racadm_tar_dest }}" - mode: "{{ file_permission }}" - register: download_racadm - until: download_racadm is not failed - retries: "{{ max_retries }}" +- name: Download racadm + block: + - name: Fetch racadm package + ansible.builtin.get_url: + url: "{{ racadm_tar_url }}" + dest: "{{ racadm_tar_dest }}" + mode: "{{ file_permission }}" + register: download_racadm + until: download_racadm is not failed + retries: "{{ max_retries }}" + rescue: + - name: Warning - Failed to download racadm package + ansible.builtin.pause: + prompt: "{{ download_racadm_warning_msg }}" + seconds: "{{ warning_time }}" diff --git a/discovery/roles/configure_xcat/ubuntu/vars/main.yml b/discovery/roles/configure_xcat/ubuntu/vars/main.yml index 27ebf0735..6e8bc0aef 100644 --- a/discovery/roles/configure_xcat/ubuntu/vars/main.yml +++ b/discovery/roles/configure_xcat/ubuntu/vars/main.yml @@ -32,12 +32,16 @@ remoteshell_script_path:
/install/postscripts/remoteshell remoteshell_regexp: "^(.*)MaxStartups 1024(.*)" pre_scripts_ubuntu_path: - { src: "{{ role_path }}/../ubuntu/templates/temp_pre_scripts_ubuntu.j2", dest: "/opt/xcat/share/xcat/install/scripts/pre.ubuntu.subiquity", mode: "0644" } +xcat_path: /opt/xcat/bin # Usage: racadm_config.yml racadm_tar_url: "https://dl.dell.com/FOLDER08952875M/1/Dell-iDRACTools-Web-LX-11.0.0.0-5139_A00.tar.gz" tarball_path: "{{ repo_store_path }}/cluster/tarball/" racadm_tar_dest: "{{ tarball_path }}/racadm.tar.gz" max_retries: 10 +download_racadm_warning_msg: "[WARNING] Failed to download racadm package {{ racadm_tar_url }}. This can be due to an internet connectivity issue. Please verify the connectivity and try again. +Skipping racadm configuration in the cluster." +warning_time: 30 # Usage: configure_kernel.yml hwe_search_key: "casper/hwe-initrd" diff --git a/discovery/roles/db_operations/files/add_cp_db.py b/discovery/roles/db_operations/files/add_oim_db.py similarity index 82% rename from discovery/roles/db_operations/files/add_cp_db.py rename to discovery/roles/db_operations/files/add_oim_db.py index bd0224291..8c0735cdc 100644 --- a/discovery/roles/db_operations/files/add_cp_db.py +++ b/discovery/roles/db_operations/files/add_oim_db.py @@ -20,19 +20,19 @@ admin_nic_ip = sys.argv[1] network_interface_type = sys.argv[2] pxe_mac_address = sys.argv[3] -cp_hostname = sys.argv[4] +oim_hostname = sys.argv[4] bmc_default = "0.0.0.0" if num_args == 5: - bmc_nic_ip = sys.argv[5] + bmc_nic_ip = sys.argv[5] else: bmc_nic_ip = "0.0.0.0" -node_name = "control_plane" +node_name = "oim" admin_nic_ip = ipaddress.IPv4Address(admin_nic_ip) bmc_nic_ip = ipaddress.IPv4Address(bmc_nic_ip) -def cp_details_db(): +def oim_details_db(): conn = omniadb_connection.create_connection() cursor = conn.cursor() sql = "select admin_mac from cluster.nodeinfo where admin_mac=%s" @@ -40,9 +40,9 @@ def cp_details_db(): pxe_mac_op = cursor.fetchone() if pxe_mac_op is None: if str(bmc_nic_ip) == "0.0.0.0": - omniadb_connection.insert_node_info(None, node_name, cp_hostname, pxe_mac_address, admin_nic_ip, None, None, None, None, None, None) + omniadb_connection.insert_node_info(None, node_name, oim_hostname, pxe_mac_address, admin_nic_ip, None, None, None, None, None, None) else: - omniadb_connection.insert_node_info(None, node_name, cp_hostname, pxe_mac_address, admin_nic_ip, bmc_nic_ip, None, None, None, None, None) + omniadb_connection.insert_node_info(None, node_name, oim_hostname, pxe_mac_address, admin_nic_ip, bmc_nic_ip, None, None, None, None, None) -cp_details_db() +oim_details_db() \ No newline at end of file diff --git a/discovery/roles/db_operations/files/omniadb_connection.py b/discovery/roles/db_operations/files/omniadb_connection.py index c15e2be0c..b04ac6973 100644 --- a/discovery/roles/db_operations/files/omniadb_connection.py +++ b/discovery/roles/db_operations/files/omniadb_connection.py @@ -15,11 +15,14 @@ import psycopg2 as pg from cryptography.fernet import Fernet -with open('/opt/omnia/.postgres/.postgres_pass.key', 'rb') as passfile: +key_file_path = '/opt/omnia/.postgres/.postgres_pass.key' +pass_file_path = '/opt/omnia/.postgres/.encrypted_pwd' + +with open(key_file_path, 'rb') as passfile: key = passfile.read() fernet = Fernet(key) -with open('/opt/omnia/.postgres/.encrypted_pwd', 'rb') as datafile: +with open(pass_file_path, 'rb') as datafile: encrypted_file_data = datafile.read() decrypted_pwd = fernet.decrypt(encrypted_file_data).decode() @@ -35,7 +38,6 @@
conn.autocommit = True return conn - def create_connection_xcatdb(): # Create database connection conn = pg.connect( diff --git a/discovery/roles/db_operations/tasks/main.yml b/discovery/roles/db_operations/tasks/main.yml index 456cd996d..6aa756cb6 100644 --- a/discovery/roles/db_operations/tasks/main.yml +++ b/discovery/roles/db_operations/tasks/main.yml @@ -13,5 +13,5 @@ # limitations under the License. --- -- name: Enter control_plane details in cluster.nodeinfo table - ansible.builtin.include_tasks: cp_details_db.yml +- name: Enter oim details in cluster.nodeinfo table + ansible.builtin.include_tasks: oim_details_db.yml diff --git a/server_spec_update/roles/create_nicinfo_db/vars/main.yml b/discovery/roles/db_operations/tasks/oim_details_db.yml similarity index 70% rename from server_spec_update/roles/create_nicinfo_db/vars/main.yml rename to discovery/roles/db_operations/tasks/oim_details_db.yml index ba9944e5a..cce17873c 100644 --- a/server_spec_update/roles/create_nicinfo_db/vars/main.yml +++ b/discovery/roles/db_operations/tasks/oim_details_db.yml @@ -13,7 +13,8 @@ # limitations under the License. --- -# Usage: add_nic_db.yml -add_nic_db_path: "{{ role_path }}/files/additional_nic_table.py" -network_spec_path: "{{ role_path }}/../../../input/network_spec.yml" -node_db_path: "{{ role_path }}/../../../discovery/roles/db_operations/files" +- name: Create oim entry in cluster_info table + ansible.builtin.command: | + {{ python_version }} {{ oim_db_utility_path }} {{ admin_nic_ip }} {{ network_interface_type }} {{ pxe_mac_address }} + {{ oim_hostname }} {{ bmc_nic_ip }} + changed_when: false diff --git a/discovery/roles/db_operations/vars/main.yml b/discovery/roles/db_operations/vars/main.yml index 1d507c19f..072e5bc5e 100644 --- a/discovery/roles/db_operations/vars/main.yml +++ b/discovery/roles/db_operations/vars/main.yml @@ -13,6 +13,6 @@ # limitations under the License. 
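Reviewer note: omniadb_connection.py above reconstructs the postgres password from two files, a Fernet key and an encrypted blob. A minimal round-trip sketch of that scheme; the plaintext here is illustrative, and in the module the key and token come from .postgres_pass.key and .encrypted_pwd respectively:

```python
from cryptography.fernet import Fernet

# Writer side (done elsewhere in Omnia): create a key, encrypt the password.
key = Fernet.generate_key()
token = Fernet(key).encrypt(b"postgres-password")  # illustrative plaintext

# Reader side, as omniadb_connection.py does after reading both files:
decrypted_pwd = Fernet(key).decrypt(token).decode()
assert decrypted_pwd == "postgres-password"
```

Keeping the key and the ciphertext in separate root-owned files means neither file alone reveals the password.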
--- -# Usage: cp_details_db.yml -python_version: "python3.9" -cp_db_utility_path: "{{ role_path }}/files/add_cp_db.py" +# Usage: oim_details_db.yml +python_version: "{{ ansible_python_interpreter }}" +oim_db_utility_path: "{{ role_path }}/files/add_oim_db.py" diff --git a/discovery/roles/discovery_mechanism/common/tasks/configure_dhcp.yml b/discovery/roles/discovery_mechanism/common/tasks/configure_dhcp.yml index 4719bdb52..9c2083652 100644 --- a/discovery/roles/discovery_mechanism/common/tasks/configure_dhcp.yml +++ b/discovery/roles/discovery_mechanism/common/tasks/configure_dhcp.yml @@ -16,7 +16,7 @@ - name: Task for creating DHCP configuration block: - name: Create DHCP configuration - ansible.builtin.command: makedhcp -n + ansible.builtin.command: "{{ xcat_sbin_path }}/makedhcp -n" changed_when: true register: create_dhcp_config rescue: @@ -28,7 +28,7 @@ - name: Task for adding hosts entry block: - name: Add hosts entry - ansible.builtin.command: makehosts all + ansible.builtin.command: "{{ xcat_sbin_path }}/makehosts all" changed_when: true register: create_hosts_entry rescue: @@ -40,7 +40,7 @@ - name: Task for applying DHCP configuration block: - name: Apply DHCP Configuration - ansible.builtin.command: makedhcp -a + ansible.builtin.command: "{{ xcat_sbin_path }}/makedhcp -a" changed_when: true register: apply_dhcp_config rescue: @@ -52,7 +52,7 @@ - name: Task for creating DNS configuration block: - name: Create DNS configuration - ansible.builtin.command: makedns -n + ansible.builtin.command: "{{ xcat_sbin_path }}/makedns -n" changed_when: true register: dns_config rescue: diff --git a/discovery/roles/discovery_mechanism/common/tasks/remove_old_ssh_key.yml b/discovery/roles/discovery_mechanism/common/tasks/remove_old_ssh_key.yml index b334c5256..0abcf8076 100644 --- a/discovery/roles/discovery_mechanism/common/tasks/remove_old_ssh_key.yml +++ b/discovery/roles/discovery_mechanism/common/tasks/remove_old_ssh_key.yml @@ -17,8 +17,9 @@ community.postgresql.postgresql_query: db: omniadb login_user: postgres - query: SELECT admin_ip,hostname,node FROM cluster.nodeinfo where (node!='control_plane'); + query: SELECT admin_ip,hostname,node FROM cluster.nodeinfo where (node!='oim'); login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: query_status failed_when: false diff --git a/discovery/roles/discovery_mechanism/common/vars/main.yml b/discovery/roles/discovery_mechanism/common/vars/main.yml index 918cfbd00..1f12e536e 100644 --- a/discovery/roles/discovery_mechanism/common/vars/main.yml +++ b/discovery/roles/discovery_mechanism/common/vars/main.yml @@ -28,3 +28,4 @@ dhcp_config_fail_msg: "Failed. makedhcp -n command is not successful. Error:" hosts_entry_warning_msg: "[WARNING] makehosts command is not successful. Error:" dhcp_config_apply_fail_msg: "Failed. makedhcp -a command is not successful. Error:" dns_config_warning_msg: "[WARNING] makedns -n command is not successful. Error:" +xcat_sbin_path: /opt/xcat/sbin diff --git a/discovery/roles/discovery_mechanism/mapping/files/mapping_file_db.py b/discovery/roles/discovery_mechanism/mapping/files/mapping_file_db.py index 7bd30717c..46e850ae5 100644 --- a/discovery/roles/discovery_mechanism/mapping/files/mapping_file_db.py +++ b/discovery/roles/discovery_mechanism/mapping/files/mapping_file_db.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
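Reviewer note: the mapping_file_db.py hunk that follows normalizes the user-supplied mapping path with os.path.abspath and reads it with pandas. Assuming the mapping file is the CSV implied by the node-object tasks later in this patch (ADMIN_MAC, ADMIN_IP, HOSTNAME, SERVICE_TAG columns), a small sketch of that ingestion step, with whitespace trimming comparable to the | trim filters used in the mkdef tasks:

```python
import os
import sys

import pandas as pd


def load_mapping(path):
    """Read a PXE mapping file and trim stray whitespace from key columns."""
    mapping = pd.read_csv(os.path.abspath(path))  # abspath, as in the helpers
    for col in ("ADMIN_MAC", "ADMIN_IP", "HOSTNAME", "SERVICE_TAG"):
        if col in mapping.columns:
            mapping[col] = mapping[col].astype(str).str.strip()
    return mapping.to_dict(orient="records")


if __name__ == "__main__":
    for row in load_mapping(sys.argv[1]):
        print(row["HOSTNAME"], row["ADMIN_MAC"], row["ADMIN_IP"])
```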
-import sys +import sys, os import pandas as pd import ipaddress @@ -21,7 +21,7 @@ import omniadb_connection -pxe_mapping_path = sys.argv[2] +pxe_mapping_path = os.path.abspath(sys.argv[2]) domain_name = sys.argv[3] discovery_mechanism = "mapping" admin_mac = [] @@ -79,4 +79,4 @@ def mapping_file_db_update(): conn.close() -mapping_file_db_update() +mapping_file_db_update() \ No newline at end of file diff --git a/discovery/roles/discovery_mechanism/mapping/tasks/check_nodes_mapping.yml b/discovery/roles/discovery_mechanism/mapping/tasks/check_nodes_mapping.yml index 159c338b2..7a6fb432d 100644 --- a/discovery/roles/discovery_mechanism/mapping/tasks/check_nodes_mapping.yml +++ b/discovery/roles/discovery_mechanism/mapping/tasks/check_nodes_mapping.yml @@ -18,7 +18,7 @@ mapping_node_status: false - name: Fetch nodes with group {{ mapping_node_group }} - ansible.builtin.command: lsdef {{ mapping_node_group }} + ansible.builtin.command: "{{ xcat_path }}/lsdef {{ mapping_node_group }}" changed_when: false register: check_mapping_nodes failed_when: false diff --git a/discovery/roles/discovery_mechanism/mapping/tasks/node_object_creation_mapping.yml b/discovery/roles/discovery_mechanism/mapping/tasks/node_object_creation_mapping.yml index a1f21d389..252c06bbf 100644 --- a/discovery/roles/discovery_mechanism/mapping/tasks/node_object_creation_mapping.yml +++ b/discovery/roles/discovery_mechanism/mapping/tasks/node_object_creation_mapping.yml @@ -18,7 +18,7 @@ mapping_bmc_node_group: "mapping_bmc" - name: Fetch all node object list - ansible.builtin.command: lsdef all -i mac + ansible.builtin.command: "{{ xcat_path }}/lsdef all -i mac" changed_when: false failed_when: false register: node_mac_list @@ -30,7 +30,7 @@ - name: Create node object from mapping file provided by user ansible.builtin.shell: > - mkdef -t node "{{ mapping_file_output.list[idx1].HOSTNAME.split('.')[0] }}" groups={{ mapping_node_group }},all + {{ xcat_path }}/mkdef -t node "{{ mapping_file_output.list[idx1].HOSTNAME.split('.')[0] }}" groups={{ mapping_node_group }},all ip="{{ mapping_file_output.list[idx1].ADMIN_IP | trim }}" mac="{{ mapping_file_output.list[idx1].ADMIN_MAC | trim }}" hostnames="{{ mapping_file_output.list[idx1].HOSTNAME | trim }}" serial="{{ mapping_file_output.list[idx1].SERVICE_TAG | trim }}" netboot=xnba arch=x86_64 -f @@ -44,7 +44,7 @@ - name: Create node object from mapping file provided by user ansible.builtin.shell: > - mkdef -t node "{{ mapping_file_output.list[idx1].HOSTNAME.split('.')[0] }}" groups={{ mapping_node_group }},{{ mapping_bmc_node_group }},all + {{ xcat_path }}/mkdef -t node "{{ mapping_file_output.list[idx1].HOSTNAME.split('.')[0] }}" groups={{ mapping_node_group }},{{ mapping_bmc_node_group }},all ip="{{ mapping_file_output.list[idx1].ADMIN_IP | trim }}" bmc="{{ mapping_file_output.list[idx1].BMC_IP | trim }}" mgt=ipmi mac="{{ mapping_file_output.list[idx1].ADMIN_MAC | trim }}" serial="{{ mapping_file_output.list[idx1].SERVICE_TAG | trim }}" hostnames="{{ mapping_file_output.list[idx1].HOSTNAME | trim }}" netboot=xnba arch=x86_64 -f diff --git a/discovery/roles/discovery_mechanism/mapping/tasks/set_provision_image_mapping.yml b/discovery/roles/discovery_mechanism/mapping/tasks/set_provision_image_mapping.yml deleted file mode 100644 index 8440a255c..000000000 --- a/discovery/roles/discovery_mechanism/mapping/tasks/set_provision_image_mapping.yml +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Task for set osimage to node object for mapping - block: - - name: Set osimage to node object for mapping - ansible.builtin.command: nodeset {{ mapping_node_group }} osimage={{ hostvars['localhost']['provision_os_image'] }} - changed_when: true - register: set_osimage_mapping - until: set_osimage_mapping is not failed - retries: "{{ max_retries }}" - - - name: Installation status for mapping - ansible.builtin.debug: - msg: "{{ mapping_xcat_install_success_msg }}" - rescue: - - name: Verify set node object is successful for mapping - ansible.builtin.debug: - msg: "{{ mapping_set_osimage_warning_msg }} {{ set_osimage_mapping.stderr }}" - when: set_osimage_mapping.stderr is defined diff --git a/discovery/roles/discovery_mechanism/mapping/vars/main.yml b/discovery/roles/discovery_mechanism/mapping/vars/main.yml index 0c5ba1709..b064c1713 100644 --- a/discovery/roles/discovery_mechanism/mapping/vars/main.yml +++ b/discovery/roles/discovery_mechanism/mapping/vars/main.yml @@ -16,6 +16,7 @@ # Usage: main.yml discovery_mech_vars_common: - "{{ role_path }}/../common/vars/main.yml" +xcat_path: /opt/xcat/bin # Usage: update_db_mapping.yml mapping_file_db_path: "{{ role_path }}/../mapping/files/mapping_file_db.py" @@ -29,6 +30,3 @@ file_mode: "0644" # Usage: check_nodes_mapping.yml mapping_nodes_warning_msg: "[WARNING] Nodes provided in mapping file were already provisioned by omnia using other discovery mechanism." warning_wait_time: 10 - -# Usage: set_provision_image_mapping.yml -max_retries: 5 diff --git a/discovery/roles/discovery_mechanism/mtms/files/bmc_discover_ranges.py b/discovery/roles/discovery_mechanism/mtms/files/bmc_discover_ranges.py index 6b16f1286..095b54f2a 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/bmc_discover_ranges.py +++ b/discovery/roles/discovery_mechanism/mtms/files/bmc_discover_ranges.py @@ -12,23 +12,42 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +This module provides functionality for running BMC discovery on a range of IP addresses. 
+""" -import sys +import re +import sys, os import subprocess import calculate_ip_details + +def validate(ip_range): + # Define regex patterns + cidr_pattern = r'^(\d{1,3}\.){3}\d{1,3}/\d{1,2}$' + range_pattern = r'^(\d{1,3}\.){3}\d{1,3}-\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$' + valid_pattern = f'({cidr_pattern}|{range_pattern}|0.0.0.0)' + ip_range = ip_range.strip() + + if not re.fullmatch(valid_pattern, ip_range): + raise ValueError("Invalid IP range format") + return ip_range + if len(sys.argv) <= 3: bmc_dynamic_range = sys.argv[1] - dynamic_stanza = sys.argv[2] + bmc_dynamic_range = validate(bmc_dynamic_range) + dynamic_stanza = os.path.abspath(sys.argv[2]) + + # Pass proper variables if len(sys.argv) > 3: discovery_ranges = sys.argv[1] - discover_stanza = sys.argv[2] + discover_stanza = os.path.abspath(sys.argv[2]) bmc_static_subnet = sys.argv[3] - static_stanza = sys.argv[4] + static_stanza = os.path.abspath(sys.argv[4]) netmask_bits = sys.argv[5] bmc_static_range = sys.argv[6] - + bmc_static_range = validate(bmc_static_range) def cal_ranges(start_ip, end_ip): """ @@ -72,7 +91,7 @@ def create_ranges_dynamic(bmc_mode): Calls: if range is valid, call the function run_bmc_discover, for running bmcdiscovery. - """ + """ temp = bmc_dynamic_range.split('-') start_ip = temp[0].split('.') end_ip = temp[1].split('.') @@ -115,7 +134,8 @@ def create_ranges_discovery(bmc_mode): """ discover_range_list = discovery_ranges.split(',') for ip_range in discover_range_list: - temp = ip_range.split('-') + ip_obj = validate(ip_range) + temp = ip_obj.split('-') start_ip = temp[0].split('.') end_ip = temp[1].split('.') discover_subnet = calculate_ip_details.cal_ip_details(temp[0], netmask_bits)[1] @@ -146,15 +166,13 @@ def run_bmc_discover(final_range, stanza_path, bmc_mode): Proper stanza file with results of bmcdiscovery, else it gets timed out. 
""" - command_list = "" + if bmc_mode == "static" or bmc_mode == "discovery": - command = f"bmcdiscover --range {final_range} -z" - command_list = command.split() + command = ["/opt/xcat/bin/bmcdiscover", "--range", final_range, "-z"] elif bmc_mode == "dynamic": - command = f"bmcdiscover --range {final_range} -z -w" - command_list = command.split() + command = ["/opt/xcat/bin/bmcdiscover", "--range", final_range, "-z", "-w"] try: - node_objs = subprocess.run(command_list, capture_output=True, timeout=600) + node_objs = subprocess.run(command, capture_output=True, timeout=600, check=True) with open(stanza_path, 'r+') as f: f.write(node_objs.stdout.decode()) except subprocess.TimeoutExpired: @@ -180,4 +198,4 @@ def create_ranges(): create_ranges_dynamic(bmc_mode) -create_ranges() +create_ranges() \ No newline at end of file diff --git a/discovery/roles/discovery_mechanism/mtms/files/calculate_ip_details.py b/discovery/roles/discovery_mechanism/mtms/files/calculate_ip_details.py index 40bac3494..ccab8c3c6 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/calculate_ip_details.py +++ b/discovery/roles/discovery_mechanism/mtms/files/calculate_ip_details.py @@ -78,8 +78,9 @@ def calculate_binary_ip(ip): """ try: octets = map(int, ip.split('.')) - binary = ''.join(f'{octet:08b}' for octet in octets) - return binary + if octets: + binary = ''.join(f'{octet:08b}' for octet in octets) + return binary except ValueError: return "Invalid IP address" diff --git a/discovery/roles/discovery_mechanism/mtms/files/create_dynamic_ip_list.py b/discovery/roles/discovery_mechanism/mtms/files/create_dynamic_ip_list.py index fde6338a9..b1b2334a4 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/create_dynamic_ip_list.py +++ b/discovery/roles/discovery_mechanism/mtms/files/create_dynamic_ip_list.py @@ -14,7 +14,7 @@ import re import ipaddress -import sys +import sys, os from ipaddress import IPv4Address import subprocess @@ -26,7 +26,7 @@ ip_list = [] valid_ip_list = [] -dhcp_file_path = sys.argv[5] +dhcp_file_path = os.path.abspath(sys.argv[5]) dynamic_ip_path = "/opt/omnia/dynamic_ip_list" @@ -42,13 +42,13 @@ def create_temp_ip_list(): # declaring the regex pattern for IP addresses pattern = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})') - - # extracting the IP addresses - for line in fstring: - if pattern.search(line) is not None: - ip = IPv4Address(pattern.search(line)[0]) - ip_list.append(ip) - ip = ipaddress.ip_address(f'{ip}') + if fstring: + # extracting the IP addresses + for line in fstring: + if pattern.search(line) is not None: + ip = IPv4Address(pattern.search(line)[0]) + ip_list.append(ip) + ip = ipaddress.ip_address(f'{ip}') file.close() extract_possible_bmc_ip() diff --git a/discovery/roles/discovery_mechanism/mtms/files/delete_misc_node_obj.py b/discovery/roles/discovery_mechanism/mtms/files/delete_misc_node_obj.py index 18c20605f..2ff9de18f 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/delete_misc_node_obj.py +++ b/discovery/roles/discovery_mechanism/mtms/files/delete_misc_node_obj.py @@ -41,7 +41,7 @@ def extract_nodes(): op = cursor.fetchone()[0] if op: print(op) - command = f"rmdef {node[0]}" + command = f"/opt/xcat/bin/rmdef {node[0]}" command_list = command.split() subprocess.run(command_list, capture_output=True) diff --git a/discovery/roles/discovery_mechanism/mtms/files/modify_network_details.py b/discovery/roles/discovery_mechanism/mtms/files/modify_network_details.py index 9fcc228e1..b126ad208 100644 --- 
a/discovery/roles/discovery_mechanism/mtms/files/modify_network_details.py +++ b/discovery/roles/discovery_mechanism/mtms/files/modify_network_details.py @@ -15,7 +15,7 @@ import ipaddress import sys import re -control_plane = "control_plane" +oim = "oim" def extract_serial_bmc(stanza_path): @@ -106,7 +106,7 @@ def cal_uncorrelated_admin_ip(cursor, uncorrelated_admin_start_ip, admin_static_ admin_ip: A valid uncorrelated admin_ip for the node. """ - sql = f'''select admin_ip from cluster.nodeinfo where node!='control_plane' ORDER BY id DESC LIMIT 1''' + sql = f'''select admin_ip from cluster.nodeinfo where node!='oim' ORDER BY id DESC LIMIT 1''' cursor.execute(sql) last_admin_ip = cursor.fetchone() uncorr_output = check_presence_admin_ip(cursor, uncorrelated_admin_start_ip) diff --git a/discovery/roles/discovery_mechanism/mtms/files/mtms_db.py b/discovery/roles/discovery_mechanism/mtms/files/mtms_db.py index 2e92d7c50..047013e94 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/mtms_db.py +++ b/discovery/roles/discovery_mechanism/mtms/files/mtms_db.py @@ -14,7 +14,7 @@ import ipaddress -import sys +import sys, os import warnings import correlation_admin_bmc import modify_network_details @@ -29,14 +29,14 @@ bmc_static_subnet = sys.argv[3] bmc_dynamic_range = sys.argv[4] bmc_dynamic_subnet = bmc_static_subnet -static_stanza_path = sys.argv[5] -dynamic_stanza_path = sys.argv[6] +static_stanza_path = os.path.abspath(sys.argv[5]) +dynamic_stanza_path = os.path.abspath(sys.argv[6]) node_name = sys.argv[7] domain_name = sys.argv[8] admin_static_range = sys.argv[9] admin_subnet = sys.argv[10] netmask_bits = sys.argv[11] -discover_stanza_path = sys.argv[12] +discover_stanza_path = os.path.abspath(sys.argv[12]) correlation_status = sys.argv[13] uncorrelated_admin_start_ip = ipaddress.IPv4Address(sys.argv[14]) discovery_mechanism = "mtms" diff --git a/discovery/roles/discovery_mechanism/mtms/files/mtms_dhcp_db.py b/discovery/roles/discovery_mechanism/mtms/files/mtms_dhcp_db.py index ff04a6097..56c5658e9 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/mtms_dhcp_db.py +++ b/discovery/roles/discovery_mechanism/mtms/files/mtms_dhcp_db.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys +import re +import sys, os import warnings import ipaddress import correlation_admin_bmc @@ -23,7 +24,7 @@ import omniadb_connection -dynamic_stanza_path = sys.argv[1] +dynamic_stanza_path = os.path.abspath(sys.argv[1]) node_name = sys.argv[2] domain_name = sys.argv[3] pxe_subnet = sys.argv[4] diff --git a/discovery/roles/discovery_mechanism/mtms/files/update_bmc_network.py b/discovery/roles/discovery_mechanism/mtms/files/update_bmc_network.py index 3ffae5e94..1da4aecb5 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/update_bmc_network.py +++ b/discovery/roles/discovery_mechanism/mtms/files/update_bmc_network.py @@ -12,16 +12,41 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +This module provides functionality for updating the xCAT networks table +with details of various BMC discovery ranges inserted as a network. 
+""" + +import re import subprocess import sys - import calculate_ip_details +def validate(ip_range): + # Define regex patterns + cidr_pattern = r'^(\d{1,3}\.){3}\d{1,3}/\d{1,2}$' + range_pattern = r'^(\d{1,3}\.){3}\d{1,3}-\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$' + valid_pattern = f'({cidr_pattern}|{range_pattern}|0.0.0.0)' + ip_range = ip_range.strip() + if not re.fullmatch(valid_pattern, ip_range): + raise ValueError("Invalid IP range format") + return ip_range + discovery_ranges = sys.argv[1] netmask_bits = sys.argv[2] def create_network_name(nw_name, subnet): + """ + Create a network name based on the given network name and subnet. + + Args: + nw_name (str): The base network name. + subnet (str): The subnet address. + + Returns: + str: The generated network name. + """ subnet = str(subnet) temp = subnet.split('.') n_w_name = nw_name + "_" + temp[0] + "_" + temp[1] + "_" + temp[2] + "_" + temp[3] @@ -30,24 +55,26 @@ def create_network_name(nw_name, subnet): def update_networks_table(): """ - Insert the network details in the xCAT networks table - Returns: - an updated networks table with details of various bmc discovery ranges inserted as a network. + Insert the network details in the xCAT networks table + Returns: + an updated networks table with details of various bmc discovery ranges inserted as a network. """ - ip_address = discovery_ranges.split(',') - for ip in ip_address: - start_ip = ip.split('-')[0] - end_ip = ip.split('-')[1] + ip_ranges = [] + if discovery_ranges: + ip_ranges = discovery_ranges.split(',') + + for ip_range in ip_ranges: + ip_obj = validate(ip_range) + start_ip, end_ip = ip_obj.split('-') details = calculate_ip_details.cal_ip_details(start_ip, netmask_bits) netmask = details[0] subnet = details[1] network_name = create_network_name("bmc_network", subnet) - command = f"chdef -t network -o {network_name} net={subnet} mask={netmask} staticrange={start_ip}-{end_ip}" - command_list = command.split() + command = ["/opt/xcat/bin/chdef", "-t", "network", "-o", network_name, f"net={subnet}", f"mask={netmask}", f"staticrange={start_ip}-{end_ip}"] try: - subprocess.run(command_list, capture_output=True) - except Exception as e: - print({e}) + subprocess.run(command, capture_output=True, check=True) + except subprocess.CalledProcessError as e: + print(f"Error: {e}") -update_networks_table() +update_networks_table() \ No newline at end of file diff --git a/discovery/roles/discovery_mechanism/mtms/files/update_node_objects.py b/discovery/roles/discovery_mechanism/mtms/files/update_node_objects.py index 9795c0d58..51c3ffc5c 100644 --- a/discovery/roles/discovery_mechanism/mtms/files/update_node_objects.py +++ b/discovery/roles/discovery_mechanism/mtms/files/update_node_objects.py @@ -31,11 +31,11 @@ def get_node_obj(): """ - Get a list of node objects present in control plane + Get a list of node objects present in Omnia Infrastrcuture Management (OIM) node Returns: formed a list of node object names """ - command = "lsdef" + command = "/opt/xcat/bin/lsdef" node_objs = subprocess.run(command.split(), capture_output=True) temp = str(node_objs.stdout).split('\n') for i in range(0, len(temp) - 1): @@ -77,21 +77,21 @@ def update_node_obj_nm(): if mode is None: print("No device is found!") if mode == "static": - command = ["chdef", node_name[0], f"ip={admin_ip[0]}", f"groups={groups_static}", + command = ["/opt/xcat/bin/chdef", node_name[0], f"ip={admin_ip[0]}", f"groups={groups_static}", f"chain={chain_setup},{chain_os}"] subprocess.run(command) if mode == "discovery": - command = 
["chdef", node_name[0], f"ip={admin_ip[0]}", f"groups={groups_discover}", + command = ["/opt/xcat/bin/chdef", node_name[0], f"ip={admin_ip[0]}", f"groups={groups_discover}", f"chain={chain_setup},{chain_os}"] subprocess.run(command) if mode == "dynamic": sql = "select bmc_ip from cluster.nodeinfo where service_tag = '" + serial_output[i] + "'" cursor.execute(sql) bmc_ip = cursor.fetchone() - command = ["chdef", node_name[0], f"ip={admin_ip[0]}", f"groups={groups_dynamic}", + command = ["/opt/xcat/bin/chdef", node_name[0], f"ip={admin_ip[0]}", f"groups={groups_dynamic}", f"chain={chain_setup},{chain_os}"] subprocess.run(command) - command = ["chdef", node_name[0], f" bmc={bmc_ip[0]}"] + command = ["/opt/xcat/bin/chdef", node_name[0], f" bmc={bmc_ip[0]}"] subprocess.run(command) cursor.close() diff --git a/discovery/roles/discovery_mechanism/mtms/tasks/bmc_static_discovery.yml b/discovery/roles/discovery_mechanism/mtms/tasks/bmc_static_discovery.yml index faffe3f18..1c354c964 100644 --- a/discovery/roles/discovery_mechanism/mtms/tasks/bmc_static_discovery.yml +++ b/discovery/roles/discovery_mechanism/mtms/tasks/bmc_static_discovery.yml @@ -16,7 +16,7 @@ - name: BMC Discover on Static ranges ansible.builtin.shell: >- set -o pipefail && \ - bmcdiscover --range {{ bmc_static_range }} -z > {{ static_ip_file }} + {{ xcat_path }}/bmcdiscover --range {{ bmc_static_range }} -z > {{ static_ip_file }} register: static_discovery changed_when: false diff --git a/discovery/roles/discovery_mechanism/mtms/tasks/fetch_static_nodes.yml b/discovery/roles/discovery_mechanism/mtms/tasks/fetch_static_nodes.yml index 7f99d9c9a..fc14aebcf 100644 --- a/discovery/roles/discovery_mechanism/mtms/tasks/fetch_static_nodes.yml +++ b/discovery/roles/discovery_mechanism/mtms/tasks/fetch_static_nodes.yml @@ -14,7 +14,7 @@ --- - name: Fetch nodes with group {{ bmc_static_node_group }} - ansible.builtin.command: lsdef {{ bmc_static_node_group }} + ansible.builtin.command: "{{ xcat_path }}/lsdef {{ bmc_static_node_group }}" changed_when: false register: check_static_nodes failed_when: false diff --git a/discovery/roles/discovery_mechanism/mtms/tasks/update_node_obj.yml b/discovery/roles/discovery_mechanism/mtms/tasks/update_node_obj.yml index 31e2b8057..2e6d5da67 100644 --- a/discovery/roles/discovery_mechanism/mtms/tasks/update_node_obj.yml +++ b/discovery/roles/discovery_mechanism/mtms/tasks/update_node_obj.yml @@ -16,7 +16,7 @@ - name: Create static temp node objects ansible.builtin.shell: > set -o pipefail && \ - cat {{ static_ip_file }} | mkdef -z + cat {{ static_ip_file }} | {{ xcat_path }}/mkdef -z failed_when: false changed_when: false when: bmc_static_status @@ -24,7 +24,7 @@ - name: Create dynamic temp node objects ansible.builtin.shell: > set -o pipefail && \ - cat {{ dynamic_ip_file }} | mkdef -z + cat {{ dynamic_ip_file }} | {{ xcat_path }}/mkdef -z failed_when: false changed_when: false when: bmc_dynamic_status @@ -32,7 +32,7 @@ - name: Create discover temp node objects ansible.builtin.shell: > set -o pipefail && \ - cat {{ discover_ip_file }} | mkdef -z + cat {{ discover_ip_file }} | {{ xcat_path }}/mkdef -z failed_when: false changed_when: false when: bmc_discover_range_status diff --git a/discovery/roles/discovery_mechanism/mtms/vars/main.yml b/discovery/roles/discovery_mechanism/mtms/vars/main.yml index dac060cf1..8c48f5ea8 100644 --- a/discovery/roles/discovery_mechanism/mtms/vars/main.yml +++ b/discovery/roles/discovery_mechanism/mtms/vars/main.yml @@ -19,6 +19,7 @@ dhcp_timeout: 600 dhcp_timeout_msg: 
"[WARNING] Waiting for 10 minutes before starting dynamic discovery" static_dynamic_fail_msg: "Failed. No BMC found for both static and dynamic IP's. Please check the ranges again." dhcp_path: "/var/lib/dhcpd/dhcpd.leases" +xcat_path: /opt/xcat/bin # Usage: bmc_discover_ranges.yml discover_ip_file: "/opt/omnia/discover.stanzas" @@ -30,7 +31,7 @@ stanza_paths: - "{{ discover_ip_file }}" - "{{ static_ip_file }}" - "{{ dynamic_ip_file }}" -file_perm: 0644 +file_perm: "0644" bmcdiscover_python: "{{ role_path }}/../mtms/files/bmc_discover_ranges.py" # Usage: update_xcat_network_discovery_range.yml diff --git a/discovery/roles/discovery_mechanism/switch_based/files/create_node_object.py b/discovery/roles/discovery_mechanism/switch_based/files/create_node_object.py index a791c990f..a14ea384c 100644 --- a/discovery/roles/discovery_mechanism/switch_based/files/create_node_object.py +++ b/discovery/roles/discovery_mechanism/switch_based/files/create_node_object.py @@ -42,7 +42,7 @@ def create_node_object(conn): cursor.execute(sql) row_output = cursor.fetchone() - command = ["chdef", row_output[0], f"groups={groups_switch_based}", "mgt=ipmi", "cons=ipmi", f"ip={row_output[1]}", f"bmc={row_output[2]}", "netboot=xnba", "installnic=mac", "primarynic=mac", f"switch={row_output[3]}", f"switchport={row_output[4]}"] + command = ["/opt/xcat/bin/chdef", row_output[0], f"groups={groups_switch_based}", "mgt=ipmi", "cons=ipmi", f"ip={row_output[1]}", f"bmc={row_output[2]}", "netboot=xnba", "installnic=mac", "primarynic=mac", f"switch={row_output[3]}", f"switchport={row_output[4]}"] subprocess.run(command) print(f"Created node object with name {row_output[0]}") diff --git a/discovery/roles/discovery_mechanism/switch_based/files/create_switch_object.py b/discovery/roles/discovery_mechanism/switch_based/files/create_switch_object.py index c03c44ca0..2b32fbca7 100644 --- a/discovery/roles/discovery_mechanism/switch_based/files/create_switch_object.py +++ b/discovery/roles/discovery_mechanism/switch_based/files/create_switch_object.py @@ -45,37 +45,38 @@ def create_switch_object(conn,switch_ip,switch_snmp_username,switch_snmp_passwor """ cursor = conn.cursor() - for ip in switch_ip: - # Check for existing entries of switch_ip - sql = f"select exists(select switch_ip from cluster.switchinfo where switch_ip='{ip}')" - cursor.execute(sql) - output_switch_ip = cursor.fetchone()[0] - - if not output_switch_ip: - # Generate switch_name - sql = '''select id from cluster.switchinfo ORDER BY id DESC LIMIT 1''' + if switch_ip: + for ip in switch_ip: + # Check for existing entries of switch_ip + sql = f"select exists(select switch_ip from cluster.switchinfo where switch_ip='{ip}')" cursor.execute(sql) - id_number = cursor.fetchone() - if id_number is None: - id_number = [0] - switch_id = int(id_number[0]) + 1 - switch_name = switch_name_prefix + str(switch_id) + output_switch_ip = cursor.fetchone()[0] - omniadb_connection.insert_switch_info(cursor,switch_name,ip) + if not output_switch_ip: + # Generate switch_name + sql = '''select id from cluster.switchinfo ORDER BY id DESC LIMIT 1''' + cursor.execute(sql) + id_number = cursor.fetchone() + if id_number is None: + id_number = [0] + switch_id = int(id_number[0]) + 1 + switch_name = switch_name_prefix + str(switch_id) - # Create switch object - command = ["chdef", switch_name, f"ip={ip}", f"groups={switch_group}"] - subprocess.run(command) + omniadb_connection.insert_switch_info(cursor,switch_name,ip) - # Update xcat switches table with switch credentials - command = ["tabch", 
f"switch={switch_name}", f"switches.snmpversion={switch_snmp_version}", f"switches.username={switch_snmp_username}", f"switches.password={switch_snmp_password}", f"switches.auth={switch_auth_type}"] - subprocess.run(command) + # Create switch object + command = ["/opt/xcat/bin/chdef", switch_name, f"ip={ip}", f"groups={switch_group}"] + subprocess.run(command) - print(f"Created node object for switch: {switch_name}") + # Update xcat switches table with switch credentials + command = ["/opt/xcat/sbin/tabch", f"switch={switch_name}", f"switches.snmpversion={switch_snmp_version}", f"switches.username={switch_snmp_username}", f"switches.password={switch_snmp_password}", f"switches.auth={switch_auth_type}"] + subprocess.run(command) + + print(f"Created node object for switch: {switch_name}") cursor.close() def main(): - + # Fetch input arguments switch_ip = sys.argv[1:-3] switch_snmp_username = sys.argv[-3] diff --git a/discovery/roles/discovery_mechanism/switch_based/tasks/check_nodes_switch_based.yml b/discovery/roles/discovery_mechanism/switch_based/tasks/check_nodes_switch_based.yml index 77468357a..5379bcc6e 100644 --- a/discovery/roles/discovery_mechanism/switch_based/tasks/check_nodes_switch_based.yml +++ b/discovery/roles/discovery_mechanism/switch_based/tasks/check_nodes_switch_based.yml @@ -18,7 +18,7 @@ switch_based_node_status: false - name: Fetch nodes with group {{ switch_based_node_group }} - ansible.builtin.command: lsdef {{ switch_based_node_group }} + ansible.builtin.command: "{{ xcat_path }}/lsdef {{ switch_based_node_group }}" changed_when: false register: check_switch_based_nodes failed_when: false diff --git a/discovery/roles/discovery_mechanism/switch_based/tasks/switch_object_creation.yml b/discovery/roles/discovery_mechanism/switch_based/tasks/switch_object_creation.yml index 255da844f..dac9263f8 100644 --- a/discovery/roles/discovery_mechanism/switch_based/tasks/switch_object_creation.yml +++ b/discovery/roles/discovery_mechanism/switch_based/tasks/switch_object_creation.yml @@ -27,7 +27,7 @@ - name: Task for adding hosts entry block: - name: Add hosts entry - ansible.builtin.command: makehosts {{ switch_group }} + ansible.builtin.command: "{{ xcat_sbin_path }}/makehosts {{ switch_group }}" changed_when: true register: create_hosts_entry_switch rescue: @@ -39,7 +39,7 @@ - name: Task for creating DNS configuration block: - name: Create DNS configuration - ansible.builtin.command: makedns -n + ansible.builtin.command: "{{ xcat_sbin_path }}/makedns -n" changed_when: true register: dns_config_switch rescue: diff --git a/discovery/roles/discovery_mechanism/switch_based/vars/main.yml b/discovery/roles/discovery_mechanism/switch_based/vars/main.yml index 770feadff..e72fc0376 100644 --- a/discovery/roles/discovery_mechanism/switch_based/vars/main.yml +++ b/discovery/roles/discovery_mechanism/switch_based/vars/main.yml @@ -31,3 +31,5 @@ create_node_object_path_switch_based: "{{ role_path }}/../switch_based/files/cre switch_based_nodes_warning_msg: "[WARNING] switch_based node objects not found. Skipping remaining provisioning tasks for switch_based discovery." 
switch_based_node_group: "switch_based" warning_wait_time: 10 +xcat_path: /opt/xcat/bin +xcat_sbin_path: /opt/xcat/sbin diff --git a/discovery/roles/discovery_validations/common/files/validate_ips_count.py b/discovery/roles/discovery_validations/common/files/validate_ips_count.py index 6f901f0c0..3ae780f2a 100644 --- a/discovery/roles/discovery_validations/common/files/validate_ips_count.py +++ b/discovery/roles/discovery_validations/common/files/validate_ips_count.py @@ -20,8 +20,9 @@ start_ip = ipaddress.ip_address(start_ip) end_ip = ipaddress.ip_address(end_ip) -ip_range = ipaddress.summarize_address_range(start_ip, end_ip) -count = 0 -for subnet in ip_range: - count += subnet.num_addresses -print(count) +if start_ip and end_ip: + ip_range = ipaddress.summarize_address_range(start_ip, end_ip) + count = 0 + for subnet in ip_range: + count += subnet.num_addresses + print(count) diff --git a/discovery/roles/discovery_validations/common/tasks/include_local_repo_config.yml b/discovery/roles/discovery_validations/common/tasks/include_local_repo_config.yml index 7e1bc0a40..8ce8c512e 100644 --- a/discovery/roles/discovery_validations/common/tasks/include_local_repo_config.yml +++ b/discovery/roles/discovery_validations/common/tasks/include_local_repo_config.yml @@ -18,17 +18,7 @@ beegfs_version: "omnia_default" amdgpu_version: "omnia_default" rocm_version: "omnia_default" - -- name: Include local_repo_config.yml vars - block: - - name: Include local_repo_config.yml vars - ansible.builtin.include_vars: "{{ local_repo_config_file }}" - register: include_local_repo_config - no_log: true - rescue: - - name: Failed to local_repo_config.yml - ansible.builtin.fail: - msg: "{{ local_repo_config_syntax_fail_msg }} Error: {{ include_local_repo_config.message }}" + intelgaudi_version: "omnia_default" - name: Load software_config.json as user_config block: @@ -73,6 +63,27 @@ os_release: "focal" when: provision_os == 'ubuntu' and provision_os_version == '20.04' +- name: Set Openldap release version for Ubuntu 22.04 + ansible.builtin.set_fact: + openldap_release: "jammy" + when: provision_os == 'ubuntu' and os_release == 'jammy' + +- name: Set Openldap release version for Ubuntu 20.04 + ansible.builtin.set_fact: + openldap_release: "bullseye" + when: provision_os == 'ubuntu' and os_release == 'focal' + +- name: Include local_repo_config.yml vars + block: + - name: Include local_repo_config.yml vars + ansible.builtin.include_vars: "{{ local_repo_config_file }}" + register: include_local_repo_config + no_log: true + rescue: + - name: Failed to include local_repo_config.yml + ansible.builtin.fail: + msg: "{{ local_repo_config_syntax_fail_msg }} Error: {{ include_local_repo_config.message }}" + - name: Generate software JSON file names ansible.builtin.set_fact: software_names: "{{ user_config.softwares | map(attribute='name') | select('defined') | list }}" diff --git a/discovery/roles/discovery_validations/common/tasks/include_provision_credentials_config.yml b/discovery/roles/discovery_validations/common/tasks/include_provision_credentials_config.yml index 2cbef2beb..736fc9b1b 100644 --- a/discovery/roles/discovery_validations/common/tasks/include_provision_credentials_config.yml +++ b/discovery/roles/discovery_validations/common/tasks/include_provision_credentials_config.yml @@ -13,12 +13,6 @@ # limitations under the License.
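As background for the guarded counting logic in validate_ips_count.py above: ipaddress.summarize_address_range yields the minimal sequence of subnets covering an inclusive start-end range, so summing num_addresses over it gives the total address count. A standalone sketch with sample endpoints:

    import ipaddress

    start_ip = ipaddress.ip_address("192.168.1.1")    # sample endpoints, not from the patch
    end_ip = ipaddress.ip_address("192.168.1.254")

    # Minimal covering subnets for the inclusive range, then total their sizes
    count = sum(subnet.num_addresses
                for subnet in ipaddress.summarize_address_range(start_ip, end_ip))
    print(count)  # 254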
--- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - tags: init - - name: Check provision_credentials_config.yml file is encrypted ansible.builtin.command: cat {{ provision_credentials_config_filename }} changed_when: false @@ -28,7 +22,7 @@ - name: Decrpyt provision_credentials_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ provision_credentials_config_filename }} + ansible-vault decrypt {{ provision_credentials_config_filename }} --vault-password-file {{ provision_credentials_vault_path }} changed_when: false when: ansible_vault_search_key in provision_credentials_config_content.stdout diff --git a/discovery/roles/discovery_validations/common/tasks/main.yml b/discovery/roles/discovery_validations/common/tasks/main.yml index 917445d90..af93746ac 100644 --- a/discovery/roles/discovery_validations/common/tasks/main.yml +++ b/discovery/roles/discovery_validations/common/tasks/main.yml @@ -26,11 +26,11 @@ ansible.builtin.include_vars: "{{ item }}" with_items: "{{ provision_validation_vars }}" -- name: Validate control plane OS - ansible.builtin.include_tasks: validate_cp_os.yml +- name: Validate Omnia Infrastructure Manager OS + ansible.builtin.include_tasks: validate_oim_os.yml -- name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml" +- name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml" - name: Include provision configuration variables ansible.builtin.include_tasks: include_provision_config.yml @@ -51,7 +51,7 @@ ansible.builtin.include_tasks: assign_network_interface.yml - name: Validate the nic parameters - ansible.builtin.include_tasks: validate_cp_nic.yml + ansible.builtin.include_tasks: validate_oim_nic.yml - name: Validate network spec input ansible.builtin.include_tasks: validate_network_spec.yml @@ -78,6 +78,9 @@ - name: Validate domain_name ansible.builtin.include_tasks: validate_domain_name.yml +- name: Validate site_config.yml + ansible.builtin.include_tasks: validate_site_config.yml + - name: Validate OFED and CUDA repo ansible.builtin.include_tasks: validate_ofed_cuda_repo.yml @@ -87,13 +90,16 @@ - name: Validate Broadcom repo ansible.builtin.include_tasks: validate_broadcom_repo.yml +- name: Validate Intel Gaudi repo + ansible.builtin.include_tasks: validate_intelgaudi_repo.yml + # Encrypt provision_config_credentials.yml after all the validations are successful - name: Encrypt config file once validations are successful block: - name: Encrypt provision_config_credentials.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ provision_credentials_config_filename }} + ansible-vault encrypt {{ provision_credentials_config_filename }} --vault-password-file {{ provision_credentials_vault_path }} changed_when: false tags: init diff --git a/discovery/roles/discovery_validations/common/tasks/package_installation.yml b/discovery/roles/discovery_validations/common/tasks/package_installation.yml index c8cde48be..2a5a91fc9 100644 --- a/discovery/roles/discovery_validations/common/tasks/package_installation.yml +++ b/discovery/roles/discovery_validations/common/tasks/package_installation.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,9 +24,9 @@ ansible.builtin.fail: msg: "{{ provision_package_fail_msg }}" -- name: Install python snmp - ansible.builtin.command: "{{ python_version }} -m pip install {{ snmp_python_package }}" - changed_when: true +# - name: Install python snmp +# ansible.builtin.command: "{{ python_version }} -m pip install {{ snmp_python_package }}" +# changed_when: true - name: Install python postgres ansible.builtin.command: "{{ python_version }} -m pip install {{ postgres_python_package }}" @@ -49,16 +49,12 @@ changed_when: true - name: Install netaddr and pexpect - ansible.builtin.command: "{{ pip_version }} install {{ item }}" + ansible.builtin.command: "{{ python_version }} -m pip install {{ item }}" changed_when: true with_items: - "{{ netaddr_pip_package }}" - "{{ pexpect_pip_package }}" -- name: Install ansible galaxy collection ansible.utils - ansible.builtin.command: ansible-galaxy collection install "{{ item }}" +- name: Install python commented config parser + ansible.builtin.command: "{{ python_version }} -m pip install {{ commentedconfigparser_python_package }}" changed_when: true - register: ansible_collection_install - until: ansible_collection_install is not failed - retries: "{{ max_retries }}" - with_items: "{{ ansible_galaxy_collection }}" diff --git a/discovery/roles/discovery_validations/common/tasks/upgrade_validations.yml b/discovery/roles/discovery_validations/common/tasks/upgrade_validations.yml index 5adb8d88a..62b9b6b84 100644 --- a/discovery/roles/discovery_validations/common/tasks/upgrade_validations.yml +++ b/discovery/roles/discovery_validations/common/tasks/upgrade_validations.yml @@ -26,11 +26,11 @@ ansible.builtin.include_vars: "{{ item }}" with_items: "{{ provision_validation_vars }}" -- name: Validate control plane OS - ansible.builtin.include_tasks: validate_cp_os.yml +- name: Validate Omnia Infrastructure Manager OS + ansible.builtin.include_tasks: validate_oim_os.yml -- name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml" +- name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml" - name: Include provision configuration variables ansible.builtin.include_tasks: include_provision_config.yml @@ -51,7 +51,7 @@ ansible.builtin.include_tasks: assign_network_interface.yml - name: Validate the nic parameters - ansible.builtin.include_tasks: validate_cp_nic.yml + ansible.builtin.include_tasks: validate_oim_nic.yml - name: Validate network spec input ansible.builtin.include_tasks: validate_network_spec.yml @@ -93,7 +93,7 @@ block: - name: Encrypt provision_config_credentials.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ provision_credentials_config_filename }} + ansible-vault encrypt {{ provision_credentials_config_filename }} --vault-password-file {{ provision_credentials_vault_path }} changed_when: false tags: init diff --git a/discovery/roles/discovery_validations/common/tasks/validate_admin_nic.yml b/discovery/roles/discovery_validations/common/tasks/validate_admin_nic.yml index 66aac77c4..a1f20e624 100644 --- a/discovery/roles/discovery_validations/common/tasks/validate_admin_nic.yml +++ b/discovery/roles/discovery_validations/common/tasks/validate_admin_nic.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,15 +30,13 @@ - network_data.admin_network.netmask_bits | int <= 32 fail_msg: "{{ fail_msg_admin_netmask_bits }}" -- name: Set pxe nic static start and end ranges +- name: Set pxe nic static, dynamic ranges and forwarders ansible.builtin.set_fact: pxe_nic_start_range: "{{ network_data.admin_network.static_range | split('-') | first }}" pxe_nic_end_range: "{{ network_data.admin_network.static_range | split('-') | last }}" - -- name: Set pxe nic dynamic start and end ranges - ansible.builtin.set_fact: pxe_nic_dynamic_start_range: "{{ network_data.admin_network.dynamic_range | split('-') | first }}" pxe_nic_dynamic_end_range: "{{ network_data.admin_network.dynamic_range | split('-') | last }}" + pxe_nic_forwarders: "{{ network_data.admin_network.DNS | default('', true) }}" - name: Validate if admin static ranges are valid ansible.builtin.assert: diff --git a/discovery/roles/discovery_validations/common/tasks/validate_domain_name.yml b/discovery/roles/discovery_validations/common/tasks/validate_domain_name.yml index cf91ac12e..0998db004 100644 --- a/discovery/roles/discovery_validations/common/tasks/validate_domain_name.yml +++ b/discovery/roles/discovery_validations/common/tasks/validate_domain_name.yml @@ -68,21 +68,26 @@ success_msg: "{{ server_domain_name_success_msg }}" fail_msg: "{{ server_domain_name_fail_msg }}" -- name: Fetch hostname of the server +- name: Read hostname of Omnia Infrastructure Manager ansible.builtin.command: hostname changed_when: false - register: machine_hostname + register: hostname_result -- name: Remove hosts file entry of control plane +- name: Set fact for the Omnia Infrastructure Management (OIM) node hostname and domain name + ansible.builtin.set_fact: + oim_hostname: "{{ hostname_result.stdout }}" + oim_domain_name: "{{ domain_name }}" + +- name: Remove hosts file entry of Omnia Infrastructure Manager ansible.builtin.lineinfile: path: "{{ hosts_file_path }}" - regexp: "^(.*){{ machine_hostname.stdout }}" + regexp: "^(.*){{ oim_hostname }}" state: absent - name: Add hosts file entry ansible.builtin.lineinfile: path: "{{ hosts_file_path }}" - line: "{{ pxe_nic_ip }} {{ machine_hostname.stdout }}" + line: "{{ pxe_nic_ip }} {{ oim_hostname }}" state: present create: true mode: "{{ hosts_file_mode }}" diff --git a/discovery/roles/discovery_validations/common/tasks/validate_intelgaudi_repo.yml b/discovery/roles/discovery_validations/common/tasks/validate_intelgaudi_repo.yml new file mode 100644 index 000000000..937ef51a7 --- /dev/null +++ b/discovery/roles/discovery_validations/common/tasks/validate_intelgaudi_repo.yml @@ -0,0 +1,60 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
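The paired lineinfile tasks in validate_domain_name.yml above amount to a remove-then-append update of the hosts file, which keeps the OIM entry unique across reruns. Roughly equivalent Python, with sample values standing in for pxe_nic_ip and the OIM hostname:

    import re

    hosts_path = "/etc/hosts"
    oim_hostname, pxe_nic_ip = "oim.example.com", "10.5.255.254"   # sample values

    with open(hosts_path, encoding="utf-8") as f:
        # Drop any existing line mentioning the OIM hostname...
        lines = [line for line in f if not re.search(re.escape(oim_hostname), line)]
    # ...then append the authoritative entry exactly once.
    lines.append(f"{pxe_nic_ip} {oim_hostname}\n")
    with open(hosts_path, "w", encoding="utf-8") as f:
        f.writelines(lines)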
+--- + +- name: Set default intelgaudi status + ansible.builtin.set_fact: + intelgaudi_config_status: false + intelgaudi_input_status: false + +- name: Get intelgaudi status + ansible.builtin.set_fact: + intelgaudi_input_status: true + loop: "{{ user_config.softwares | default([]) }}" + when: + - "'intelgaudi' in item.name" + loop_control: + loop_var: item + +- name: Set intelgaudi_config_status + when: intelgaudi_input_status + block: + - name: Fetch intelgaudi_version + ansible.builtin.set_fact: + intelgaudi_version: "{{ user_config.softwares | selectattr('name', 'equalto', 'intelgaudi') | map(attribute='version') | first }}" + + - name: Set intelgaudi_directory + ansible.builtin.set_fact: + intelgaudi_directory: "{{ offline_intelgaudi_directory }}/intelgaudi/{{ intelgaudi_version }}/" + + - name: Check intelgaudi_directory exists or not + ansible.builtin.stat: + path: "{{ intelgaudi_directory }}" + register: check_intelgaudi_dir + + - name: Warning - Please wait, This task will take a few seconds + ansible.builtin.pause: + seconds: "{{ warning_wait_time }}" + prompt: "{{ intelgaudi_repo_warning_msg }}" + when: not check_intelgaudi_dir.stat.exists + + - name: Set intelgaudi_config_status to true + ansible.builtin.set_fact: + intelgaudi_config_status: true + when: check_intelgaudi_dir.stat.exists + rescue: + - name: Warning - Please wait, This task will take a few seconds + ansible.builtin.pause: + seconds: "{{ warning_wait_time }}" + prompt: "{{ intelgaudi_version_warning_msg }}" diff --git a/discovery/roles/discovery_validations/common/tasks/validate_cp_nic.yml b/discovery/roles/discovery_validations/common/tasks/validate_oim_nic.yml similarity index 100% rename from discovery/roles/discovery_validations/common/tasks/validate_cp_nic.yml rename to discovery/roles/discovery_validations/common/tasks/validate_oim_nic.yml diff --git a/discovery/roles/discovery_validations/common/tasks/validate_cp_os.yml b/discovery/roles/discovery_validations/common/tasks/validate_oim_os.yml similarity index 68% rename from discovery/roles/discovery_validations/common/tasks/validate_cp_os.yml rename to discovery/roles/discovery_validations/common/tasks/validate_oim_os.yml index 1fcf06088..244a66665 100644 --- a/discovery/roles/discovery_validations/common/tasks/validate_cp_os.yml +++ b/discovery/roles/discovery_validations/common/tasks/validate_oim_os.yml @@ -13,21 +13,21 @@ # limitations under the License.
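The intelgaudi_version lookup above chains selectattr/map/first over user_config.softwares; the same selection in plain Python (with a made-up software_config entry for illustration) looks like:

    user_config = {"softwares": [{"name": "intelgaudi", "version": "1.17"}]}   # sample data

    # Mirror of: softwares | selectattr('name', 'equalto', 'intelgaudi')
    #            | map(attribute='version') | first
    intelgaudi_version = next(
        (s.get("version") for s in user_config["softwares"] if s.get("name") == "intelgaudi"),
        None,
    )
    print(intelgaudi_version)  # 1.17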
--- -- name: Set control_plane_os +- name: Set oim_os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" -- name: Validate control_plane_os +- name: Validate oim_os ansible.builtin.fail: - msg: "{{ control_plane_os_fail_msg }}" + msg: "{{ oim_os_fail_msg }}" when: - - control_plane_os not in control_plane_os_redhat - - control_plane_os not in control_plane_os_rocky - - control_plane_os not in control_plane_os_ubuntu + - oim_os not in oim_os_redhat + - oim_os not in oim_os_rocky + - oim_os not in oim_os_ubuntu - name: Validate user ansible.builtin.fail: msg: "{{ user_fail_msg }}" when: - - control_plane_os in control_plane_os_ubuntu + - oim_os in oim_os_ubuntu - ansible_env.USER != root_user_name diff --git a/discovery/roles/discovery_validations/common/tasks/validate_provision_vars.yml b/discovery/roles/discovery_validations/common/tasks/validate_provision_vars.yml index 02a4096d1..7ad283771 100644 --- a/discovery/roles/discovery_validations/common/tasks/validate_provision_vars.yml +++ b/discovery/roles/discovery_validations/common/tasks/validate_provision_vars.yml @@ -54,6 +54,13 @@ success_msg: "{{ default_lease_time_success_msg }}" fail_msg: "{{ default_lease_time_fail_msg }}" +- name: Assert that disk_partition does not have duplicate entries + ansible.builtin.assert: + that: + - disk_partition | map(attribute='mount_point')| list | unique | length == disk_partition | map(attribute='mount_point')| list | length + success_msg: "{{ disk_partition_success_msg }}" + fail_msg: "{{ disk_partition_fail_msg }}" + - name: Convert timezone.txt to linux format ansible.builtin.command: dos2unix {{ role_path }}/files/timezone.txt failed_when: false @@ -79,32 +86,32 @@ msg: "{{ language_fail_msg }}" when: '"en-US" not in language' -- name: Assert provision_os for RHEL control plane +- name: Assert provision_os for RHEL Omnia Infrastructure Manager ansible.builtin.assert: that: - provision_os | lower == os_supported_rhel success_msg: "{{ provision_os_success_msg }}" fail_msg: "{{ provision_os_fail_msg }}" when: - - control_plane_os in control_plane_os_redhat + - oim_os in oim_os_redhat -- name: Assert provision_os for Rocky control plane +- name: Assert provision_os for Rocky Omnia Infrastructure Manager ansible.builtin.assert: that: - provision_os | lower == os_supported_rocky success_msg: "{{ provision_os_success_msg }}" fail_msg: "{{ provision_os_fail_msg }}" when: - - control_plane_os in control_plane_os_rocky + - oim_os in oim_os_rocky -- name: Assert provision_os for Ubuntu control plane +- name: Assert provision_os for Ubuntu Omnia Infrastructure Manager ansible.builtin.assert: that: - provision_os | lower == os_supported_ubuntu success_msg: "{{ provision_os_success_msg }}" fail_msg: "{{ provision_os_fail_msg }}" when: - - control_plane_os in control_plane_os_ubuntu + - oim_os in oim_os_ubuntu - name: Set supported_os_version_status to true - rhel/rocky ansible.builtin.set_fact: @@ -157,3 +164,9 @@ - provision_os | lower == os_supported_ubuntu - provision_os_version | string == ubuntu22_version - ubuntu_kernel_flavor is defined + +- name: Assert ntp_support + ansible.builtin.assert: + that: + - ntp_support == true or ntp_support == false + fail_msg: "{{ ntp_support_fail_msg }}" diff --git a/discovery/roles/discovery_validations/common/tasks/validate_site_config.yml b/discovery/roles/discovery_validations/common/tasks/validate_site_config.yml new file mode 100644 index 000000000..acb14f83f --- /dev/null +++ 
b/discovery/roles/discovery_validations/common/tasks/validate_site_config.yml @@ -0,0 +1,105 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Initialize variables + ansible.builtin.set_fact: + http_proxy_input_status: false + https_proxy_input_status: false + no_proxy_input_status: false + proxy_status: false + +- name: Include site_config.yml + ansible.builtin.include_vars: "{{ site_config_file }}" + +- name: Validate http_proxy variable provided + ansible.builtin.set_fact: + http_proxy_input_status: true + when: + - proxy[0].http_proxy is defined + - proxy[0].http_proxy | default("", true) | length > 1 + +- name: Validate https_proxy variable provided + ansible.builtin.set_fact: + https_proxy_input_status: true + when: + - proxy[0].https_proxy is defined + - proxy[0].https_proxy | default("", true) | length > 1 + +- name: Validate no_proxy variable provided + ansible.builtin.set_fact: + no_proxy_input_status: true + when: + - proxy[0].no_proxy is defined + - proxy[0].no_proxy | default("", true) | length > 1 + +- name: Validate both http_proxy and https_proxy input provided + ansible.builtin.fail: + msg: "{{ invalid_proxy_failure_msg }}" + when: + - not https_proxy_input_status and http_proxy_input_status or + not http_proxy_input_status and https_proxy_input_status + +- name: Validate proxy + when: + - http_proxy_input_status + - https_proxy_input_status + block: + - name: Validate http_proxy, https_proxy and no_proxy configured as environment variables + ansible.builtin.assert: + that: + - lookup('env', 'http_proxy') | length > 1 + - lookup('env', 'https_proxy') | length > 1 + - lookup('env', 'no_proxy') | length > 1 + - lookup('env', 'http_proxy') == proxy[0].http_proxy + - lookup('env', 'https_proxy') == proxy[0].https_proxy + - oim_hostname in lookup('env', 'no_proxy') + - admin_nic_ip in lookup('env', 'no_proxy') + fail_msg: "{{ proxy_env_fail_msg }}" + + - name: Try updating repos in Ubuntu + when: oim_os in oim_os_ubuntu + block: + - name: Update repos in Ubuntu + ansible.builtin.apt: + update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" + rescue: + - name: Failed to update repos + ansible.builtin.fail: + msg: "{{ update_repos_fail_msg }}" + + - name: Try updating repos in RHEL/Rocky + when: + - oim_os in oim_os_redhat or + oim_os in oim_os_rocky + block: + - name: Update repos in RHEL/Rocky + ansible.builtin.dnf: + update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" + rescue: + - name: Failed to update repos + ansible.builtin.fail: + msg: "{{ update_repos_fail_msg }}" + + - name: Set proxy_status to true + ansible.builtin.set_fact: + proxy_status: true diff --git a/discovery/roles/discovery_validations/common/vars/main.yml b/discovery/roles/discovery_validations/common/vars/main.yml index b6644a20b..13d5091f5 100644 --- 
a/discovery/roles/discovery_validations/common/vars/main.yml +++ b/discovery/roles/discovery_validations/common/vars/main.yml @@ -20,29 +20,25 @@ provision_validation_vars: - "{{ role_path }}/../switch_based/vars/main.yml" nic_wait_time: 30 -# Usage: validate_cp_os.yml -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" -control_plane_os_ubuntu: "ubuntu" -control_plane_os_fail_msg: "Failed. Control plane OS should be RHEL, Rocky or Ubuntu." +# Usage: validate_oim_os.yml +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" +oim_os_ubuntu: "ubuntu" +oim_os_fail_msg: "Failed. Omnia Infrastructure Manager OS should be RHEL, Rocky or Ubuntu." root_user_name: "root" user_fail_msg: "Failed. Omnia playbooks should run as root user." -# Usage: package_installation.yml -ansible_galaxy_collection: - - ansible.utils:2.5.2 - - community.general:4.8.7 -python_version: python3.9 -snmp_python_package: easysnmp +python_version: "{{ ansible_python_interpreter }}" +# snmp_python_package: easysnmp postgres_python_package: psycopg2-binary requests_python_package: requests pyarrow_python_package: pyarrow pandas_python_package: pandas passlib_python_package: passlib -pip_version: pip3.9 netaddr_pip_package: netaddr pexpect_pip_package: pexpect max_retries: 10 +commentedconfigparser_python_package: "commented-configparser" # Usage: create_provision_ini.yml provision_conf_path: "/opt/omnia/.data/provision.ini" @@ -79,9 +75,9 @@ software_config_syntax_fail_msg: "Failed. Syntax errors present in software_conf # Usage: validate_local_repo.yml metadata_file_path: "/opt/omnia/offline/.data/metadata.yml" -local_repo_fail_msg: "Failed! Please run local_repo.yml before running discovery_provision.yml/prepare_cp.yml" +local_repo_fail_msg: "Failed! Please run local_repo.yml before running discovery_provision.yml/prepare_oim.yml" softwares_warning_msg: "[WARNING] software_config.json does not have any softwares. Hence softwares will not be installed on the nodes post provisioning." -repo_store_path_fail_msg: "Failed. {{ repo_store_path }} didn't exist. Please run local_repo.yml before running discovery_provision.yml/prepare_cp.yml" +repo_store_path_fail_msg: "Failed. {{ repo_store_path }} does not exist. Please run local_repo.yml before running discovery_provision.yml/prepare_oim.yml" repo_config_metadata_fail_msg: "Failed: Cannot change repo_config in subsequent runs. Please use the repo_config:{{ md_repo_config }} in software_config.json" # Usage: assign_network_interface.yml @@ -158,9 +154,9 @@ os_supported_rhel: rhel os_supported_ubuntu: ubuntu provision_os_success_msg: "cluster_os_type validated" provision_os_fail_msg: "Failed. Incorrect cluster_os_type selected. -If control plane OS RHEL, only cluster_os_type {{ os_supported_rhel }} is supported. -If control plane OS Rocky, only cluster_os_type {{ os_supported_rocky }} is supported. -If control plane OS Ubuntu, only cluster_os_type {{ os_supported_ubuntu }} is supported" +If Omnia Infrastructure Manager OS is RHEL, only cluster_os_type {{ os_supported_rhel }} is supported. +If Omnia Infrastructure Manager OS is Rocky, only cluster_os_type {{ os_supported_rocky }} is supported. +If Omnia Infrastructure Manager OS is Ubuntu, only cluster_os_type {{ os_supported_ubuntu }} is supported" iso_file_path_missing_msg: "Incorrect iso_file_path provided. Make sure ISO file is present in the provided iso_file_path." iso_file_path_success_msg: "iso_file_path validated" iso_file_path_fail_msg: "Failed. Invalid iso_file_path: {{ iso_file_path }} provided.
Make sure iso_file_path variable in provision_config.yml contains value @@ -184,6 +180,9 @@ nodename_chars_fail_msg: "Failed. node_name empty or invalid in provision_config node_name should not contain _ or . or space or node- as it might result in issues with provisioning/authentication tools like FreeIPA." ubuntu_kernel_fail_msg: "Failed. ubuntu_kernel_flavor should be either hwe or generic" ubuntu22_version: "22.04" +ntp_support_fail_msg: "Failed. ntp_support in provision_config.yml should be either true or false" +disk_partition_success_msg: "disk_partition successfully validated" +disk_partition_fail_msg: "Failed. Duplicate disk_partition values present in provision_config.yml." # Usage: validate_domain_name.yml domain_name_success_msg: "domain_name successfully validated" @@ -222,6 +221,12 @@ Hence ROCm will not be installed on the nodes post provisioning." rocm_repo_warning_msg: "[WARNING] local_repo.yml is not executed for downloading ROCM packages. ROCm will not be installed on the nodes post provisioning." +# Usage: validate_intelgaudi_repo.yml +intelgaudi_version_warning_msg: "[WARNING] software_config.json does not have the version for 'intelgaudi'. +Hence Habana stack will not be installed on the nodes post provisioning." +intelgaudi_repo_warning_msg: "[WARNING] local_repo.yml is not executed for downloading 'intelgaudi' packages. +Habana stack will not be installed on the nodes post provisioning." + # Usage: validate_broadcom_repo.yml roce_version_warning_msg: "[WARNING] software_config.json does not have the version for bcm_roce. Hence RoCE drivers will not be installed on the nodes post provisioning." @@ -247,3 +252,13 @@ validate_cidr: "{{ role_path }}/files/validate_cidr.py" range_ip_check_fail_msg: "Failed. input ip range should be valid IP address (Eg. 192.168.1.1-198.168.1.254)." fail_static_ip_range: "Failed, Network static overlaps with" fail_cidr_ip_range: "Failed, Cidr overlaps with" + +# Usage: validate_site_config.yml +site_config_file: "{{ role_path }}/../../../../input/site_config.yml" +invalid_proxy_failure_msg: "Failed. Both http_proxy and https_proxy should be set for proxy variable provided in site_config.yml" +proxy_env_fail_msg: "Failed. The values for http_proxy {{ proxy[0].http_proxy }} and https_proxy {{ proxy[0].https_proxy }} in the +proxy variable of the site_config.yml should be set as environment variables http_proxy and https_proxy in the Omnia Infrastructure Manager. +The no_proxy environment variable should include the Omnia Infrastructure Manager hostname and the admin network IP address." +update_repos_fail_msg: "Failed to update repos. Verify proxy configuration in Omnia Infrastructure Manager for accessing internet."
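The proxy checks behind these messages reduce to: http_proxy/https_proxy in the environment must equal the site_config.yml values, and no_proxy must contain both the OIM hostname and the admin NIC IP. A standalone sketch with sample values:

    import os

    # Sample stand-ins for proxy[0], oim_hostname and admin_nic_ip
    configured_http = "http://proxy.example.com:3128"
    configured_https = "http://proxy.example.com:3128"
    oim_hostname, admin_nic_ip = "oim.example.com", "10.5.255.254"

    no_proxy = os.environ.get("no_proxy", "")
    if not (os.environ.get("http_proxy") == configured_http
            and os.environ.get("https_proxy") == configured_https
            and oim_hostname in no_proxy
            and admin_nic_ip in no_proxy):
        raise SystemExit("proxy environment does not match site_config.yml")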
+repo_retries: 5 +repo_delay: 10 diff --git a/discovery/roles/discovery_validations/common/vars/ubuntu.yml b/discovery/roles/discovery_validations/common/vars/ubuntu.yml index 7ffe02298..9df88b694 100644 --- a/discovery/roles/discovery_validations/common/vars/ubuntu.yml +++ b/discovery/roles/discovery_validations/common/vars/ubuntu.yml @@ -29,3 +29,6 @@ cuda_search_pattern: "*.deb" # Usage: validate_amdgpu_rocm_repo.yml offline_rocm_directory: "{{ repo_store_path }}/cluster/apt" + +# Usage: validate_intelgaudi_repo.yml +offline_intelgaudi_directory: "{{ repo_store_path }}/cluster/apt" diff --git a/discovery/roles/discovery_validations/mapping/files/validate_mapping_file.py b/discovery/roles/discovery_validations/mapping/files/validate_mapping_file.py index ae4729bc0..c87f0bf73 100644 --- a/discovery/roles/discovery_validations/mapping/files/validate_mapping_file.py +++ b/discovery/roles/discovery_validations/mapping/files/validate_mapping_file.py @@ -14,10 +14,11 @@ import re import sys +import os import ipaddress import pandas as pd -mapping_file_path = sys.argv[1] +mapping_file_path = os.path.abspath(sys.argv[1]) admin_static_start_ip = sys.argv[2] admin_static_end_ip = sys.argv[3] mandatory_col = ["SERVICE_TAG", "ADMIN_MAC", "HOSTNAME", "ADMIN_IP", "BMC_IP"] @@ -128,4 +129,4 @@ def read_mapping_csv(): sys.exit(str(err)) -read_mapping_csv() +read_mapping_csv() \ No newline at end of file diff --git a/discovery/roles/discovery_validations/mapping/tasks/validate_mapping_file.yml b/discovery/roles/discovery_validations/mapping/tasks/validate_mapping_file.yml index afaac0412..cce79b973 100644 --- a/discovery/roles/discovery_validations/mapping/tasks/validate_mapping_file.yml +++ b/discovery/roles/discovery_validations/mapping/tasks/validate_mapping_file.yml @@ -18,10 +18,10 @@ path: "{{ temp_pxe_file_path }}" state: absent -- name: Remove spaces from the mapping file - ansible.builtin.shell: > +- name: Remove leading/trailing spaces and tabs from the mapping file (but preserve column structure) + ansible.builtin.shell: | set -o pipefail && \ - tr -d ' ' < "{{ pxe_mapping_file_path }}" > "{{ temp_pxe_file_path }}" + sed 's/^[[:space:]]*//g' "{{ pxe_mapping_file_path }}" | sed 's/[[:space:]]*$//g' > "{{ temp_pxe_file_path }}" changed_when: false failed_when: false @@ -40,10 +40,10 @@ register: val_mapping_msg rescue: - - name: Rescue block - ansible.builtin.fail: - msg: "{{ val_mapping_msg.stderr }}" - when: val_mapping_msg.stderr is defined + - name: Rescue block + ansible.builtin.fail: + msg: "{{ val_mapping_msg.stderr }}" + when: val_mapping_msg.stderr is defined - name: Read host mapping file from CSV file and return a dictionary community.general.read_csv: @@ -78,4 +78,4 @@ ansible.builtin.assert: that: item is regex(("^(([a-z]|[a-z][a-z0-9\-]*[a-z0-9])\.)*([a-z]|[a-z][a-z0-9\-]*[a-z0-9])$")) fail_msg: "{{ capital_hostname_fail_msg }}" - with_items: "{{ list_of_hostnames }}" \ No newline at end of file + with_items: "{{ list_of_hostnames }}" diff --git a/discovery/roles/discovery_validations/mapping/vars/main.yml b/discovery/roles/discovery_validations/mapping/vars/main.yml index 640c0f520..e28020c60 100644 --- a/discovery/roles/discovery_validations/mapping/vars/main.yml +++ b/discovery/roles/discovery_validations/mapping/vars/main.yml @@ -14,15 +14,16 @@ --- # Usage: validate_mapping_file.yml -python_version: "python3.9" +python_version: "{{ ansible_python_interpreter }}" validate_mapping_py: "{{ role_path }}/../mapping/files/validate_mapping_file.py" pxe_path_fail_msg: "Failed. 
Please provide a valid pxe_mapping_file_path in provision_config.yml" mapping_file_key: "ADMIN_MAC" mapping_file_header_fail_msg: "Failed. Header of csv file is not in correct format. It should be of the format: MAC,Hostname,IP" host_mapping_header_format: "MAC,Hostname,IP" mapping_file_seperation_fail_msg: "Failed. Mapping file should be comma separated and all fields must be filled." -hostname_chars_fail_msg: "Failed. Hostname should not contain _ or . or space or node- as it might result in issues with provisioning/authentication tools like FreeIPA. -Make sure the mapping file contains only the hostname, and not the domain_name. Found in: " +hostname_chars_fail_msg: | + Failed. Hostname should not contain _ or . or space or node- as it might result in issues with provisioning/authentication + tools like FreeIPA. Make sure the mapping file contains only the hostname, and not the domain_name. Found in: duplicate_ip_fail_msg: "Failed. Duplicate IP exists. Please verify mapping file again." duplicate_hostname_fail_msg: "Failed. Duplicate hostname exists. Please verify mapping file again." temp_mapping_file_path: "/opt/omnia/pxe_mapping_file.csv" diff --git a/discovery/roles/discovery_validations/mtms/tasks/main.yml b/discovery/roles/discovery_validations/mtms/tasks/main.yml index c9c765277..712311d74 100644 --- a/discovery/roles/discovery_validations/mtms/tasks/main.yml +++ b/discovery/roles/discovery_validations/mtms/tasks/main.yml @@ -23,4 +23,4 @@ ansible.builtin.include_tasks: prerequisites.yml - name: Validate the BMC input ranges - ansible.builtin.include_tasks: validate_bmc_ranges.yml \ No newline at end of file + ansible.builtin.include_tasks: validate_bmc_ranges.yml diff --git a/discovery/roles/discovery_validations/mtms/tasks/prerequisites.yml b/discovery/roles/discovery_validations/mtms/tasks/prerequisites.yml index c1262760e..597351d1a 100644 --- a/discovery/roles/discovery_validations/mtms/tasks/prerequisites.yml +++ b/discovery/roles/discovery_validations/mtms/tasks/prerequisites.yml @@ -14,5 +14,5 @@ --- - name: Install iprange package - ansible.builtin.command: "{{ pip_version }} install {{ iprange_package }}" + ansible.builtin.command: "{{ python_version }} -m pip install {{ iprange_package }}" changed_when: true diff --git a/discovery/roles/discovery_validations/mtms/tasks/validate_bmc_ranges.yml b/discovery/roles/discovery_validations/mtms/tasks/validate_bmc_ranges.yml index 47556a25d..b852d05d4 100644 --- a/discovery/roles/discovery_validations/mtms/tasks/validate_bmc_ranges.yml +++ b/discovery/roles/discovery_validations/mtms/tasks/validate_bmc_ranges.yml @@ -20,7 +20,8 @@ - name: Validate input ranges for all ranges ansible.builtin.command: >- - {{ python_version }} {{ validation_range_file }} {{ bmc_static_start_range }} {{ bmc_static_end_range }} {{ bmc_dynamic_start_range }} {{ bmc_dynamic_end_range }} + {{ python_version }} {{ validation_range_file }} + {{ bmc_static_start_range }} {{ bmc_static_end_range }} {{ bmc_dynamic_start_range }} {{ bmc_dynamic_end_range }} register: range_validation_status changed_when: true @@ -49,4 +50,4 @@ - name: Set value if bmc discover range is empty ansible.builtin.set_fact: discover_ranges: "{{ network_data.bmc_network.discover_ranges }}" - when: bmc_discover_range_status \ No newline at end of file + when: bmc_discover_range_status diff --git a/discovery/roles/discovery_validations/mtms/vars/main.yml b/discovery/roles/discovery_validations/mtms/vars/main.yml index 0b56dcd96..302359b0b 100644 --- 
a/discovery/roles/discovery_validations/mtms/vars/main.yml +++ b/discovery/roles/discovery_validations/mtms/vars/main.yml @@ -15,7 +15,7 @@ # Usage: prerequisites.yml iprange_package: iprange-python -pip_version: pip3.9 +python_version: "{{ ansible_python_interpreter }}" # Usage: validate_bmc_params.yml fail_msg_bmc_credentials: "Failed. bmc_username, bmc_password are invalid in input/provision_config_credentials.yml" @@ -32,6 +32,6 @@ bmc_static_ranges_success_msg: "Successfully validated input ranges provided in bmc_static_range_mismatch_msg: "Range is invalid. Start range should be less than end range for bmc_static_start_range, bmc_static_end_range variables in network_spec.yml" range_mismatch_key: "lower bound IP greater than upper bound!" -python_version: python3.9 +# python_version: "{{ ansible_python_interpreter }}" warning_wait_time_bmc: 10 bmc_ranges_overlap_msg: "Warning - BMC dynamic and discover ranges should not overlap." diff --git a/discovery/roles/discovery_validations/switch_based/files/switch_v3_ping.py b/discovery/roles/discovery_validations/switch_based/files/switch_v3_ping.py index 4bebbbf7d..5bf05a844 100644 --- a/discovery/roles/discovery_validations/switch_based/files/switch_v3_ping.py +++ b/discovery/roles/discovery_validations/switch_based/files/switch_v3_ping.py @@ -15,9 +15,19 @@ import platform import subprocess import sys +import ipaddress host = sys.argv[1] +def validate_ip(host): + """ + Returns True if host is a valid IP address. + """ + try: + ipaddress.ip_address(host) + return True + except ValueError: + return False def ping(): """ @@ -25,13 +35,16 @@ def ping(): """ # Option for the number of packets as a function of param = '-n' if platform.system().lower() == 'windows' else '-c' - + # Validate the IP address + if not validate_ip(host): + sys.exit(f"'{host}' is not a valid IP address.") # Building the command. 
Ex: "ping -c 1 google.com" command = ['ping', param, '1', host] return subprocess.call(command) == 0 + ping_op = ping() if not ping_op: print(host) diff --git a/discovery/roles/discovery_validations/switch_based/tasks/main.yml b/discovery/roles/discovery_validations/switch_based/tasks/main.yml index 67ecd6c17..b36ebe444 100644 --- a/discovery/roles/discovery_validations/switch_based/tasks/main.yml +++ b/discovery/roles/discovery_validations/switch_based/tasks/main.yml @@ -24,6 +24,6 @@ - name: Validate switch snmp v3 details ansible.builtin.include_tasks: validate_switch_snmp_params.yml - + - name: Validate bmc details status ansible.builtin.include_tasks: validate_bmc_details_status.yml diff --git a/discovery/roles/discovery_validations/switch_based/tasks/validate_switch_based_details.yml b/discovery/roles/discovery_validations/switch_based/tasks/validate_switch_based_details.yml index 991eda2d8..a4ed71090 100644 --- a/discovery/roles/discovery_validations/switch_based/tasks/validate_switch_based_details.yml +++ b/discovery/roles/discovery_validations/switch_based/tasks/validate_switch_based_details.yml @@ -32,10 +32,10 @@ - name: Create list of switch IPs ansible.builtin.set_fact: - switch_based_ip_list: "{{ switch_based_ip_list | default([]) }} + [ '{{ item.ip }}']" + switch_based_ip_list: "{{ switch_based_ip_list | default([]) + [item.ip] }}" with_items: "{{ switch_based_details }}" -- name: Create list of switch IPs +- name: Create list of switch IPs - Ensure unique ansible.builtin.set_fact: switch_based_ip_list: "{{ switch_based_ip_list | unique }}" diff --git a/discovery/roles/metadata_update/tasks/update_metadata.yml b/discovery/roles/metadata_update/tasks/update_metadata.yml index 6b28406fc..828c6715c 100644 --- a/discovery/roles/metadata_update/tasks/update_metadata.yml +++ b/discovery/roles/metadata_update/tasks/update_metadata.yml @@ -159,3 +159,11 @@ insertafter: "EOF" state: present line: 'md_discovery_mech_switch_based: {{ discovery_mech_switch_based }}' + +- name: Update installed_version + ansible.builtin.lineinfile: + path: "{{ meta_path }}" + regexp: '^installed_version:(.*)$' + insertafter: "EOF" + state: present + line: 'installed_version: "1.7"' diff --git a/discovery/roles/monitor_thread/files/parse_syslog.py b/discovery/roles/monitor_thread/files/parse_syslog.py index 8afe56000..b5fd2285e 100644 --- a/discovery/roles/monitor_thread/files/parse_syslog.py +++ b/discovery/roles/monitor_thread/files/parse_syslog.py @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import configparser +import commentedconfigparser import os import syslog from psycopg2.extensions import cursor @@ -28,7 +28,7 @@ def get_count(line: str) -> int: # If the split string has more than one element, return the second element as an integer if len(split_line) > 1: return int(split_line[1]) - + # If the split string has only one element, return 0 return 0 @@ -46,7 +46,7 @@ def get_node_info_db(cursor: cursor, node: str) -> tuple: """ # Define the SQL query to retrieve node information query = """ - SELECT + SELECT service_tag, admin_ip, cpu, @@ -54,19 +54,20 @@ def get_node_info_db(cursor: cursor, node: str) -> tuple: cpu_count, gpu_count, status, - admin_mac - FROM + admin_mac, + hostname + FROM cluster.nodeinfo - WHERE + WHERE node = %s """ - + # Execute the SQL query with the given node cursor.execute(query, (node,)) - + # Fetch the node info node_info = cursor.fetchone() - + # Return the node information return node_info @@ -87,8 +88,9 @@ def get_updated_cpu_gpu_info(node: str) -> tuple: no_gpu_str = "No GPU Found" intel_cpu_str = "Intel CPU Found" amd_cpu_str = "AMD CPU Found" + intel_gpu_str = "Intel GPU Found" no_cpu_str = "No CPU Found" - + # Initialize variables cpu = "" cpu_count = 0 @@ -96,7 +98,7 @@ def get_updated_cpu_gpu_info(node: str) -> tuple: gpu_count = 0 gpu_found = False cpu_found = False - + # Define the path to the log file computes_log_file_path = '/var/log/xcat/computes.log' @@ -105,47 +107,52 @@ def get_updated_cpu_gpu_info(node: str) -> tuple: with open(computes_log_file_path, 'r', encoding='utf-8') as file: # Read the contents of the file contents = file.readlines() - - # Iterate over the lines in reverse order - for line in reversed(contents): - # Check if the node name is present in the line - if node in line: - # Check if the GPU have been found - if gpu_found == False: - # Check if the Nvidia GPU str is present in the line - if nvidia_gpu_str in line: - gpu = "nvidia" - gpu_count = get_count(line) - gpu_found = True - # Check if the AMD GPU str is present in the line - elif amd_gpu_str in line: - gpu = "amd" - gpu_count = get_count(line) - gpu_found = True - # Check if the No GPU str is present in the line - elif no_gpu_str in line: - gpu_found = True - - # Check if the CPU has been found - if cpu_found == False: - # Check if the Intel CPU str is present in the line - if intel_cpu_str in line: - cpu = "intel" - cpu_count = get_count(line) - cpu_found = True - # Check if the AMD CPU str is present in the line - elif amd_cpu_str in line: - cpu = "amd" - cpu_count = get_count(line) - cpu_found = True - # Check if the No CPU str is present in the line - elif no_cpu_str in line: - cpu_found = True - - # Break out of the loop if both GPU and CPU have been found - if cpu_found == True and gpu_found == True: - break - + if contents: + # Iterate over the lines in reverse order + for line in reversed(contents): + # Check if the node name is present in the line + if node in line: + # Check if the GPU have been found + if gpu_found == False: + # Check if the Nvidia GPU str is present in the line + if nvidia_gpu_str in line: + gpu = "nvidia" + gpu_count = get_count(line) + gpu_found = True + # Check if the AMD GPU str is present in the line + elif amd_gpu_str in line: + gpu = "amd" + gpu_count = get_count(line) + gpu_found = True + # Check if the Intel GPU str is present in the line + elif intel_gpu_str in line: + gpu = "intel" + gpu_count = get_count(line) + gpu_found = True + # Check if the No GPU str is present in the line + elif no_gpu_str in line: + 
gpu_found = True + + # Check if the CPU has been found + if cpu_found == False: + # Check if the Intel CPU str is present in the line + if intel_cpu_str in line: + cpu = "intel" + cpu_count = get_count(line) + cpu_found = True + # Check if the AMD CPU str is present in the line + elif amd_cpu_str in line: + cpu = "amd" + cpu_count = get_count(line) + cpu_found = True + # Check if the No CPU str is present in the line + elif no_cpu_str in line: + cpu_found = True + + # Break out of the loop if both GPU and CPU have been found + if cpu_found == True and gpu_found == True: + break + except FileNotFoundError: # Log an error if the file is not found syslog.syslog(syslog.LOG_ERR, f"parse_syslog:get_updated_cpu_gpu_info: File '{computes_log_file_path}' not found") @@ -170,7 +177,7 @@ def update_db(cursor: cursor, node: str, updated_node_info: tuple) -> None: """ # Unpack the updated node information tuple cpu, gpu, cpu_count, gpu_count = updated_node_info - + # Prepare the SQL query for updating the database sql_update_db = """ UPDATE @@ -188,70 +195,65 @@ def update_db(cursor: cursor, node: str, updated_node_info: tuple) -> None: cursor.execute(sql_update_db, params) -def remove_servicetag_inventory(inventory_file: str, service_tag: str) -> None: +def remove_hostname_inventory(inventory_file: str, hostname: str) -> None: """ - Removes a service tag from the inventory file. + Removes a hostname from the inventory file. Args: inventory_file (str): The name of the inventory file. - service_tag (str): The service tag to remove. + hostname (str): The hostname to remove. """ try: # Read the inventory file - config = configparser.ConfigParser(allow_no_value=True) + config = commentedconfigparser.CommentedConfigParser(allow_no_value=True) config.read(inventory_file, encoding='utf-8') - + # Change the permission of the file os.chmod(inventory_file, 0o644) - # Remove service tag if exists in the inventory file - if not config.remove_option(inventory_file, service_tag): - # Log a message if the service tag is not found - syslog.syslog(syslog.LOG_INFO, f"parse_syslog:remove_servicetag_inventory: '{service_tag}' is not found in '{inventory_file}'") + # Remove hostname if exists in the inventory file + if not config.remove_option(inventory_file, hostname): + # Log a message if the hostname is not found + syslog.syslog(syslog.LOG_INFO, f"parse_syslog:remove_hostname_inventory: '{hostname}' is not found in '{inventory_file}'") return - + # Write the updated inventory file with open(inventory_file, 'w', encoding='utf-8') as configfile: config.write(configfile, space_around_delimiters=False) - except (configparser.DuplicateOptionError, - configparser.DuplicateSectionError, - configparser.NoSectionError, + except (OSError, Exception) as err: - syslog.syslog(syslog.LOG_ERR, f"parse_syslog:remove_servicetag_inventory: {str(type(err))} {str(err)}") + syslog.syslog(syslog.LOG_ERR, f"parse_syslog:remove_hostname_inventory: {str(type(err))} {str(err)}") finally: # Change the permission of the file to readonly os.chmod(inventory_file, 0o444) -def add_servicetag_inventory(inventory_file: str, service_tag: str) -> None: +def add_hostname_inventory(inventory_file: str, hostname: str) -> None: """ - Adds a service tag to the inventory file. + Adds a hostname to the inventory file. Args: inventory_file (str): The path to the inventory file. - service_tag (str): The service tag to add. + hostname (str): The hostname to add. 
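+
+    Note:
+        The inventory file is temporarily made writable (0644) for the
+        rewrite and restored to read-only (0444) in the finally block.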
""" try: # Read the config file - config = configparser.ConfigParser(allow_no_value=True) + config = commentedconfigparser.CommentedConfigParser(allow_no_value=True) config.read(inventory_file, encoding='utf-8') - + # Change the permission of the file os.chmod(inventory_file, 0o644) - # Set the service tag - config.set(inventory_file, service_tag) - + # Set the hostname + config.set(inventory_file, hostname) + # Write the inventory file with open(inventory_file, 'w', encoding='utf-8') as configfile: config.write(configfile, space_around_delimiters=False) - except (configparser.DuplicateOptionError, - configparser.DuplicateSectionError, - configparser.NoSectionError, - OSError, + except (OSError, Exception) as err: - syslog.syslog(syslog.LOG_ERR, f"parse_syslog:add_servicetag_inventory: {str(type(err))} {str(err)}") + syslog.syslog(syslog.LOG_ERR, f"parse_syslog:add_hostname_inventory: {str(type(err))} {str(err)}") finally: # Change the permission of the file to readonly os.chmod(inventory_file, 0o444) @@ -260,19 +262,19 @@ def add_servicetag_inventory(inventory_file: str, service_tag: str) -> None: def update_inventory(node_info_db: tuple, updated_node_info: tuple) -> None: """ Update the inventory files based on the updated node information. - + Args: node_info_db (tuple): A tuple containing the node information from the database. updated_node_info (tuple): A tuple containing the updated node information. """ - + try: # Unpack the node information from the tuples - service_tag, admin_ip, db_cpu, db_gpu = node_info_db[0], node_info_db[1], node_info_db[2], node_info_db[3] + service_tag, admin_ip, db_cpu, db_gpu, hostname = node_info_db[0], node_info_db[1], node_info_db[2], node_info_db[3], node_info_db[8] updated_cpu, updated_gpu = updated_node_info[0], updated_node_info[1] - - # No modification in inventory if no service tag - if not service_tag: + + # No modification in inventory if no hostname + if not hostname: return # Change the current working directory to the inventory directory @@ -280,30 +282,40 @@ def update_inventory(node_info_db: tuple, updated_node_info: tuple) -> None: omnia_inventory_dir = "/opt/omnia/omnia_inventory/" if curr_dir != omnia_inventory_dir: os.chdir(omnia_inventory_dir) - + # Update inventory files if the CPU has been modified if updated_cpu != db_cpu: if db_cpu: - # Remove existing service tag from corresponding inventory file + # Remove existing hostname from corresponding inventory file inventory_file_str = "compute_cpu_intel" if db_cpu == "intel" else "compute_cpu_amd" - remove_servicetag_inventory(inventory_file_str, service_tag) + remove_hostname_inventory(inventory_file_str, hostname) if updated_cpu: - # Add service tag to corresponding inventory file + # Add hostname to corresponding inventory file inventory_file_str = "compute_cpu_intel" if updated_cpu == "intel" else "compute_cpu_amd" - add_servicetag_inventory(inventory_file_str, service_tag) - # Add service tag and admin ip to compute_servicetag_ip inventory file - service_tag_ip_str = f"{service_tag} ansible_host={admin_ip}" - add_servicetag_inventory("compute_servicetag_ip", service_tag_ip_str) - + add_hostname_inventory(inventory_file_str, hostname) + # Add hostname and admin ip to compute_hostname_ip inventory file + hostname_ip_str = f"{hostname} ansible_host={admin_ip}" + add_hostname_inventory("compute_hostname_ip", hostname_ip_str) + # Update inventory files if the GPU has been modified if updated_gpu != db_gpu: if db_gpu: - # Remove existing service tag from corresponding inventory file - 
inventory_file_str = "compute_gpu_nvidia" if db_gpu == "nvidia" else "compute_gpu_amd" - remove_servicetag_inventory(inventory_file_str, service_tag) + # Remove existing hostname from corresponding inventory file + if db_gpu == "nvidia": + inventory_file_str = "compute_gpu_nvidia" + elif db_gpu == "amd": + inventory_file_str = "compute_gpu_amd" + elif db_gpu == "intel": + inventory_file_str = "compute_gpu_intel" + remove_hostname_inventory(inventory_file_str, hostname) if updated_gpu: - # Add service tag to corresponding inventory file - inventory_file_str = "compute_gpu_nvidia" if updated_gpu == "nvidia" else "compute_gpu_amd" - add_servicetag_inventory(inventory_file_str, service_tag) + # Add hostname to corresponding inventory file + if updated_gpu == "nvidia": + inventory_file_str = "compute_gpu_nvidia" + elif updated_gpu == "amd": + inventory_file_str = "compute_gpu_amd" + elif updated_gpu == "intel": + inventory_file_str = "compute_gpu_intel" + add_hostname_inventory(inventory_file_str, hostname) except Exception as e: syslog.syslog(syslog.LOG_ERR, f"parse_syslog:update_inventory: Exception occurred: {str(type(e))} {str(e)}") diff --git a/discovery/roles/monitor_thread/tasks/initiate_monitor_status.yml b/discovery/roles/monitor_thread/tasks/initiate_monitor_status.yml index 2042a9b16..5d492ba59 100644 --- a/discovery/roles/monitor_thread/tasks/initiate_monitor_status.yml +++ b/discovery/roles/monitor_thread/tasks/initiate_monitor_status.yml @@ -15,8 +15,8 @@ - name: Install development packages when: - - control_plane_os in control_plane_os_redhat or - control_plane_os in control_plane_os_rocky + - oim_os in oim_os_redhat or + oim_os in oim_os_rocky ansible.builtin.package: name: "{{ psql_devel_packages }}" state: present diff --git a/discovery/roles/monitor_thread/vars/main.yml b/discovery/roles/monitor_thread/vars/main.yml index 699395404..dae19058d 100644 --- a/discovery/roles/monitor_thread/vars/main.yml +++ b/discovery/roles/monitor_thread/vars/main.yml @@ -14,12 +14,12 @@ --- # Usage: initiate_monitor_status.yml -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" psql_devel_packages: - postgresql-devel - python3-devel -python_version: python3.9 +python_version: "{{ ansible_python_interpreter }}" omnia_service_shell_file_path: /opt/omnia/omnia_service_script.sh shell_file_permissions: "0777" omnia_service_path: /etc/systemd/system/omnia.service diff --git a/discovery/roles/postscripts/common/tasks/check_nodes_all.yml b/discovery/roles/postscripts/common/tasks/check_nodes_all.yml index 7a97839ae..7dbc425dc 100644 --- a/discovery/roles/postscripts/common/tasks/check_nodes_all.yml +++ b/discovery/roles/postscripts/common/tasks/check_nodes_all.yml @@ -18,7 +18,7 @@ all_node_status: false - name: Fetch nodes with group all - ansible.builtin.command: lsdef all + ansible.builtin.command: "{{ xcat_path }}/lsdef all" changed_when: false register: check_all_nodes failed_when: false diff --git a/discovery/roles/postscripts/common/tasks/configure_postscripts.yml b/discovery/roles/postscripts/common/tasks/configure_postscripts.yml index cf00022ce..220070c8b 100644 --- a/discovery/roles/postscripts/common/tasks/configure_postscripts.yml +++ b/discovery/roles/postscripts/common/tasks/configure_postscripts.yml @@ -14,14 +14,15 @@ --- - name: Configure default postscripts - ansible.builtin.command: chdef all postscripts="syslog,remoteshell,syncfiles" + ansible.builtin.command: "{{ xcat_path }}/chdef all 
postscripts=\"syslog,remoteshell,syncfiles\"" changed_when: true -- name: Configure postscripts - ansible.builtin.include_tasks: "{{ role_path }}/../{{ control_plane_os }}/tasks/configure_postscripts.yml" +- name: Configure ubuntu postscripts + ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/configure_postscripts.yml" - name: Configure ntp postscripts ansible.builtin.include_tasks: configure_postscripts_ntp.yml + when: ntp_support - name: Copy hostname script to postscripts ansible.builtin.template: @@ -31,7 +32,7 @@ with_items: "{{ hostname_postscripts_path }}" - name: Configure postscripts to configure hostname - ansible.builtin.command: chdef all -p postscripts=omnia_hostname + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_hostname" changed_when: true - name: Verify OFED repo created @@ -41,7 +42,7 @@ when: ofed_config_status - name: Configure postscripts for OFED - ansible.builtin.command: chdef all -p postscripts=omnia_ofed + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_ofed" changed_when: true when: - ofed_config_status @@ -54,17 +55,22 @@ when: cuda_config_status - name: Configure postscripts for CUDA - ansible.builtin.command: chdef all -p postscripts=omnia_cuda + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_cuda" changed_when: true when: - cuda_config_status - cuda_repo_stat.stat.exists - name: Configure postscripts for ROCm - ansible.builtin.command: chdef all -p postscripts=omnia_rocm + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_rocm" changed_when: true when: amdgpu_config_status +- name: Configure postscripts for Intel Gaudi + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_intelgaudi" + changed_when: true + when: intelgaudi_config_status + - name: Copy CPU GPU info script to postscripts ansible.builtin.template: src: "{{ item.src }}" @@ -73,10 +79,10 @@ with_items: "{{ cpu_gpu_info_postscripts_path }}" - name: Configure postscripts to get cpu info - ansible.builtin.command: chdef all -p postscripts=omnia_cpu_gpu_info + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_cpu_gpu_info" changed_when: true - name: Configure postscripts for RoCE - ansible.builtin.command: chdef all -p postscripts=omnia_roce + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_roce" changed_when: true when: roce_config_status diff --git a/discovery/roles/postscripts/common/tasks/configure_postscripts_ntp.yml b/discovery/roles/postscripts/common/tasks/configure_postscripts_ntp.yml index 2b16af67d..900ca45d8 100644 --- a/discovery/roles/postscripts/common/tasks/configure_postscripts_ntp.yml +++ b/discovery/roles/postscripts/common/tasks/configure_postscripts_ntp.yml @@ -13,10 +13,19 @@ # limitations under the License. 
--- -- name: Setup ntp - ansible.builtin.command: makentp - changed_when: true +- name: Configure NTP + block: + - name: Setup NTP + ansible.builtin.command: "{{ xcat_path }}/makentp" + changed_when: true + async: "{{ async_time }}" + poll: "{{ poll_time }}" -- name: Configure postscripts for ntp - ansible.builtin.command: chdef all -p postscripts=setupntp - changed_when: true + - name: Configure postscripts for NTP + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=setupntp" + changed_when: true + rescue: + - name: Warning - Failed to configure NTP + ansible.builtin.pause: + prompt: "{{ setup_ntp_warning_msg }}" + seconds: "{{ warning_time }}" diff --git a/discovery/roles/postscripts/common/tasks/main.yml b/discovery/roles/postscripts/common/tasks/main.yml index ab25babb0..4f3f4111c 100644 --- a/discovery/roles/postscripts/common/tasks/main.yml +++ b/discovery/roles/postscripts/common/tasks/main.yml @@ -20,17 +20,17 @@ MANPATH: "{{ xcat_manpath_env }}" PERL_BADLANG: "{{ perl_badlang_env }}" block: - - name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/../{{ control_plane_os }}/vars/main.yml" + - name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/../{{ oim_os }}/vars/main.yml" - - name: Configure postscripts on {{ control_plane_os }} - ansible.builtin.include_tasks: "{{ role_path }}/../{{ control_plane_os }}/tasks/main.yml" + - name: Configure postscripts on {{ oim_os }} + ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/main.yml" - name: Check all node group status ansible.builtin.include_tasks: check_nodes_all.yml - name: Configuration of postbootscripts - ansible.builtin.include_tasks: "{{ role_path }}/../{{ control_plane_os }}/tasks/configure_postbootscripts.yml" + ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/configure_postbootscripts.yml" when: all_node_status - name: Configuration of postscripts diff --git a/discovery/roles/postscripts/common/templates/omnia_cpu_gpu_info.j2 b/discovery/roles/postscripts/common/templates/omnia_cpu_gpu_info.j2 index 293d8d05c..3d9f1a7c9 100644 --- a/discovery/roles/postscripts/common/templates/omnia_cpu_gpu_info.j2 +++ b/discovery/roles/postscripts/common/templates/omnia_cpu_gpu_info.j2 @@ -36,6 +36,9 @@ amd_processing_accelerator_count=`lshw | grep --after-context=2 "description: Pr # Compute AMD GPU count amd_gpu_count=$((amd_display_controller_count + amd_processing_accelerator_count)) +# Compute Intel GPU count +intel_gpu_count=$(lspci | grep -ic 'Processing accelerators: Habana Labs Ltd') + # Check for Intel CPU if [ $intel_cpu_check -gt 0 ]; then cpu_str="Intel CPU Found Count=$cpu_count" @@ -52,6 +55,8 @@ if [ $nvidia_gpu_count -gt 0 ]; then # Check for AMD GPU elif [ $amd_gpu_count -gt 0 ]; then gpu_str="AMD GPU Found Count=$amd_gpu_count" +elif [ $intel_gpu_count -gt 0 ]; then + gpu_str="Intel GPU Found Count=$intel_gpu_count" else gpu_str="No GPU Found" fi diff --git a/discovery/roles/postscripts/common/templates/omnia_rocm.j2 b/discovery/roles/postscripts/common/templates/omnia_rocm.j2 index 40add3e17..524921c68 100644 --- a/discovery/roles/postscripts/common/templates/omnia_rocm.j2 +++ b/discovery/roles/postscripts/common/templates/omnia_rocm.j2 @@ -33,6 +33,9 @@ name=amdgpu baseurl=http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/yum/amdgpu/{{ amdgpu_version }} enabled=1 gpgcheck=0 +{% if proxy_status %} +proxy=_none_ +{% endif %} EOF dnf clean all @@ -47,4 +50,4 @@ EOF echo "AMDGPU 
installation completed" >> /var/log/xcat/xcat.log fi echo "-----------------------------" >> /var/log/xcat/xcat.log -fi \ No newline at end of file +fi diff --git a/discovery/roles/postscripts/common/vars/main.yml b/discovery/roles/postscripts/common/vars/main.yml index ee6bf9e3b..d903448c4 100644 --- a/discovery/roles/postscripts/common/vars/main.yml +++ b/discovery/roles/postscripts/common/vars/main.yml @@ -18,6 +18,7 @@ xcat_root_env: "/opt/xcat" xcat_path_env: "/opt/xcat/bin:/opt/xcat/sbin:/opt/xcat/share/xcat/tools" xcat_manpath_env: "/opt/xcat/share/man:$MANPATH" perl_badlang_env: 0 +xcat_path: /opt/xcat/bin # Usage: check_nodes_all.yml all_nodes_warning_msg: "[WARNING] Not found any nodes using the given discovery mechanism. @@ -28,9 +29,16 @@ mlnx_ofed_repo: /install/ofed cuda_core_path: /install/cuda/x86_64/cuda-core hostname_postscripts_path: - { src: "{{ role_path }}/templates/omnia_hostname.j2", dest: "/install/postscripts/omnia_hostname", mode: "755" } -control_plane_os_ubuntu: "ubuntu" +oim_os_ubuntu: "ubuntu" cpu_gpu_info_postscripts_path: - { src: "{{ role_path }}/templates/omnia_cpu_gpu_info.j2", dest: "/install/postscripts/omnia_cpu_gpu_info", mode: "755" } provision_os_ubuntu: "ubuntu" ubuntu_reboot_script_path: - { src: "{{ role_path }}/files/omnia_reboot", dest: "/install/postscripts/omnia_reboot", mode: "755" } + +# Usage: configure_postsctipts_ntp.yml +setup_ntp_warning_msg: "[WARNING] Failed to setup NTP server in Omnia Infrastructure Manager. This can be due to public NTP pools blocked via ICMP. +Skipping NTP configuration in the cluster. If public NTP pools not reachable from Omnia Infrastructure Manager, set ntp_support to false in provison_config.yml." +warning_time: 30 +async_time: 300 +poll_time: 15 diff --git a/discovery/roles/postscripts/redhat/tasks/configure_cuda.yml b/discovery/roles/postscripts/redhat/tasks/configure_cuda.yml index 9a3d6cfdf..2b51f37d5 100644 --- a/discovery/roles/postscripts/redhat/tasks/configure_cuda.yml +++ b/discovery/roles/postscripts/redhat/tasks/configure_cuda.yml @@ -84,5 +84,5 @@ line: 'echo "PATH=$PATH:/opt/dell/srvadmin/sbin:/usr/local/sbin:/usr/local/bin/:/usr/local/cuda/bin" >> /etc/bashrc' - name: Update osimage with cuda repository - ansible.builtin.command: chdef -t osimage -o {{ provision_os_image }} -p pkgdir={{ cuda_core_path }} + ansible.builtin.command: "{{ xcat_path }}/chdef -t osimage -o {{ provision_os_image }} -p pkgdir={{ cuda_core_path }}" changed_when: true diff --git a/discovery/roles/postscripts/redhat/tasks/configure_postbootscripts.yml b/discovery/roles/postscripts/redhat/tasks/configure_postbootscripts.yml index 1c7322460..0bdc67412 100644 --- a/discovery/roles/postscripts/redhat/tasks/configure_postbootscripts.yml +++ b/discovery/roles/postscripts/redhat/tasks/configure_postbootscripts.yml @@ -14,7 +14,7 @@ --- - name: Configure postbootscripts - ansible.builtin.command: chdef all postbootscripts="otherpkgs" + ansible.builtin.command: "{{ xcat_path }}/chdef all postbootscripts=\"otherpkgs\"" changed_when: true - name: Configure admin postbootscripts diff --git a/discovery/roles/postscripts/redhat/tasks/configure_postscripts.yml b/discovery/roles/postscripts/redhat/tasks/configure_postscripts.yml index 62e6a60cc..93a685f33 100644 --- a/discovery/roles/postscripts/redhat/tasks/configure_postscripts.yml +++ b/discovery/roles/postscripts/redhat/tasks/configure_postscripts.yml @@ -21,5 +21,5 @@ with_items: "{{ omnia_disable_repo_postscripts_path }}" - name: Configure postscripts for omnia_disable_repo - 
ansible.builtin.command: chdef all -p postscripts=omnia_disable_repo + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_disable_repo" changed_when: true diff --git a/discovery/roles/postscripts/redhat/tasks/configure_postscripts_admin.yml b/discovery/roles/postscripts/redhat/tasks/configure_postscripts_admin.yml index fdf15c5d0..6774fe525 100644 --- a/discovery/roles/postscripts/redhat/tasks/configure_postscripts_admin.yml +++ b/discovery/roles/postscripts/redhat/tasks/configure_postscripts_admin.yml @@ -44,5 +44,5 @@ with_items: "{{ configeth_patch_path }}" - name: Configure network and hostname postbootscripts - ansible.builtin.command: chdef all -p postbootscripts="confignetwork -s,omnia_hostname" + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postbootscripts=\"confignetwork -s,omnia_hostname\"" changed_when: true diff --git a/discovery/roles/postscripts/redhat/tasks/pre_requisite.yml b/discovery/roles/postscripts/redhat/tasks/pre_requisite.yml index f81a1f964..6045ac15f 100644 --- a/discovery/roles/postscripts/redhat/tasks/pre_requisite.yml +++ b/discovery/roles/postscripts/redhat/tasks/pre_requisite.yml @@ -14,12 +14,12 @@ --- - name: Check pkgdir of osimage - ansible.builtin.command: lsdef -t osimage -o {{ provision_os_image }} -i pkgdir + ansible.builtin.command: "{{ xcat_path }}/lsdef -t osimage -o {{ provision_os_image }} -i pkgdir" changed_when: false register: osimage_pkgdir - name: Remove cuda pkgdir - ansible.builtin.command: chdef -t osimage -o {{ provision_os_image }} -m pkgdir={{ cuda_core_path }},{{ cuda_deps_path }} + ansible.builtin.command: "{{ xcat_path }}/chdef -t osimage -o {{ provision_os_image }} -m pkgdir={{ cuda_core_path }},{{ cuda_deps_path }}" changed_when: true when: - not cuda_config_status @@ -30,6 +30,6 @@ path: "{{ xcat_rhel8_post_script }}" regexp: 'EOF' line: 'echo "PATH=$PATH:/opt/dell/srvadmin/sbin:/usr/local/sbin:/usr/local/bin/" >> /etc/bashrc' - when: + when: - not cuda_config_status - not amdgpu_config_status diff --git a/discovery/roles/postscripts/redhat/vars/main.yml b/discovery/roles/postscripts/redhat/vars/main.yml index 6a6f1873b..97e7aa9d1 100644 --- a/discovery/roles/postscripts/redhat/vars/main.yml +++ b/discovery/roles/postscripts/redhat/vars/main.yml @@ -35,6 +35,7 @@ invalid_cuda_rpm_fail_msg: "Failed. Invalid cuda_toolkit_path: {{ cuda_toolkit_p Make sure cuda rpm file is downloaded completely." 
cuda_postscripts_path: - { src: "{{ role_path }}/../redhat/templates/omnia_cuda.j2", dest: "/install/postscripts/omnia_cuda", mode: "755" } +xcat_path: /opt/xcat/bin # Usage: configure_rocm.yml rocm_postscripts_path: diff --git a/discovery/roles/postscripts/ubuntu/tasks/configure_cuda.yml b/discovery/roles/postscripts/ubuntu/tasks/configure_cuda.yml index 7ae96b02f..c52a247af 100644 --- a/discovery/roles/postscripts/ubuntu/tasks/configure_cuda.yml +++ b/discovery/roles/postscripts/ubuntu/tasks/configure_cuda.yml @@ -54,7 +54,7 @@ - name: Set cuda gpg key ansible.builtin.set_fact: - cuda_gpg_key: "{{ cuda_gpg_file.files[0].path.split('/')[-1] }}" + cuda_gpg_key: "{{ cuda_gpg_file.files[0].path.split('/')[-1] }}" cuda_debs_folder: "{{ cuda_gpg_file.files[0].path.split('/')[-2] }}" - name: Copy CUDA script to postscripts diff --git a/server_spec_update/roles/metadata_update/vars/main.yml b/discovery/roles/postscripts/ubuntu/tasks/configure_intelgaudi.yml similarity index 74% rename from server_spec_update/roles/metadata_update/vars/main.yml rename to discovery/roles/postscripts/ubuntu/tasks/configure_intelgaudi.yml index 9fe2f73a4..e532bcf3b 100644 --- a/server_spec_update/roles/metadata_update/vars/main.yml +++ b/discovery/roles/postscripts/ubuntu/tasks/configure_intelgaudi.yml @@ -13,8 +13,9 @@ # limitations under the License. --- -# Usage: update_metadata.yml -meta_path: "/opt/omnia/.data/nic_metadata.yml" -conf_file_mode: "0644" -python_version: "python3.9" -update_nic_metadata_py: "{{ role_path }}/files/update_nic_metadata.py" +- name: Copy Intel Gaudi script to postscripts + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + mode: "{{ item.mode }}" + with_items: "{{ intelgaudi_postscripts_path }}" diff --git a/discovery/roles/postscripts/ubuntu/tasks/configure_postbootscripts.yml b/discovery/roles/postscripts/ubuntu/tasks/configure_postbootscripts.yml index 72355e789..bbe9c9273 100644 --- a/discovery/roles/postscripts/ubuntu/tasks/configure_postbootscripts.yml +++ b/discovery/roles/postscripts/ubuntu/tasks/configure_postbootscripts.yml @@ -14,13 +14,24 @@ --- - name: Configure default postbootscripts - ansible.builtin.command: chdef all postbootscripts="otherpkgs" + ansible.builtin.command: "{{ xcat_path }}/chdef all postbootscripts=\"otherpkgs\"" changed_when: true - name: Configure hostname postbootscripts - ansible.builtin.command: chdef all -p postbootscripts="omnia_hostname" + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postbootscripts=\"omnia_hostname\"" + changed_when: true + +- name: Copy omnia_ssh_check script to postbootscripts + ansible.builtin.template: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + mode: "{{ item.mode }}" + with_items: "{{ omnia_ssh_check_postbootscripts_path }}" + +- name: Configure SSH services check postbootscripts + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postbootscripts=\"omnia_ssh_check\"" changed_when: true - name: Configure syncfiles postbootscripts - ansible.builtin.command: chdef all -p postbootscripts="syncfiles" - changed_when: true \ No newline at end of file + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postbootscripts=\"syncfiles\"" + changed_when: true diff --git a/discovery/roles/postscripts/ubuntu/tasks/configure_postscripts.yml b/discovery/roles/postscripts/ubuntu/tasks/configure_postscripts.yml index 67d658ff8..2e9c24963 100644 --- a/discovery/roles/postscripts/ubuntu/tasks/configure_postscripts.yml +++ 
b/discovery/roles/postscripts/ubuntu/tasks/configure_postscripts.yml @@ -14,22 +14,13 @@ --- - name: Configure postscripts for syslog,remoteshell - ansible.builtin.command: tabch node=xcatdefaults postscripts.postscripts="syslog,remoteshell" + ansible.builtin.command: "{{ xcat_sbin_path }}/tabch node=xcatdefaults postscripts.postscripts=\"syslog,remoteshell\"" changed_when: true - name: Configure default postscripts - ansible.builtin.command: chdef all postscripts="syslog,remoteshell" + ansible.builtin.command: "{{ xcat_path }}/chdef all postscripts=\"syslog,remoteshell\"" changed_when: true -- name: Fetch control_plane hostname - ansible.builtin.command: hostname - changed_when: false - register: cp_hostname - -- name: Set control_plane_hostname - ansible.builtin.set_fact: - control_plane_hostname: "{{ cp_hostname.stdout }}" - - name: Copy omnia_ubuntu script to postscripts ansible.builtin.template: src: "{{ item.src }}" @@ -38,9 +29,9 @@ with_items: "{{ omnia_ubuntu_postscripts_path }}" - name: Configure postscripts for omnia_ubuntu - ansible.builtin.command: chdef all -p postscripts=omnia_ubuntu + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=omnia_ubuntu" changed_when: true - name: Configure postscripts for confignetwork - ansible.builtin.command: chdef all -p postscripts="confignetwork -s" + ansible.builtin.command: "{{ xcat_path }}/chdef all -p postscripts=\"confignetwork -s\"" changed_when: true diff --git a/discovery/roles/postscripts/ubuntu/tasks/main.yml b/discovery/roles/postscripts/ubuntu/tasks/main.yml index 9382d9f64..3e067e3d5 100644 --- a/discovery/roles/postscripts/ubuntu/tasks/main.yml +++ b/discovery/roles/postscripts/ubuntu/tasks/main.yml @@ -28,3 +28,7 @@ - name: Configure RoCE postscripts ansible.builtin.include_tasks: configure_roce.yml when: roce_config_status + +- name: Configure Intel Gaudi postscripts + ansible.builtin.include_tasks: configure_intelgaudi.yml + when: intelgaudi_config_status diff --git a/discovery/roles/postscripts/ubuntu/templates/omnia_intelgaudi.j2 b/discovery/roles/postscripts/ubuntu/templates/omnia_intelgaudi.j2 new file mode 100644 index 000000000..b51a25c32 --- /dev/null +++ b/discovery/roles/postscripts/ubuntu/templates/omnia_intelgaudi.j2 @@ -0,0 +1,95 @@ +#!/bin/bash +################################################################################################################ +# omnia_intelgaudi: +# Install Intel Gaudi drivers on all the cluster nodes +# +################################################################################################################# +echo "--------------------------" >> /var/log/xcat/xcat.log +echo "Checking for Intel Gaudi cards" >> /var/log/xcat/xcat.log +gaudi_check_processing_xlr8r=`lspci | grep "Processing accelerators: Habana Labs Ltd"` +validate_ubuntu_os="$(cat /etc/os-release | grep 'ID=ubuntu' | wc -l)" + +cron_job_for_scale_out_interfaces() { + +echo "Set cron job for Gaudi scale out interfaces" >> /var/log/xcat/xcat.log + +if [ ! -f /opt/habanalabs/qual/gaudi2/bin/manage_network_ifs.sh ] && [ ! -f /opt/habanalabs/qual/gaudi3/bin/manage_network_ifs.sh ] ; then + echo "File manage_network_ifs.sh not found, stop setting." 
>> /var/log/xcat/xcat.log
+    return 1
+fi
+
+base_dir="/opt/omnia/cronjobs"
+
+mkdir -p $base_dir
+
+script_to_run="$base_dir/bring_up_ports.sh"
+
+cat <<'EOF' > $script_to_run
+#!/bin/bash
+gaudi_ver=$(if [ -n "$(lspci -n -d 1da3:1060:1200)" ]; then echo "gaudi3"; else echo "gaudi2"; fi)
+
+cmd="/opt/habanalabs/qual/$gaudi_ver/bin/manage_network_ifs.sh --up"
+$cmd
+RET_CODE=$?
+if [ "${RET_CODE}" -eq "1" ]; then
+    echo "One or more Gaudi ports are down." >> /dev/stderr
+    # 'exit' (not 'return') is required here: this script runs standalone
+    # from cron, so 'return' outside a function is a bash error.
+    exit 1
+fi
+EOF
+
+chmod +x $script_to_run
+
+if ! systemctl is-active --quiet cron; then
+    echo "Cron service is not active, starting and enabling it now." >> /var/log/xcat/xcat.log
+    sudo systemctl start cron
+    sudo systemctl enable cron
+fi
+
+cron_job="@reboot $script_to_run"
+
+temp_crontab=$(mktemp)
+
+crontab -l > $temp_crontab 2>/dev/null
+
+if grep -q "$cron_job" $temp_crontab ; then
+    echo "Cron job already configured." >> /var/log/xcat/xcat.log
+else
+    echo $cron_job >> $temp_crontab
+    crontab $temp_crontab
+fi
+
+rm $temp_crontab
+
+}
+
+if [[ $gaudi_check_processing_xlr8r == *"Habana Labs Ltd"* ]]; then
+    echo "Installing Intel Gaudi" >> /var/log/xcat/xcat.log
+    if [[ $validate_ubuntu_os == "1" ]]
+    then
+
+        echo "deb [trusted=yes] http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/apt/intelgaudi/{{ intelgaudi_version }} ./" >> /etc/apt/sources.list.d/intelgaudi.list
+
+        sudo apt-get update
+        sudo apt install libopenmpi3 dkms libelf-dev -y
+        sudo apt install linux-headers-$(uname -r) -y
+
+        sudo apt install -y habanalabs-dkms \
+            habanalabs-firmware \
+            habanalabs-firmware-tools \
+            habanalabs-graph \
+            habanalabs-qual \
+            habanalabs-rdma-core \
+            habanalabs-thunk \
+            habanatools
+
+        rm /etc/apt/sources.list.d/intelgaudi.list
+
+        apt-get update
+        echo "Intel Gaudi driver installation completed" >> /var/log/xcat/xcat.log
+
+        # make sure Gaudi scale out interfaces are up after rebooting
+        cron_job_for_scale_out_interfaces
+
+    fi
+    echo "-----------------------------" >> /var/log/xcat/xcat.log
+fi
\ No newline at end of file
diff --git a/discovery/roles/postscripts/ubuntu/templates/omnia_roce.j2 b/discovery/roles/postscripts/ubuntu/templates/omnia_roce.j2
index 6b20d0e6e..cecf3d925 100644
--- a/discovery/roles/postscripts/ubuntu/templates/omnia_roce.j2
+++ b/discovery/roles/postscripts/ubuntu/templates/omnia_roce.j2
@@ -12,17 +12,77 @@ if [[ -n $thor_check ]]
 then
     echo "starting RoCE Installation" >> /var/log/xcat/xcat.log
     # Download the roce_src tar file
-    wget "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/tarball/{{ roce_package_name }}"
+    wget --quiet --show-progress "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/tarball/{{ roce_package_name }}"
     # Extract the contents of the tar file
     extracted_driver_name=$(tar -tzf "{{ roce_package_name }}" | head -n 1)
     tar -xvzf "{{ roce_package_name }}"
+
+    OS_VERSION=$(lsb_release -ds | cut -d ' ' -f 2)
+    OS_VERSION_MAJOR=$(lsb_release -rs)
+    echo "OS VERSION: '$OS_VERSION'" >> /var/log/xcat/xcat.log
+
+    ROCE_VERSION=$(echo "$extracted_driver_name" | grep -oP '\d+' | head -n 1)
+    echo "RoCE Major Version: $ROCE_VERSION" >> /var/log/xcat/xcat.log
+
     # install the L2, RoCE, and Peer Memory Direct (GPU direct) drivers
-    sudo apt install -y "$(find ./$extracted_driver_name/ -name "netxtreme-peer-mem-dkms*.deb")"
+    echo "starting RoCE netxtreme-peer-mem-dkms Installation" >> /var/log/xcat/xcat.log
+
+    # Search for .deb files using full OS version
+    echo "Searching for files matching OS version: $OS_VERSION" >>
/var/log/xcat/xcat.log + FILES=$(find ./$extracted_driver_name/ -wholename ".*$OS_VERSION*/netxtreme-peer-mem-dkms*.deb") + + # If files are found for the full OS version, move to RoCE version check + if [[ -n $FILES ]]; then + echo "Files found for full OS version: $OS_VERSION" + echo -e "Files:\n$FILES" + # Proceed to check for RoCE version + elif [[ -z $FILES ]]; then + echo "No files found for OS version $OS_VERSION. Searching for major version: $OS_VERSION_MAJOR" + # Search for OS major version + FILES=$(find ./$extracted_driver_name/ -wholename ".*$OS_VERSION_MAJOR*/netxtreme-peer-mem-dkms*.deb") + if [[ -n $FILES ]]; then + echo "Files found for major version: $OS_VERSION_MAJOR" + echo -e "Files:\n$FILES" + # Proceed to check for RoCE version + elif [[ -z $FILES ]]; then + echo "No files found for major version $OS_VERSION_MAJOR. Searching for generic netxtreme-peer-mem-dkms debian files." + # Search for any generic .deb files + FILES=$(find ./$extracted_driver_name/ -name "netxtreme-peer-mem-dkms*.deb") + if [[ -n $FILES ]]; then + echo "Generic files found" + echo -e "Files:\n$FILES" + fi + fi + fi + + if [[ -n $FILES ]]; then + # Now search for files matching RoCE version + MATCHED_FILE=$(echo "$FILES" | grep -E "netxtreme-peer-mem-dkms.*$ROCE_VERSION.*\.deb" | tail -n 1) + + # If a matching file is found, install it + if [[ -n $MATCHED_FILE ]]; then + echo "Installing RoCE - $ROCE_VERSION match : $MATCHED_FILE" >> /var/log/xcat/xcat.log + sudo apt install -y "$MATCHED_FILE" + echo "Completed Installation Process." + else + echo "No netxtreme-peer-mem-dkms debian file found matching ROCE version $ROCE_VERSION ." >> /var/log/xcat/xcat.log + echo "Installing all : $FILES" >> /var/log/xcat/xcat.log + for file in $FILES; do + sudo apt install -y "$file" + done + fi + echo "Completed RoCE netxtreme-peer-mem-dkms Installation" >> /var/log/xcat/xcat.log + else + echo -e "RoCE installation failed. \n Exiting as no debian files found for netxtreme-peer-mem-dkms criteria." 
>> /var/log/xcat/xcat.log
+        exit 1
+    fi
+
+
+    # Install build dependencies, then download the RoCE source drivers tar file
     sudo apt install -y linux-headers-"$(uname -r)"
     sudo apt install -y libelf-dev gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace
     if [ "{{ roce_src_package_name }}" != "omnia_default" ]; then
-        wget "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/tarball/{{ roce_src_package_name }}"
+        wget --quiet --show-progress "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/tarball/{{ roce_src_package_name }}"
         extracted_source_name=$(tar -tzf "{{ roce_src_package_name }}" | head -n 1)
         tar -xvzf "{{ roce_src_package_name }}"
         find ./$extracted_source_name/ -name "*netxtreme-bnxt_en*.tar.*" -exec sh -c '
@@ -38,6 +98,8 @@ then
     fi
     ' {} \;
     # compile and install the RoCE library
+    echo "Installing RoCE library" >> /var/log/xcat/xcat.log
+
     find ./$extracted_source_name/ -name "*libbnxt_re*.tar.*" -exec sh -c '
     # Extract the tar file
     tar -xvzf "$0" -C "$(dirname "$0")"
diff --git a/discovery/roles/postscripts/ubuntu/templates/omnia_ssh_check.j2 b/discovery/roles/postscripts/ubuntu/templates/omnia_ssh_check.j2
new file mode 100644
index 000000000..e8f2c7823
--- /dev/null
+++ b/discovery/roles/postscripts/ubuntu/templates/omnia_ssh_check.j2
@@ -0,0 +1,19 @@
+#!/bin/bash
+################################################################################################################
+# omnia_ssh_check:
+#       Wait until the SSH service starts
+#
+#################################################################################################################
+echo "Waiting for the SSH service to be ready..." >> /var/log/xcat/xcat.log
+retries=0
+
+until systemctl is-active --quiet ssh.service; do
+    if [ $retries -ge 10 ]; then
+        echo "SSH service did not start after 10 attempts. Exiting." >> /var/log/xcat/xcat.log
+        exit 0
+    fi
+    sleep 15
+    ((retries++))
+done
+
+echo "SSH service is ready." >> /var/log/xcat/xcat.log
\ No newline at end of file
diff --git a/discovery/roles/postscripts/ubuntu/templates/omnia_ubuntu.j2 b/discovery/roles/postscripts/ubuntu/templates/omnia_ubuntu.j2
index 02f5ece73..41a1e4b71 100644
--- a/discovery/roles/postscripts/ubuntu/templates/omnia_ubuntu.j2
+++ b/discovery/roles/postscripts/ubuntu/templates/omnia_ubuntu.j2
@@ -8,6 +8,7 @@ echo "---------------------------" >> /var/log/xcat/xcat.log
 echo "Started installing omnia packages" >> /var/log/xcat/xcat.log

 echo "Configure Proxy" >> /var/log/xcat/xcat.log
+echo "Acquire::http::Proxy::{{ admin_nic_ip }} \"DIRECT\";" >> /etc/apt/apt.conf
 echo "Acquire::http::Proxy \"http://{{ admin_nic_ip }}:3128\";" >> /etc/apt/apt.conf
 echo "Acquire::https::Proxy \"http://{{ admin_nic_ip }}:3128\";" >> /etc/apt/apt.conf
 apt clean
@@ -28,11 +29,16 @@ if [[ "$check_ubuntu22" == "1" ]]
 then
     echo "Installing racadm" >> /var/log/xcat/xcat.log
     wget "http://{{ admin_nic_ip }}:80/install{{ repo_store_path }}/cluster/tarball/racadm.tar.gz" -O /tmp/racadm.tar.gz
-    tar -zxvf /tmp/racadm.tar.gz -C /tmp
-    cd /tmp/iDRACTools/racadm && echo "y" | bash install_racadm.sh
+    racadm_url_status=$?
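+    # $? above holds the exit status of the racadm.tar.gz wget; the guard
+    # below skips extraction and installation when the download failed,
+    # rather than unpacking a missing or truncated archive.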
+ if [ $racadm_url_status -eq 0 ]; then + tar -zxvf /tmp/racadm.tar.gz -C /tmp + apt install libargtable2-0 -y + cd /tmp/iDRACTools/racadm && echo "y" | bash install_racadm.sh + fi fi -echo "{{ admin_nic_ip }} {{ control_plane_hostname }}" >> /etc/hosts +echo "127.0.0.1 localhost" >> /etc/hosts +echo "{{ admin_nic_ip }} {{ oim_hostname }}" >> /etc/hosts echo "Installed omnia packages" >> /var/log/xcat/xcat.log echo "---------------------------" >> /var/log/xcat/xcat.log diff --git a/discovery/roles/postscripts/ubuntu/vars/main.yml b/discovery/roles/postscripts/ubuntu/vars/main.yml index 3f3ef4b14..c6570d5d2 100644 --- a/discovery/roles/postscripts/ubuntu/vars/main.yml +++ b/discovery/roles/postscripts/ubuntu/vars/main.yml @@ -42,7 +42,17 @@ rocm_postscripts_path: # Usage: configure_postscripts.yml omnia_ubuntu_postscripts_path: - { src: "{{ role_path }}/../ubuntu/templates/omnia_ubuntu.j2", dest: "/install/postscripts/omnia_ubuntu", mode: "755" } +xcat_path: /opt/xcat/bin +xcat_sbin_path: /opt/xcat/sbin # Usage: configure_roce.yml roce_postscripts_path: - { src: "{{ role_path }}/../ubuntu/templates/omnia_roce.j2", dest: "/install/postscripts/omnia_roce", mode: "755" } + +# Usage: configure_intelgaudi.yml +intelgaudi_postscripts_path: + - { src: "{{ role_path }}/../ubuntu/templates/omnia_intelgaudi.j2", dest: "/install/postscripts/omnia_intelgaudi", mode: "755" } + +# Usage: configure_postbootscripts.yml +omnia_ssh_check_postbootscripts_path: + - { src: "{{ role_path }}/../ubuntu/templates/omnia_ssh_check.j2", dest: "/install/postscripts/omnia_ssh_check", mode: "755" } diff --git a/discovery_provision.yml b/discovery_provision.yml index dd7055af5..8a1e9dc16 100644 --- a/discovery_provision.yml +++ b/discovery_provision.yml @@ -12,6 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: utils/check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + +- name: Check if package manager is not locked + ansible.builtin.import_playbook: utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + +- name: Set flag + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + + - name: Set flag to indicate check_package_lock.yml has been executed + ansible.builtin.set_fact: + apt_lock_status: true - name: Validate discovery parameters # noqa:role-name[path] hosts: localhost @@ -19,8 +39,8 @@ roles: - discovery/roles/discovery_validations/common -- name: Prepare control plane for basic things - ansible.builtin.import_playbook: prepare_cp/prepare_cp.yml # noqa:role-name[path] +- name: Prepare Omnia Infrastructure Manager for basic things + ansible.builtin.import_playbook: prepare_oim/prepare_oim.yml # noqa:role-name[path] - name: Discover the nodes ansible.builtin.import_playbook: discovery/discovery.yml # noqa:role-name[path] diff --git a/docs/source/Appendices/hostnamereqs.rst b/docs/source/Appendices/hostnamereqs.rst index 45436d45d..3ffda362b 100644 --- a/docs/source/Appendices/hostnamereqs.rst +++ b/docs/source/Appendices/hostnamereqs.rst @@ -1,6 +1,8 @@ -**Hostname requirements** - * The hostname should not contain the following characters: , (comma), \. (period) or _ (underscore). However, the **domain name** is allowed with commas and periods. 
- * The hostname cannot start or end with a hyphen (-). - * No upper case characters are allowed in the hostname. - * The hostname cannot start with a number. - * The hostname and the domain name (that is: ``hostname00000x.domain.xxx``) cumulatively cannot exceed 64 characters. For example, if the ``node_name`` provided in ``input/provision_config.yml`` is 'node', and the ``domain_name`` provided is 'omnia.test', Omnia will set the hostname of a target cluster node to 'node000001.omnia.test'. Omnia appends 6 digits to the hostname to individually name each target node. \ No newline at end of file +Hostname requirements +====================== + +* The hostname should not contain the following characters: , (comma), \. (period) or _ (underscore). However, the **domain name** is allowed with commas and periods. +* The hostname cannot start or end with a hyphen (-). +* No upper case characters are allowed in the hostname. +* The hostname cannot start with a number. +* The hostname and the domain name (that is: ``hostname00000x.domain.xxx``) cumulatively cannot exceed 64 characters. For example, if the ``node_name`` provided in ``input/provision_config.yml`` is 'node', and the ``domain_name`` provided is 'omnia.test', Omnia will set the hostname of a target cluster node to 'node000001.omnia.test'. Omnia appends 6 digits to the hostname to individually name each target node. \ No newline at end of file diff --git a/docs/source/InstallationGuides/BuildingClusters/index.rst b/docs/source/InstallationGuides/BuildingClusters/index.rst deleted file mode 100644 index 39b4813d4..000000000 --- a/docs/source/InstallationGuides/BuildingClusters/index.rst +++ /dev/null @@ -1,31 +0,0 @@ -Configuring the cluster -======================= - -**Features enabled by omnia.yml** - - * Centralized authentication: Once all the required parameters in `security_config.yml `_ are filled in, ``omnia.yml`` can be used to set up FreeIPA/OpenLDAP. - - * Slurm: Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up slurm. - - * Kubernetes: Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up kubernetes. - - * Login Node (Additionally secure login node) - - -.. toctree:: - schedulerinputparams - schedulerprereqs - installscheduler - install_kubernetes - k8s_plugin_roce_nic - install_slurm - install_ucx_openmpi - Authentication - KubernetesAccess - BeeGFS - NFS - AMD_ROCm - - - - diff --git a/docs/source/InstallationGuides/BuildingClusters/install_kubernetes.rst b/docs/source/InstallationGuides/BuildingClusters/install_kubernetes.rst deleted file mode 100644 index dd7ecf6f6..000000000 --- a/docs/source/InstallationGuides/BuildingClusters/install_kubernetes.rst +++ /dev/null @@ -1,204 +0,0 @@ -Install Kubernetes -=================== - -**Prerequisites** - -* Ensure that ``k8s`` entry is present in the ``softwares`` list in ``software_config.json``, as mentioned below: - :: - - "softwares": [ - {"name": "k8s", "version":"1.26.12"}, - ] - -* Ensure to run ``local_repo.yml`` with the ``k8s`` entry present in ``software_config.json``, to download all required Kubernetes packages and images. - -* Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up Kubernetes. - -* Ensure that ``k8s_share`` is set to ``true`` in `storage_config.yml `_, for one of the entries in ``nfs_client_params``. 
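Before running ``local_repo.yml``, the ``k8s`` entry can be sanity-checked straight from ``input/software_config.json``; a small illustrative check (the path assumes Omnia's standard input directory layout): ::

    import json

    # Confirm that "k8s" appears in the softwares list of software_config.json.
    with open("input/software_config.json", encoding="utf-8") as f:
        softwares = json.load(f).get("softwares", [])

    entry = next((s for s in softwares if s.get("name") == "k8s"), None)
    print(entry if entry else "k8s entry missing -- add it before running local_repo.yml")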
- -**Inventory details** - -* For Kubernetes, all the applicable inventory groups are ``kube_control_plane``, ``kube_node``, and ``etcd``. - -* The inventory file must contain: - - 1. Exactly 1 ``kube_control_plane``. - 2. At least 1 ``kube_node``. - 3. Odd number of ``etcd`` nodes. - -.. note:: Ensure that the inventory includes an ``[etcd]`` node. etcd is a consistent and highly-available key value store used as Kubernetes' backing store for all cluster data. For more information, `click here. `_ - -**Sample inventory** -:: - - [kube_control_plane] - - 10.5.1.101 - - [kube_node] - - 10.5.1.102 - - [etcd] - - 10.5.1.101 - -.. note:: - If an additional NIC other than admin NIC is present on the cluster, inventory should be updated with argument ``ip``, and ``ip`` should have the value of required admin IP in case node has more than one network interface. If ``kube_control_plane`` has 2 interfaces ``eno1`` and ``eno2`` with IPs ``eno1=10.5.0.3`` and ``eno2=198.168.0.19``, inventory should have the following format: :: - - [kube_control_plane] - - 10.5.0.3 ip=10.5.0.3 - - [kube_node] - - 10.5.0.4 ip=10.5.0.4 - - [etcd] - - 10.5.0.3 ip=10.5.0.3 - -**To install Kubernetes** - -Run either of the following commands: - - 1. :: - - ansible-playbook omnia.yml -i inventory - - 2. :: - - ansible-playbook scheduler.yml -i inventory - -.. note:: To add new nodes to an existing cluster, click `here. <../addinganewnode.html>`_ - -**Additional installations** - -Omnia installs the following packages on top of the Kubernetes stack: - -1. *amdgpu-device-plugin (ROCm device plugin)* - - This is a Kubernetes device plugin implementation that enables the registration of AMD GPU in a container cluster for compute workload. - Click `here `_ for more information. - -2. *mpi-operator* - - The MPI Operator makes it easy to run allreduce-style distributed training on Kubernetes. - Click `here `_ for more information. - -3. *xilinx device plugin* - - The Xilinx FPGA device plugin for Kubernetes is a Daemonset deployed on the Kubernetes (k8s) cluster which allows you to: - - i. Discover the FPGAs inserted in each node of the cluster and expose information about FPGA such as number of FPGA, Shell (Target Platform) type and etc. - - ii. Run FPGA accessible containers in the k8s cluster - - Click `here `_ for more information. - -4. *nfs-client-provisioner* - - * NFS subdir external provisioner is an automatic provisioner that use your existing and already configured NFS server to support dynamic provisioning of Kubernetes Persistent Volumes via Persistent Volume Claims. - * The NFS server utilised here is the one mentioned in ``storage_config.yml``. - * Server IP is ```` and path is ``.`` of the entry where ``k8s_share`` is set to ``true``. - - Click `here `_ for more information. - -5. *nvidia-device-plugin* - - The NVIDIA device plugin for Kubernetes is a Daemonset that allows you to automatically: - - i. Expose the number of GPUs on each nodes of your cluster - ii. Keep track of the health of your GPUs - iii. Run GPU enabled containers in your Kubernetes cluster. - - Click `here `_ for more information. - -**Additional configurations for nvidia-device-plugin** - -After executing ``scheduler.yml`` or ``omnia.yml``, there are some manual steps which user needs to perform for the NVIDIA device plugin to detect GPU on the nodes. - - * First, install "nvidia-container-toolkit" from `this link `_. This must be installed on servers running NVIDIA GPUs. 
- * As per the `nvidia-container-toolkit installation guide `_, follow the below steps based on the OS running on your cluster. - - **Steps for RHEL/Rocky Linux** - - 1. Check the values of http_proxy and https_proxy environment variables from ``/opt/omnia/offline/local_repo_access.yml`` on the control plane. - 2. Establish a secure connection (SSH protocol) to node containing the NVIDIA GPU, and configure the http_proxy environment variables as shown below: - :: - - export http_proxy=http://:3128 - export https_proxy=http://:3128 - - 3. Execute the following command: - :: - - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ - sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo - - 4. Execute the following command: - :: - - sudo yum install -y nvidia-container-toolkit - - 5. Execute the following command: - :: - - sudo nvidia-ctk runtime configure --runtime=containerd - - 6. Execute the following command: - :: - - systemctl restart containerd - - 7. Execute the following command: - :: - - rm -rf /etc/yum.repos.d/nvidia-container-toolkit.repo - - - **Steps for Ubuntu** - - 1. Check http_proxy and https_proxy values from ``/opt/omnia/offline/local_repo_access.yml`` on ControlPlane. - 2. Establish a secure connection (SSH protocol) to node containing the NVIDIA GPU, and configure the http_proxy environment variables as shown below: - :: - export http_proxy=http://:3128 - export https_proxy=http://:3128 - - 3. Execute the following command: - :: - - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ - && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list - - 4. Execute the following command: - :: - - sudo apt-get update - - 5. Execute the following command: - :: - - sudo apt-get install -y nvidia-container-toolkit - - 6. Execute the following command: - :: - - sudo nvidia-ctk runtime configure --runtime=containerd - - 7. Execute the following command: - :: - - systemctl restart containerd - - 8. Execute the following command: - :: - - rm -rf /etc/apt/sources.list.d/nvidia-container-toolkit.list - - -**Optional installation** - -In addition to the above mentioned plugins, user can also install the *kubernetes device plugin for RoCE NIC*. For complete installation steps, `click here `_. \ No newline at end of file diff --git a/docs/source/InstallationGuides/BuildingClusters/schedulerinputparams.rst b/docs/source/InstallationGuides/BuildingClusters/schedulerinputparams.rst deleted file mode 100644 index f209e47c1..000000000 --- a/docs/source/InstallationGuides/BuildingClusters/schedulerinputparams.rst +++ /dev/null @@ -1,71 +0,0 @@ -Input parameters for the cluster -------------------------------------- - -These parameters are located in ``input/omnia_config.yml``, ``input/security_config.yml``, ``input/telemetry_config.yml`` and [optional] ``input/storage_config.yml``. - -.. caution:: Do not remove or comment any lines in the ``input/omnia_config.yml``, ``input/security_config.yml``, ``input/telemetry_config.yml`` and [optional] ``input/storage_config.yml`` file. - -**omnia_config.yml** - -.. csv-table:: Parameters for kubernetes setup - :file: ../../Tables/scheduler_k8s.csv - :header-rows: 1 - :keepspace: - -.. 
csv-table:: Parameters for slurm setup - :file: ../../Tables/scheduler_slurm.csv - :header-rows: 1 - :keepspace: - -**security_config.yml** - -.. csv-table:: Parameters for Authentication - :file: ../../Tables/security_config.csv - :header-rows: 1 - :keepspace: - -.. csv-table:: Parameters for OpenLDAP configuration - :file: ../../Tables/security_config_ldap.csv - :header-rows: 1 - :keepspace: - -.. csv-table:: Parameters for FreeIPA configuration - :file: ../../Tables/security_config_freeipa.csv - :header-rows: 1 - :keepspace: - - -**storage_config.yml** - -.. csv-table:: Parameters for Storage - :file: ../../Tables/storage_config.csv - :header-rows: 1 - :keepspace: - -**telemetry_config.yml** - -.. csv-table:: Parameters for Telemetry - :file: ../../Tables/telemetry_config.csv - :header-rows: 1 - :keepspace: - -.. [1] Boolean parameters do not need to be passed with double or single quotes. - - -Click here for more information on `OpenLDAP, FreeIPA `_, `Telemetry <../../Roles/Telemetry/index.html>`_, `BeeGFS `_ or, `NFS `_. - -.. note:: - - * The ``input/omnia_config.yml``, ``input/security_config.yml``, ``input/telemetry_config.yml`` files are encrypted during the execution of ``omnia.yml`` playbook. Use the below commands to edit the encrypted input files: - - * ``omnia_config.yml``: :: - - ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key - - * ``security_config.yml``: :: - - ansible-vault edit security_config.yml --vault-password-file .security_vault.key - - * ``telemetry_config.yml``: :: - - ansible-vault edit telemetry_config.yml --vault-password-file .telemetry_vault_key \ No newline at end of file diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/AdditionalNIC.rst b/docs/source/InstallationGuides/InstallingProvisionTool/AdditionalNIC.rst deleted file mode 100644 index 811de8674..000000000 --- a/docs/source/InstallationGuides/InstallingProvisionTool/AdditionalNIC.rst +++ /dev/null @@ -1,105 +0,0 @@ -Configuring additional NICs on the nodes -------------------------------------------- -After the ``discovery_provision.yml`` playbook has been executed and the nodes have booted up, additional NICs can be configured on the cluster nodes using the ``server_spec_update.yml`` playbook. - -**Prerequisites** - -* All target nodes are provisioned and booted. `Click here `_ to know how to verify the status of the nodes. - -* Ensure that ``input/network_spec.yml`` file has been updated with all network information in addition to ``admin_network`` and ``bmc_network`` information. Below are all applicable properties of an additional network: - - * ``nic_name``: The name of the NIC on which the administrative network is accessible to the control plane. - * ``netmask_bits``: The 32-bit "mask" used to divide an IP address into subnets and specify the network's available hosts. - * ``static_range``: The static range of IPs to be provisioned on target nodes. This indicates that only a certain static range is available to Omnia. - -* In addition to the above mentioned properties, the following properties are applicable for configuring additional NICs - - * ``CIDR``: Classless or Classless Inter-Domain Routing (CIDR) addresses use variable length subnet masking (VLSM) to alter the ratio between the network and host address bits in an IP address. - - .. note:: You can either use ``CIDR`` or ``static_range``. Simultaneous use of both parameters will result in an error message being displayed. 
- - * ``MTU``: Maximum transmission unit (MTU) is a measurement in bytes of the largest data packets that an Internet-connected device can accept. The default value of ``MTU`` is 1500. You can enter your desired value. - * ``VLAN``: A 12-bit field that identifies a virtual LAN (VLAN) and specifies the VLAN that an ethernet frame belongs to. This property is not supported on clusters running Ubuntu. - -.. note:: - - * If a ``static_range`` value is provided in ``input/network_spec.yml``, additional networks are not correlated. - * If a ``CIDR`` value is provided in ``input/network_spec.yml``, the complete subnet is used for Omnia to assign IPs and, where possible, the IPs will be correlated with the assignment on the admin network. Omnia performs correlation for additional networks if the subnet prefix for the admin network is a superset and the prefix of the additional network is a subset. For example, if the subnet prefix for the admin network is */16* and for the additional network it's */24*, Omnia attempts to correlate the IPs if the value for the ``correlation_to_admin`` field is set to true in ``input/network_spec.yml``. - * If a VLAN is required, ensure that a VLAN ID is provided in the ``vlan`` field in ``input/server_spec.yml`` and ensure that it's provided in the ``NIC.vlan_id`` format. For example, consider "eth1.101", where ``eth1`` is the NIC name configured with a VLAN and ``101`` is the ``vlan_id``. This field is not supported on admin or bmc networks. - * While new networks can be added to the ``network_spec.yml`` file on subsequent runs of the ``server_spec_update.yml`` playbook, existing networks cannot be edited or deleted. If the user modifies or removes existing networks from ``input/network_spec.yml``, the playbook execution might fail. In that case, the user needs to `reprovision the node <../reprovisioningthecluster.html>`_. - -Below is a sample of additional NIC information in an ``input/network_spec.yml`` file: :: - - - thor_network1: - netmask_bits: "24" - CIDR: "10.23.1.0" - network_gateway: "" - MTU: "1500" - VLAN: "" - - - thor_network2: - netmask_bits: "24" - static_range: "10.23.2.1-10.23.2.254" - network_gateway: "" - MTU: "1500" - VLAN: "1" - - -* The ``input/server_spec.yml`` file has been updated with all NIC information of the target nodes. - - * All NICs listed in the ``server_spec.yml`` file are grouped into categories (groups for servers). The string "Categories:" should not be edited out of the ``input/server_spec.yml`` file. - * The name of the NIC specified in the file (in this sample, ``ensp0``, ``ensp0.5``, and ``eno1``) is the unique identifier of NICs in the file. - * The property ``nictypes`` indicates what kind of NIC is in use (ethernet, infiniband, or vlan). If ``nictypes`` is set to ``vlan``, ensure to specify a primary NIC for the VLAN using the property ``nicdevices``. - * While new groups can be added to the ``server_spec.yml`` file on subsequent runs of the ``server_spec_update.yml`` playbook, existing groups cannot be edited or deleted. If the user modifies or removes existing groups from ``input/server_spec.yml``, the playbook execution might fail. In that case, the user needs to `reprovision the node <../reprovisioningthecluster.html>`_. - -.. note:: The ``nicnetwork`` property should match any of the networks specified in ``input/network_spec.yml``. 
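Before running ``server_spec_update.yml``, it can help to confirm that every ``nicnetwork`` value in ``input/server_spec.yml`` refers to a network that actually exists in ``input/network_spec.yml``. The snippet below is an illustrative sanity check, not an Omnia-provided step; the ``grep`` patterns assume the default indentation used in the samples in this document: ::

    # Network names defined in input/network_spec.yml
    grep -E '^[[:space:]]+- [A-Za-z0-9_]+:' input/network_spec.yml

    # Network names referenced by input/server_spec.yml
    grep nicnetwork input/server_spec.yml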
- -Below is a sample ``input/server_spec.yml`` file: :: - - --- - Categories: - - group-1: - - network: - - ensp0: - nicnetwork: "thor_network1" - nictypes: "ethernet" - - ensp0.5: - nicnetwork: "thor_network2" - nictypes: "vlan" - nicdevices: "ensp0" - - - group-2: - - network: - - eno1: - nicnetwork: "thor_network1" - nictypes: "ethernet" - -A sample inventory format is present in ``examples/server_spec_inv``. - -Use the below commands to assign IPs to the NICs: :: - - cd server_spec_update - ansible-playbook server_spec_update.yml -i inventory - -Where the inventory file passed includes user-defined groups, servers associated with them, and a mapping between the specified groups and the categories in ``input/server_spec.yml`` under ``[:vars]``. Below is a sample: :: - - [node-group1] - 10.5.0.3 - - [node-group1:vars] - Categories=group-1 - - [node-group2] - 10.5.0.4 - 10.5.0.5 - - [node-group2:vars] - Categories=group-2 - -.. note:: In Omnia v1.6, while executing ``server_spec_update.yml``, the user needs to ensure that only admin IP addresses are used in the inventory file, not service tags or node names. - -Based on the provided sample files, server 10.5.0.3 has been mapped to node-group1, which corresponds to group-1. Therefore, the NICs ensp0 and ensp0.5 will be configured in an ethernet VLAN group with ensp0 as the primary device. - - - - diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/index.rst b/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/index.rst deleted file mode 100644 index 7f061d48f..000000000 --- a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/index.rst +++ /dev/null @@ -1,76 +0,0 @@ -Discovery Mechanisms ----------------------- - -Depending on the values provided in ``input/provision_config.yml``, target nodes can be discovered in one of three ways: - -.. toctree:: - mappingfile - switch-based - bmc - - -**switch_based** - -Omnia can query known switches (by SNMPv3 username/password) for information on target node MAC IDs. - -**Pros** - -- The whole discovery process is fully automatic. - -- Admin IP, BMC IP and Infiniband IP address configuration is automatic on the target nodes. - -- Re-provisioning of servers will be automatic. - -- PXE booting servers is supported via split ports on the switch. - -**Cons** - -- Users need to enable IPMI on target servers. -- Servers require a manual PXE boot after the first run of the provision tool. - -For more information regarding switch-based discovery, `click here `_ - -**mapping** - -Manually collect PXE NIC information for target servers and define them to Omnia using a mapping file in the below format: - -**pxe_mapping_file.csv** - - -:: - - SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_IP - XXXXXXXX,n1,xx:yy:zz:aa:bb:cc,10.5.0.101,10.3.0.101 - XXXXXXXX,n2,aa:bb:cc:dd:ee:ff,10.5.0.102,10.3.0.102 - - -**Pros** - - - Easily customized if the user maintains a list of MAC addresses. - -**Cons** - - - The user needs to be aware of the MAC/IP mapping required in the network. - - Servers require a manual PXE boot if iDRAC IPs are not configured. - -For more information regarding mapping files, `click here `_ - -**bmc** - -Omnia can also discover nodes via their iDRAC using IPMI. - - -**Pros** - - - Discovery and provisioning of servers is automatic. - - Admin, BMC and Infiniband IP address configuration is automatic on the control plane. - - LOM architecture is supported (including cloud enclosures: C6420, C6520, C6620). 
-**Cons** - - - For iDRACs that are not DHCP enabled (i.e., static), users need to enable IPMI manually. - - -For more information regarding BMC, `click here `_ - - - diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/ViewingDB.rst b/docs/source/InstallationGuides/InstallingProvisionTool/ViewingDB.rst deleted file mode 100644 index df06e17f5..000000000 --- a/docs/source/InstallationGuides/InstallingProvisionTool/ViewingDB.rst +++ /dev/null @@ -1,52 +0,0 @@ -Checking node status ---------------------- -Via CLI -+++++++ - -Run ``nodels all nodelist.status`` for a list of nodes and their statuses. :: - - omnia-node00001: installing - omnia-node00002: booted - omnia-node00003: powering-on - omnia-node00004: booted - -Possible values of node status are powering-off, powering-on, bmcready, installing, booting, post-booting, booted, failed. - -.. caution:: - * Once xCAT is installed, restart your SSH session to the control plane to ensure that the newly set up environment variables come into effect. This will also allow the above command to work correctly. If the new environment variables still do not come into effect, load them manually using: :: - - source /etc/profile.d/xcat.sh - - - -Via omniadb -++++++++++++++++++ - -1. To access the DB, run: :: - - psql -U postgres - - \c omniadb - - -2. To view the schema being used in the cluster: ``\dn`` - -3. To view the tables in the database: ``\dt`` - -4. To view the contents of the ``nodeinfo`` table: ``select * from cluster.nodeinfo;`` :: - - id | service_tag | node | hostname | admin_mac | admin_ip | bmc_ip | status | discovery_mechanism | bmc_mode | switch_ip | switch_name | switch_port | cpu | gpu | cpu_count | gpu_count$ - ----+-------------+---------------+---------------+-------------------+--------------+------------+--------+---------------------+----------+-----------+-------------+-------------+-----+-----+-----------+---------- - 1 | | control_plane | newcp.new.dev | 00:0a:f7:dc:11:42 | 10.5.255.254 | 0.0.0.0 | | | | | | | | | | - 2 | xxxxxxx | node2 | node2.new.dev | c4:cb:e1:b5:70:44 | 10.5.0.12 | 10.30.0.12 | booted | mapping | | | | | amd | | 1 | 0 - 3 | xxxxxxx | node3 | node3.new.dev | f4:02:70:b8:bc:2a | 10.5.0.10 | 10.30.0.10 | booted | mapping | | | | | amd | amd | 2 | 1 - (3 rows) - - -Possible values of node status are powering-off, powering-on, bmcready, installing, booting, post-booting, booted, failed. - -.. note:: - * The ``gpu_count`` in the DB is only updated every time a cluster node is PXE booted. - * Nodes listed as "failed" can be diagnosed using the ``/var/log/xcat/xcat.log`` file on the target node. Correct any underlying issues and `re-provision the node <../reprovisioningthecluster.html>`_. - * Information on debugging nodes stuck at 'powering-on', 'bmcready' or 'installing' for longer than expected is available `here. <../../Troubleshooting/FAQ.html>`_ Correct any underlying issue on the node and `re-provision the node <../reprovisioningthecluster.html>`_. - * A blank node status indicates that no attempt to provision has taken place. Attempt a manual PXE boot on the node to initiate provisioning. 
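For a quick, read-only status check without entering the interactive ``psql`` prompt, a one-shot query along the following lines can also be used (a sketch that assumes the same default ``postgres`` access described above): ::

    psql -U postgres -d omniadb -c "select node, service_tag, status from cluster.nodeinfo;"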
diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/index.rst b/docs/source/InstallationGuides/InstallingProvisionTool/index.rst deleted file mode 100644 index f52b726e9..000000000 --- a/docs/source/InstallationGuides/InstallingProvisionTool/index.rst +++ /dev/null @@ -1,19 +0,0 @@ -Installing the provision tool -============================= - -The provision tool is installed using an Ansible playbook. This playbook achieves the following tasks: - - * Discovers potential cluster nodes. - - * Installs Rocky Linux, Ubuntu, or RHEL on the discovered nodes. - -.. toctree:: - - provisionprereqs - DiscoveryMechanisms/index - provisionparams - installprovisiontool - ViewingDB - AdditionalNIC - IPruleassignment - diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/provisionparams.rst b/docs/source/InstallationGuides/InstallingProvisionTool/provisionparams.rst deleted file mode 100644 index 998341b80..000000000 --- a/docs/source/InstallationGuides/InstallingProvisionTool/provisionparams.rst +++ /dev/null @@ -1,96 +0,0 @@ -Input parameters for the provision tool ------------------------------------------ - -Fill in all required parameters in ``input/provision_config.yml``, ``input/provision_config_credentials.yml``, ``input/software_config.json``, and ``input/network_spec.yml``. - -.. caution:: Do not remove or comment any lines in the above specified ``.yml`` files. - -.. csv-table:: provision_config.yml - :file: ../../Tables/Provision_config.csv - :header-rows: 1 - :keepspace: - -.. csv-table:: provision_config_credentials.yml - :file: ../../Tables/Provision_creds.csv - :header-rows: 1 - :keepspace: - -.. csv-table:: software_config.json - :file: ../../Tables/software_config.csv - :header-rows: 1 - :keepspace: - -.. [1] Boolean parameters do not need to be passed with double or single quotes. - - -Update the ``input/network_spec.yml`` file for all networks available for use by the control plane. - - * The following ``admin_network`` details are mandatory: - - * ``nic_name``: The name of the NIC on which the administrative network is accessible to the control plane. - * ``netmask_bits``: The 32-bit "mask" used to divide an IP address into subnets and specify the network's available hosts. - * ``static_range``: The static range of IPs to be provisioned on target nodes. - * ``dynamic_range``: The dynamic range of IPs to be provisioned on target nodes. - * ``correlation_to_admin``: Boolean value used to indicate whether all other networks specified in the file (for example: ``bmc_network``) should be correlated to the admin network. For example, if a target node is assigned the IP xx.yy.0.5 on the admin network, it will be assigned the IP aa.bb.0.5 on the BMC network. This value is irrelevant when discovering nodes using a mapping file. - * ``admin_uncorrelated_node_start_ip``: If ``correlation_to_admin`` is set to true but correlated IPs are not available on non-admin networks, provide an IP within the ``static_range`` of the admin network that can be used to assign admin static IPs to uncorrelated nodes. If this is empty, then the first IP in the ``static_range`` of the admin network is taken by default. This value is irrelevant when discovering nodes using a mapping file. - * ``MTU``: Maximum transmission unit (MTU) is a measurement in bytes of the largest data packets that an Internet-connected device can accept. - - * If the ``nic_name`` is identical on both the ``admin_network`` and the ``bmc_network``, it indicates a LOM setup. Otherwise, it's a dedicated setup. 
- * BMC network details are not required when target nodes are discovered using a mapping file. - * If ``bmc_network`` properties are provided, target nodes will be discovered using the BMC method in addition to the methods whose details are explicitly provided in ``provision_config.yml``. - * The following parameters are applicable for ``bmc_network``: - - * ``discover_ranges``: If some iDRACs are reachable from the control plane but are not within the ``bmc_network`` ranges, the user can provide those IP ranges here. Discovery of a single IP is not possible; the user must provide a range. This is an optional field, but it must not be removed from the file. - * ``reassignment_to_static``: If iDRACs are set to DHCP mode and Omnia has assigned the IPs, then the user can reassign the IP within the ``bmc_network`` static range by setting this value to ``true``. If this value is not provided or set to ``false`` while the iDRACs are in DHCP mode, they will obtain IPs from the ``bmc_network`` dynamic range, and these IPs will then be converted to static IPs for the iDRACs. - -.. caution:: - * Do not assign the subnet 10.4.0.0/24 to any interfaces in the network as nerdctl uses it by default. - * Omnia v1.6 does not support the configuration of a DNS server on the control plane. - * All provided network ranges and NIC IP addresses should be distinct with no overlap in the ``input/network_spec.yml``. - * Ensure that all the iDRACs are reachable from the control plane. - * If ``bmc_network`` details are provided, target nodes will be discovered using the BMC method for all network ranges. - -A sample is provided below: :: - - --- - Networks: - - admin_network: - nic_name: "eno1" - netmask_bits: "16" - static_range: "10.5.0.1-10.5.0.200" - dynamic_range: "10.5.1.1-10.5.1.200" - correlation_to_admin: true - admin_uncorrelated_node_start_ip: "10.5.0.50" - network_gateway: "" - MTU: "1500" - - - bmc_network: - nic_name: "" - netmask_bits: "" - static_range: "" - dynamic_range: "" - reassignment_to_static: true - discover_ranges: "" - network_gateway: "" - MTU: "1500" - - - - - -.. note:: - - * The ``input/provision_config_credentials.yml`` file is encrypted on the first execution of the ``discovery_provision.yml`` or ``local_repo.yml`` playbooks. - - * To view the encrypted parameters: :: - - ansible-vault view provision_config_credentials.yml --vault-password-file .provision_credential_vault_key - - * To edit the encrypted parameters: :: - - ansible-vault edit provision_config_credentials.yml --vault-password-file .provision_credential_vault_key - - * The strings ``admin_network`` and ``bmc_network`` in the ``input/network_spec.yml`` file should not be edited. Also, the properties ``nic_name``, ``static_range``, and ``dynamic_range`` cannot be edited on subsequent runs of the provision tool. - * Netmask bits are mandatory and should be the same for both the ``admin_network`` and ``bmc_network`` (that is, between 1 and 32; 1 and 32 are acceptable values). - * The ``discover_ranges`` property of the ``bmc_network`` can accept multiple comma-separated ranges. 
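For illustration, a ``discover_ranges`` entry containing multiple comma-separated ranges might look like the following (the IP ranges shown are hypothetical): ::

    discover_ranges: "10.30.0.10-10.30.0.50,10.31.0.10-10.31.0.50"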
- diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/provisionprereqs.rst b/docs/source/InstallationGuides/InstallingProvisionTool/provisionprereqs.rst deleted file mode 100644 index 45b32ee22..000000000 --- a/docs/source/InstallationGuides/InstallingProvisionTool/provisionprereqs.rst +++ /dev/null @@ -1,99 +0,0 @@ -Before you run the provision tool --------------------------------- - -* (Recommended) Run ``prereq.sh`` to get the system ready to deploy Omnia. Alternatively, ensure that `Ansible 2.14 `_ and `Python 3.9 `_ are installed on the system. - -* All target bare-metal servers should be reachable from the chosen control plane. - -* The UEFI boot setting should be configured in the BIOS settings before initiating PXE boot on the nodes. - -* Set the IP address of the control plane. The control plane NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid set up. In the case of a dedicated network topology, a single IP (admin IP) is required. - -.. figure:: ../../images/ControlPlaneNic.png - - *Control plane NIC IP configuration in a LOM setup* - -.. figure:: ../../images/ControlPlane_DedicatedNIC.png - - *Control plane NIC IP configuration in a dedicated setup* - - -* Set the hostname of the control plane in the ``hostname.domain_name`` format. - - .. include:: ../../Appendices/hostnamereqs.rst - - For example, ``controlplane.omnia.test`` is acceptable. :: - - hostnamectl set-hostname controlplane.omnia.test - -.. note:: The domain name specified for the control plane should be the same as the one specified under ``domain_name`` in ``input/provision_config.yml``. - -* To provision the bare metal servers, download one of the following ISOs to the control plane: - - 1. `Rocky Linux 8 `_ - - 2. `RHEL 8.x `_ - - 3. `Ubuntu `_ - -.. note:: Ensure that the provided ISO has downloaded without corruption. Verify the SHA checksum/download size of the ISO file before provisioning to avoid future failures. - -Note the compatibility between cluster OS and control plane OS below: - - +---------------------+--------------------+------------------+ - | | | | - | Control Plane OS | Cluster Node OS | Compatibility | - +=====================+====================+==================+ - | | | | - | RHEL [1]_ | RHEL | Yes | - +---------------------+--------------------+------------------+ - | | | | - | RHEL [1]_ | Rocky | Yes | - +---------------------+--------------------+------------------+ - | | | | - | Rocky | Rocky | Yes | - +---------------------+--------------------+------------------+ - | | | | - | Ubuntu | Ubuntu | Yes | - +---------------------+--------------------+------------------+ - | | | | - | Rocky | Ubuntu | No | - +---------------------+--------------------+------------------+ - | | | | - | RHEL | Ubuntu | No | - +---------------------+--------------------+------------------+ - | | | | - | Ubuntu | RHEL | No | - +---------------------+--------------------+------------------+ - | | | | - | Ubuntu | Rocky | No | - +---------------------+--------------------+------------------+ - -.. [1] Ensure that control planes running RHEL have an active subscription or are configured to access local repositories. The following repositories should be enabled on the control plane: **AppStream**, **BaseOS**. - -* Ensure that all connection names under the network manager match their corresponding device names. 
- To verify network connection names: :: - - nmcli connection - - To verify the device name: :: - - ip link show - -In the event of a mismatch, edit the file ``/etc/sysconfig/network-scripts/ifcfg-`` using the vi editor. - -* When discovering nodes via a mapping file, all target nodes should be set up in PXE mode before running the playbook. - -.. note:: - - * After configuration and installation of the cluster, changing the control plane is not supported. If you need to change the control plane, you must redeploy the entire cluster. - - * For servers with an existing OS being discovered via BMC, ensure that the first PXE device on target nodes is the designated active NIC for PXE booting. - - - - - - - - diff --git a/docs/source/InstallationGuides/LocalRepo/index.rst b/docs/source/InstallationGuides/LocalRepo/index.rst deleted file mode 100644 index 040102148..000000000 --- a/docs/source/InstallationGuides/LocalRepo/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -Local repositories for the cluster ===================================== - -The local repository feature will help create offline repositories on the control plane, which all the cluster nodes will access. ``local_repo/local_repo.yml`` runs with inputs from ``input/software_config.json`` and ``input/local_repo_config.yml``: - -.. caution:: Minimal OS version of RHEL and Rocky Linux and "desktop image" version of Ubuntu are not supported on the control plane. - -.. toctree:: - Prerequisite - InputParameters - localrepos - RunningLocalRepo - CustomLocalRepo - - - diff --git a/docs/source/InstallationGuides/PostProvisionScript.rst b/docs/source/InstallationGuides/PostProvisionScript.rst deleted file mode 100644 index 68b9572cb..000000000 --- a/docs/source/InstallationGuides/PostProvisionScript.rst +++ /dev/null @@ -1,47 +0,0 @@ -Creating node inventory -======================== - -When ``discovery_provision.yml``, ``prepare_cp.yml``, or ``utils/inventory_tagging.yml`` is run, a set of inventory files is created in ``/opt/omnia/omnia_inventory/`` based on `the Omnia database. `_ The inventories are created based on the type of CPUs and GPUs the nodes have. The inventory files are: - - * ``compute_cpu_amd`` :: - - [compute_cpu_amd] - ABCD1 - - - - * ``compute_cpu_intel`` :: - - [compute_cpu_intel] - ABCD1 - - * ``compute_gpu_amd`` :: - - [compute_gpu_amd] - ABCD2 - ABCD3 - - * ``compute_gpu_nvidia`` :: - - [compute_gpu_nvidia] - ABCD1 - - - * ``compute_servicetag_ip`` :: - - [compute_servicetag_ip] - ABCD1 ansible_host=10.5.0.2 - ABCD2 ansible_host=10.5.0.3 - ABCD3 ansible_host=10.5.0.4 - - .. note:: - - * Service tags will only be written into the inventory files after the nodes are successfully PXE booted post provisioning. - * For a node's service tag to be listed in an inventory file, two conditions must be met: - - * Node status must be "booted" in the DB. - * The node's service tag information is present in the DB. - * To regenerate all the inventory files, use the playbook ``utils/inventory_tagging.yml``. - - - diff --git a/docs/source/InstallationGuides/RunningInit/index.rst b/docs/source/InstallationGuides/RunningInit/index.rst deleted file mode 100644 index dfae86ae2..000000000 --- a/docs/source/InstallationGuides/RunningInit/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -Running prereq.sh ================= - -.. caution:: Minimal OS version of RHEL and Rocky Linux and "desktop image" version of Ubuntu are not supported on the control plane. 
- -``prereq.sh`` is used to install the software utilized by Omnia on the control plane, including Python (3.9) and Ansible (2.14). :: - - cd omnia - ./prereq.sh - -.. note:: - * If SELinux is not disabled, it will be disabled by the script and the user will be prompted to reboot the control plane. - * The file ``input/software_config.json`` is overwritten with the default value (based on the operating system) when ``prereq.sh`` is executed. - - - - - diff --git a/docs/source/InstallationGuides/deletenode.rst b/docs/source/InstallationGuides/deletenode.rst deleted file mode 100644 index 44076e663..000000000 --- a/docs/source/InstallationGuides/deletenode.rst +++ /dev/null @@ -1,188 +0,0 @@ -Remove Slurm/K8s configuration from a node ------------------------------------------- - -Use this playbook to remove Slurm and Kubernetes configuration from Slurm or Kubernetes worker nodes of the cluster and stop all clustering software on the worker nodes. - -.. note:: - * All target nodes should be drained before executing the playbook. If a job is running on any target nodes, the playbook may time out waiting for the node state to change. - * When running ``remove_node_configuration.yml``, ensure that the ``input/storage_config.yml`` and ``input/omnia_config.yml`` have not been edited since ``omnia.yml`` was run. - - -**Configurations performed by the playbook** - - * Nodes specified in the slurm_node group or kube_node group in the inventory file will be removed from the Slurm and Kubernetes cluster, respectively. - * Slurm and Kubernetes services are stopped and uninstalled. The OS startup service list will be updated to disable Slurm and Kubernetes. - -**To run the playbook** - -Run the playbook using the following commands: :: - - cd utils - ansible-playbook remove_node_configuration.yml -i inventory - -* To specify only Slurm or Kubernetes nodes while running the playbook, use the tags ``slurm_node`` or ``kube_node``. That is: -* To remove only Slurm nodes, use ``ansible-playbook remove_node_configuration.yml -i inventory --tags slurm_node``. -* To remove only Kubernetes nodes, use ``ansible-playbook remove_node_configuration.yml -i inventory --tags kube_node``. -* To skip confirmation while running the playbook, use ``ansible-playbook remove_node_configuration.yml -i inventory --extra-vars skip_confirmation=yes`` or ``ansible-playbook remove_node_configuration.yml -i inventory -e skip_confirmation=yes``. - -The inventory file passed for ``remove_node_configuration.yml`` should follow the below format. :: - - #Batch Scheduler: Slurm - - [slurm_control_node] - - 10.5.1.101 - - [slurm_node] - - 10.5.1.103 - - 10.5.1.104 - - [login] - - 10.5.1.105 - - - - #General Cluster Storage - - [auth_server] - - 10.5.1.106 - - #AI Scheduler: Kubernetes - - [kube_control_plane] - - 10.5.1.101 - - [etcd] - - 10.5.1.101 - - [kube_node] - - 10.5.1.102 - - 10.5.1.103 - - 10.5.1.104 - - 10.5.1.105 - - 10.5.1.106 - - -Soft reset the cluster ----------------------- -Use this playbook to stop all Slurm and Kubernetes services. This action will destroy the cluster. - -.. note:: - * All target nodes should be drained before executing the playbook. If a job is running on any target nodes, the playbook may time out waiting for the node state to change. - * When running ``reset_cluster_configuration.yml``, ensure that the ``input/storage_config.yml`` and ``input/omnia_config.yml`` have not been edited since ``omnia.yml`` was run. 
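For reference, target nodes can typically be drained with the standard scheduler tooling before the playbook is run. The commands below are a generic sketch (the node name is hypothetical), not an Omnia-provided step: ::

    # Slurm: stop new jobs from being scheduled on the node
    scontrol update nodename=node001 state=drain reason="cluster reset"

    # Kubernetes: evict running workloads from the node
    kubectl drain node001 --ignore-daemonsets --delete-emptydir-data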
- -**Configurations performed by the playbook** - - * The configuration on the kube_control_plane or the slurm_control_node will be reset. - * Slurm and Kubernetes services are stopped and removed. - -**To run the playbook** - -Run the playbook using the following commands: :: - - cd utils - ansible-playbook reset_cluster_configuration.yml -i inventory - -To specify only Slurm or Kubernetes clusters while running the playbook, use the tags ``slurm_cluster`` or ``k8s_cluster``. That is: - -To reset a Slurm cluster, use ``ansible-playbook reset_cluster_configuration.yml -i inventory --tags slurm_cluster``. -To reset a Kubernetes cluster, use ``ansible-playbook reset_cluster_configuration.yml -i inventory --tags k8s_cluster``. - -.. warning:: If you do not specify the tags ``slurm_cluster`` or ``k8s_cluster``, the ``reset_cluster_configuration.yml`` will reset the configuration for both Slurm and Kubernetes clusters. - -To skip confirmation while running the playbook, use ``ansible-playbook reset_cluster_configuration.yml -i inventory --extra-vars skip_confirmation=yes`` or ``ansible-playbook reset_cluster_configuration.yml -i inventory -e skip_confirmation=yes``. - -The inventory file passed for ``reset_cluster_configuration.yml`` should follow the below format. :: - - #Batch Scheduler: Slurm - - [slurm_control_node] - - 10.5.1.101 - - [slurm_node] - - 10.5.1.103 - - 10.5.1.104 - - [login] - - 10.5.1.105 - - - - #General Cluster Storage - - [auth_server] - - 10.5.1.106 - - #AI Scheduler: Kubernetes - - [kube_control_plane] - - 10.5.1.101 - - [etcd] - - 10.5.1.101 - - [kube_node] - - 10.5.1.102 - - 10.5.1.103 - - 10.5.1.104 - - 10.5.1.105 - - 10.5.1.106 - - -Delete provisioned node ------------------------- - -Use this playbook to remove discovered or provisioned nodes from all inventory files and Omnia database tables. No changes are made to the Slurm or Kubernetes cluster. - - -**Configurations performed by the playbook** - - * Nodes will be deleted from the Omnia DB and the xCAT node object will be deleted. - * Telemetry services will be stopped and removed. - -**To run the playbook** - -Run the playbook using the following commands: :: - - cd utils - ansible-playbook delete_node.yml -i inventory - -To skip confirmation while running the playbook, use ``ansible-playbook delete_node.yml -i inventory --extra-vars skip_confirmation=yes`` or ``ansible-playbook delete_node.yml -i inventory -e skip_confirmation=yes``. - -The inventory file passed for ``delete_node.yml`` should follow the below format. :: - - [nodes] - 10.5.0.33 - -.. note:: - * When the node is added or deleted, the autogenerated inventories: ``amd_gpu``, ``nvidia_gpu``, ``amd_cpu``, and ``intel_cpu`` should be updated with the latest changes. - * Nodes passed in the inventory file will be removed from the cluster. To reprovision the node, use the `add node script. `_ - - - - - - diff --git a/docs/source/InstallationGuides/index.rst b/docs/source/InstallationGuides/index.rst deleted file mode 100644 index f2c938e3e..000000000 --- a/docs/source/InstallationGuides/index.rst +++ /dev/null @@ -1,57 +0,0 @@ -Quick Installation Guide -======================== - -* Choose a server outside your intended cluster to function as your control plane. 
- -* Ensure that the control plane server meets the below mentioned space requirements: - - * For all available software packages that Omnia supports: 50GB - * For the complete set of software images (in ``/var``): 500GB - * For storing offline repositories (the file path should be specified in ``repo_store_path`` in ``input/local_repo_config.yml``): 50GB - -* The control plane needs to be internet-capable with Git installed. Additionally, the control plane must have a full-featured operating system installed. - -.. note:: Omnia can be run on control planes running RHEL, Rocky Linux, and Ubuntu. For a complete list of versions supported, check out the `Support Matrix <../Overview/SupportMatrix/OperatingSystems/index.html>`_. - -To install Git on RHEL and Rocky Linux installations, use the following command: :: - - dnf install git -y - -To install Git on Ubuntu installations, use the following command: :: - - apt install git -y - -.. note:: Optionally, if the control plane has an Infiniband NIC installed on RHEL or Rocky Linux, run the below command: - :: - yum groupinstall "Infiniband Support" -y - -* Clone the Omnia repository from GitHub on to the control plane, using the following command: :: - - git clone https://github.com/dell/omnia.git - -* Once the cloning process is complete, change directory to Omnia and run the ``prereq.sh`` script to verify that the system is ready for Omnia deployment, using the following command: :: - - cd omnia - ./prereq.sh - -.. note:: The permissions on the Omnia directory are set to **0755** by default. Do not change these values. - -.. toctree:: - RunningInit/index - LocalRepo/index - InstallingProvisionTool/index - PostProvisionScript - BuildingClusters/index - Platform/index - addinganewnode - reprovisioningthecluster - ConfiguringSwitches/index - ConfiguringStorage/index - Benchmarks/index - pullimagestonodes - deletenode - CleanUpScript - - - - diff --git a/docs/source/Logging/ControlPlaneLogs.rst b/docs/source/Logging/ControlPlaneLogs.rst index f15dcb2e5..9f7b3fcc4 100644 --- a/docs/source/Logging/ControlPlaneLogs.rst +++ b/docs/source/Logging/ControlPlaneLogs.rst @@ -1,5 +1,5 @@ -Control plane logs -------------------- +OIM logs +---------- .. caution:: It is not recommended to delete the below log files or the directories they reside in. @@ -7,15 +7,6 @@ Control plane logs * Log files are rotated periodically as a storage consideration. To customize how often logs are rotated, edit the ``/etc/logrotate.conf`` file on the node. * If you want log files for specific playbook execution, ensure to use the ``cd`` command to move into the specific directory before executing the playbook. For example, if you want local repo logs, ensure to enter ``cd local_repo`` before executing the playbook. If the directory is not changed, all the playbook execution log files will be consolidated and provided as part of omnia logs located in ``/var/log/omnia.log``. -CLI logs ----------- -All log files can be viewed using CLI. However, there are few log files which can be viewed exclusively using CLI. They are: - -.. csv-table:: Exclusive CLI log files - :file: ../Tables/CLI_exclusive_logs.csv - :header-rows: 1 - :keepspace: - Loki logs ---------- @@ -40,7 +31,7 @@ Logs of individual containers Provisioning logs -------------------- -Logs pertaining to actions taken during ``discovery_provision.yml`` can be viewed in ``/var/log/xcat/cluster.log`` and ``/var/log/xcat/computes.log`` on the control plane. 
+Logs pertaining to actions taken during ``discovery_provision.yml`` can be viewed in ``/var/log/xcat/cluster.log`` and ``/var/log/xcat/computes.log`` on the OIM. .. note:: As long as a node has been added to a cluster by Omnia, deployment events taking place on the node will be updated in ``/var/log/xcat/cluster.log``. @@ -56,7 +47,7 @@ Logs pertaining to actions taken by Omnia or iDRAC telemetry can be viewed in `` Grafana Loki -------------- -After `telemetry.yml <../Roles/Telemetry/index.html>`_ is run, Grafana services are installed on the control plane. +After `telemetry.yml <../Telemetry/index.html>`_ is run, Grafana services are installed on the OIM. i. Get the Grafana IP using ``kubectl get svc -n grafana``. diff --git a/docs/source/Logging/LogManagement.rst b/docs/source/Logging/LogManagement.rst index ed6081e00..48b869e8b 100644 --- a/docs/source/Logging/LogManagement.rst +++ b/docs/source/Logging/LogManagement.rst @@ -25,4 +25,4 @@ With the above settings: * Data up to 4 weeks old is backed up. Any log backup older than four weeks will be deleted. -.. caution:: Since these logs take up ``/var`` space, sufficient space must be allocated to ``/var`` partition if it's created. If ``/var`` partition space fills up, control plane might crash. +.. caution:: Since these logs take up ``/var`` space, sufficient space must be allocated to the ``/var`` partition if it's created. If the ``/var`` partition fills up, the OIM might crash. diff --git a/docs/source/InstallationGuides/addinganewnode.rst b/docs/source/OmniaInstallGuide/Maintenance/addnode.rst similarity index 62% rename from docs/source/InstallationGuides/addinganewnode.rst rename to docs/source/OmniaInstallGuide/Maintenance/addnode.rst index 59c9dc416..4bc2536be 100644 --- a/docs/source/InstallationGuides/addinganewnode.rst +++ b/docs/source/OmniaInstallGuide/Maintenance/addnode.rst @@ -1,31 +1,34 @@ Adding new nodes -++++++++++++++++++ +================== **Provisioning the new node** -A new node can be added using the following ways: +A new node can be provisioned in one of the following ways, based on the `discovery mechanism <../Ubuntu/Provision/DiscoveryMechanisms/index.html>`_ used: -* If ``pxe_mapping_file_path`` is provided in ``input/provision_config.yml```: +1. Using a **mapping file**: * Update the existing mapping file by appending the new entry (without disrupting the older entries) or provide a new mapping file by pointing ``pxe_mapping_file_path`` in ``provision_config.yml`` to the new location. - .. note:: When re-running ``discovery_provision.yml`` with a new mapping file, ensure that existing IPs from the current mapping file are not provided in the new mapping file. Any IP overlap between mapping files will result in PXE failure. This can only be resolved by running the `Clean Up script `_ followed by ``discovery_provision.yml``. + .. note:: Any IP overlap between the mapping files will result in PXE boot failure. This can be resolved by running the `Delete Node script `_ or the `Clean Up script `_. Re-run ``discovery_provision.yml`` once the node has been deleted. - * Run ``discovery_provision.yml``.:: ansible-playbook discovery_provision.yml + * Run ``discovery_provision.yml`` :: ansible-playbook discovery_provision.yml - * Manually PXE boot the target servers after the ``discovery_provision.yml`` playbook (if ``bmc_ip`` is not provided in the mapping file) is executed and the target node lists as **booted** in the `nodeinfo table `_. 
+ * If ``bmc_ip`` is not provided in the mapping file, manually PXE boot the target servers after the ``discovery_provision.yml`` playbook is executed and the target node is listed as **booted** in the `nodeinfo table <../Ubuntu/Provision/ViewingDB.html>`_. -* When target nodes were discovered using BMC: - * Run ``discovery_provision.yml`` once the node has joined the cluster using an IP that exists within the provided range. :: +2. Using the **BMC** method: + * Update ``discover_ranges`` under ``bmc_network`` in ``input/network_spec.yml`` with the desired range of IPs to be discovered. For more information, `click here <../Ubuntu/Provision/provisionparams.html#id6>`_. + * Run ``discovery_provision.yml`` :: ansible-playbook discovery_provision.yml -* When target nodes were discovered using ``switch_based_details`` in ``input/provision_config.yml``: + + +3. Using the **switch-based** method: * Edit or append JSON list stored in ``switch_based_details`` in ``input/provision_config.yml``. @@ -34,15 +37,15 @@ A new node can be added using the following ways: * Ports configured via Omnia should not be removed from ``switch_based_details`` in ``input/provision_config.yml``. - * Run ``discovery_provision.yml``. :: ansible-playbook discovery_provision.yml + * Run ``discovery_provision.yml`` :: ansible-playbook discovery_provision.yml - * Manually PXE boot the target servers after the ``discovery_provision.yml`` playbook is executed and the target node lists as **booted** in the `nodeinfo table `_. + * Manually PXE boot the target servers after the ``discovery_provision.yml`` playbook is executed and the target node is listed as **booted** in the `nodeinfo table <../Ubuntu/Provision/ViewingDB.html>`_. -Verify that the node has been provisioned successfully by checking the Omnia `nodeinfo table `_. +Verify that the node has been provisioned successfully by checking the Omnia `nodeinfo table <../Ubuntu/Provision/ViewingDB.html>`_. **Adding new compute nodes to the cluster** @@ -66,7 +69,6 @@ Verify that the node has been provisioned successfully by checking the Omnia `no 10.5.0.110 - *Updated kubernetes inventory with the new node information* :: @@ -129,10 +131,10 @@ In the above examples, nodes 10.5.0.105 and 10.5.0.106 have been added to the cl .. note:: * The ``[etcd]`` group only supports an odd number of servers in the group. Adding nodes to ``[etcd]`` groups is not supported in re-run scenarios. - * Do not change the kube_control_plane/slurm_control_node/auth_server in the existing inventory. Simply add the new node information in the kube_node/slurm_node group. + * Do not change the ``kube_control_plane``, ``slurm_control_node`` and/or ``auth_server`` in the existing inventory file. Simply add the new node information in the ``kube_node`` and/or ``slurm_node`` group. * When re-running ``omnia.yml`` to add a new node, ensure that the ``input/security_config.yml`` and ``input/omnia_config.yml`` are not edited between runs. -3. To install `security `_, `job scheduler `_ and storage tools (`NFS `_, `BeeGFS `_) on the node, run ``omnia.yml``: :: +2. 
Once the new node IPs have been provided in the inventory, you can install security tools (OpenLDAP, FreeIPA), job schedulers (Kubernetes, Slurm), and storage tools (NFS, BeeGFS) on the nodes by executing ``omnia.yml`` with the updated inventory file: :: ansible-playbook omnia.yml -i inventory diff --git a/docs/source/InstallationGuides/CleanUpScript.rst b/docs/source/OmniaInstallGuide/Maintenance/cleanup.rst similarity index 59% rename from docs/source/InstallationGuides/CleanUpScript.rst rename to docs/source/OmniaInstallGuide/Maintenance/cleanup.rst index 168b94886..8d21f66d4 100644 --- a/docs/source/InstallationGuides/CleanUpScript.rst +++ b/docs/source/OmniaInstallGuide/Maintenance/cleanup.rst @@ -1,27 +1,27 @@ -Uninstalling the provision tool -------------------------------- +Uninstalling the OIM tools ------------------------------ -Use this script to undo all the changes made by the provision tool. For a list of actions taken by the provision tool, `click here `_. +Run this script to roll back all modifications made to the OIM, such as configured local repositories, provisioning tools, and telemetry configurations. To run the script: :: cd utils - ansible-playbook control_plane_cleanup.yml + ansible-playbook oim_cleanup.yml To skip the deletion of the configured local repositories (stored in ``repo_store_path`` and xCAT repositories), run: :: - ansible-playbook control_plane_cleanup.yml –-skip-tags downloads + ansible-playbook oim_cleanup.yml --skip-tags downloads To delete the changes made by ``local_repo.yml`` while retaining the ``repo_store_path`` folder, run: :: - ansible-playbook control_plane_cleanup.yml -–tags local_repo --skip-tags downloads + ansible-playbook oim_cleanup.yml --tags local_repo --skip-tags downloads To delete the changes made by ``local_repo.yml`` including the ``repo_store_path`` folder, run: :: - ansible-playbook control_plane_cleanup.yml –-tags local_repo + ansible-playbook oim_cleanup.yml --tags local_repo -.. note:: After you run the ``control_plane_cleanup.yml`` playbook, ensure to reboot the control plane node. +.. note:: After you run the ``oim_cleanup.yml`` playbook, ensure to reboot the OIM node. .. caution:: * When re-provisioning your cluster (that is, re-running the ``discovery_provision.yml`` playbook) after a clean-up, ensure to use a different ``admin_nic_subnet`` in ``input/provision_config.yml`` to avoid a conflict with newly assigned servers. Alternatively, disable any OS available in the ``Boot Option Enable/Disable`` section of your BIOS settings (``BIOS Settings`` > ``Boot Settings`` > ``UEFI Boot Settings``) on all target nodes. diff --git a/docs/source/OmniaInstallGuide/Maintenance/delete_config.rst b/docs/source/OmniaInstallGuide/Maintenance/delete_config.rst new file mode 100644 index 000000000..fbeb04c55 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Maintenance/delete_config.rst @@ -0,0 +1,78 @@ +Remove Slurm/Kubernetes configuration from a compute node +================================================================ + +Use this playbook to remove the Slurm and/or Kubernetes configuration and stop all clustering software on the compute nodes of the cluster. This will help clean up the cluster and ensure that all clustering components are properly deactivated and removed from the compute nodes. + +.. note:: + * All target nodes should be drained before executing the playbook. If a job is running on any target nodes, the playbook may time out waiting for the node state to change. 
+ * When running ``remove_node_configuration.yml``, ensure that the ``input/storage_config.yml`` and ``input/omnia_config.yml`` have not been edited since ``omnia.yml`` was run. + +.. caution:: While attempting to remove a slurm_node configured on a cluster, the ``slurmctld`` services might fail on the ``slurm_control_node``. This happens only when there is a single ``slurm_node`` present in the cluster. + +**Configurations performed by the playbook** + + * Nodes specified in the ``slurm_node`` or ``kube_node`` group in the inventory file will be removed from the Slurm or Kubernetes cluster respectively. + * Slurm and Kubernetes services are stopped and uninstalled. OS startup service list will be updated to disable Slurm and Kubernetes. + +**To run the playbook** + +* Insert the IP of the compute node(s) to be removed, in the existing inventory file as shown below: + +*Existing Kubernetes inventory* +:: + [kube_control_plane] + 10.5.0.101 + + [kube_node] + 10.5.0.102 + 10.5.0.103 + 10.5.0.105 + 10.5.0.106 + + [auth_server] + 10.5.0.101 + + [etcd] + 10.5.0.110 + +*New inventory for removing Kube nodes from the cluster* +:: + + [kube_node] + 10.5.0.102 + 10.5.0.103 + +*Existing Slurm inventory* +:: + [slurm_control_node] + 10.5.0.101 + + [slurm_node] + 10.5.0.102 + 10.5.0.103 + 10.5.0.105 + 10.5.0.106 + + [login] + 10.5.0.104 + + [auth_server] + 10.5.0.101 + +*New inventory for removing Slurm nodes from the cluster* +:: + [slurm_node] + 10.5.0.102 + 10.5.0.103 + +* To run the playbook, run the following commands: :: + + cd utils + ansible-playbook remove_node_configuration.yml -i inventory + +* To specify only Slurm or Kubernetes nodes while running the playbook, use the tags ``slurm_node`` or ``kube_node``. That is: + + * To remove only Slurm nodes, use ``ansible-playbook remove_node_configuration.yml -i inventory --tags slurm_node``. + * To remove only Kubernetes nodes, use ``ansible-playbook remove_node_configuration.yml -i inventory --tags kube_node``. + +* To skip confirmation while running the playbook, use ``ansible-playbook remove_node_configuration.yml -i inventory --extra-vars skip_confirmation=yes`` or ``ansible-playbook remove_node_configuration.yml -i inventory -e skip_confirmation=yes``. \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Maintenance/deletenode.rst b/docs/source/OmniaInstallGuide/Maintenance/deletenode.rst new file mode 100644 index 000000000..12e06b788 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Maintenance/deletenode.rst @@ -0,0 +1,27 @@ +Delete provisioned node +======================== + +Use this playbook to remove discovered or provisioned nodes from all inventory files and Omnia database tables. No changes are made to the Slurm or Kubernetes cluster. + +**Configurations performed by the playbook** + + * Nodes will be deleted from the Omnia DB and the xCAT node object will be deleted. + * Telemetry services will be stopped and removed. + +**To run the playbook** + +Run the playbook using the following commands: :: + + cd utils + ansible-playbook delete_node.yml -i inventory + +To skip confirmation while running the playbook, use ``ansible-playbook delete_node.yml -i inventory --extra-vars skip_confirmation=yes`` or ``ansible-playbook delete_node.yml -i inventory -e skip_confirmation=yes``. + +The inventory file passed for ``delete_node.yml`` should follow the below format. :: + + [nodes] + 10.5.0.33 + +.. 
note:: + * When the node is added or deleted, the autogenerated inventories: ``amd_gpu``, ``nvidia_gpu``, ``amd_cpu``, and ``intel_cpu`` should be updated with the latest changes. + * Nodes passed in the inventory file will be removed from the cluster. To re-provision the node, use the `add node script. `_ \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Maintenance/index.rst b/docs/source/OmniaInstallGuide/Maintenance/index.rst new file mode 100644 index 000000000..8fa59c223 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Maintenance/index.rst @@ -0,0 +1,12 @@ +Maintenance +============= + +.. toctree:: + :maxdepth: 2 + + addnode + delete_config + reset + deletenode + cleanup + reprovision \ No newline at end of file diff --git a/docs/source/InstallationGuides/reprovisioningthecluster.rst b/docs/source/OmniaInstallGuide/Maintenance/reprovision.rst similarity index 64% rename from docs/source/InstallationGuides/reprovisioningthecluster.rst rename to docs/source/OmniaInstallGuide/Maintenance/reprovision.rst index e9ba62c1f..16fb85a15 100644 --- a/docs/source/InstallationGuides/reprovisioningthecluster.rst +++ b/docs/source/OmniaInstallGuide/Maintenance/reprovision.rst @@ -1,19 +1,20 @@ Re-provisioning the cluster -++++++++++++++++++++++++++++ - -**Pre-requisites** - - * Run the `delete node playbook `_ for every target node. +============================= In the event that an existing Omnia cluster needs a different OS version or a fresh installation, the cluster can be re-provisioned. -If a re-deployment with no modifications are required :: +**Prerequisite** + +* Run the `delete node playbook `_ for every target node. - ansible-playbook discovery_provision.yml +.. note:: If a re-deployment with no modifications is required, execute the following commands: + :: + cd omnia + ansible-playbook discovery_provision.yml **Setting up the cluster** -1. Insert the new IPs in the existing inventory file following the below example. +* Insert the new IPs in the existing inventory file as shown below: *Existing kubernetes inventory* @@ -90,15 +91,13 @@ If a re-deployment with no modifications are required :: [auth_server] 10.5.0.101 - - In the above examples, nodes 10.5.0.105 and 10.5.0.106 have been added to the cluster as compute nodes. .. note:: - * Do not change the kube_control_plane/slurm_control_node/auth_server in the existing inventory. Simply add the new node information in the kube_node/slurm_node group. + * Do not change the ``kube_control_plane``, ``slurm_control_node`` and/or ``auth_server`` in the existing inventory file. Simply add the new node information in the ``kube_node`` and/or ``slurm_node`` group. * When re-running ``omnia.yml`` to add a new node, ensure that the ``input/security_config.yml`` and ``input/omnia_config.yml`` are not edited between runs. -3. To install `security `_, `job scheduler `_ and storage tools (`NFS `_, `BeeGFS `_) on the node, run ``omnia.yml``: :: +* To install security, job scheduler, and storage tools (NFS, BeeGFS) on the node, run ``omnia.yml``: :: ansible-playbook omnia.yml -i inventory diff --git a/docs/source/OmniaInstallGuide/Maintenance/reset.rst b/docs/source/OmniaInstallGuide/Maintenance/reset.rst new file mode 100644 index 000000000..e33d9b456 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Maintenance/reset.rst @@ -0,0 +1,81 @@ +Soft reset the cluster +======================= + +Use this playbook to stop and remove all Slurm and Kubernetes services from the cluster nodes. + +.. 
warning:: This action will destroy the existing Slurm/Kubernetes cluster. + +.. note:: + * All target nodes should be drained before executing the playbook. If a job is running on any target nodes, the playbook may time out waiting for the node state to change. + * When running ``reset_cluster_configuration.yml``, ensure that the ``input/storage_config.yml`` and ``input/omnia_config.yml`` have not been edited since ``omnia.yml`` was run. + +**Configurations performed by the playbook** + + * The Slurm and Kubernetes configuration will be reset on the ``kube_control_plane`` or the ``slurm_control_node``, as defined in the inventory file. + * All services pertaining to Slurm and Kubernetes are stopped and removed. + +**To run the playbook** + +Run the playbook using the following commands: :: + + cd utils + ansible-playbook reset_cluster_configuration.yml -i inventory + +* To specify only Slurm or Kubernetes clusters while running the playbook, use the tags ``slurm_cluster`` or ``k8s_cluster``. That is: + + * To reset a Slurm cluster, use ``ansible-playbook reset_cluster_configuration.yml -i inventory --tags slurm_cluster``. + * To reset a Kubernetes cluster, use ``ansible-playbook reset_cluster_configuration.yml -i inventory --tags k8s_cluster``. + +.. caution:: If you do not specify the tags ``slurm_cluster`` or ``k8s_cluster``, the ``reset_cluster_configuration.yml`` will reset the configuration for both Slurm and Kubernetes clusters. + +* To skip confirmation while running the playbook, use ``ansible-playbook reset_cluster_configuration.yml -i inventory --extra-vars skip_confirmation=yes`` or ``ansible-playbook reset_cluster_configuration.yml -i inventory -e skip_confirmation=yes``. + +The inventory file passed for ``reset_cluster_configuration.yml`` should follow the below format. :: + + #Batch Scheduler: Slurm + + [slurm_control_node] + + 10.5.1.101 + + [slurm_node] + + 10.5.1.103 + + 10.5.1.104 + + [login] + + 10.5.1.105 + + + #General Cluster Storage + + [auth_server] + + 10.5.1.106 + + + #AI Scheduler: Kubernetes + + [kube_control_plane] + + 10.5.1.101 + + [etcd] + + 10.5.1.101 + + [kube_node] + + 10.5.1.102 + + 10.5.1.103 + + 10.5.1.104 + + 10.5.1.105 + + 10.5.1.106 + +.. note:: After running the ``reset_cluster_configuration.yml`` playbook on a Kubernetes cluster, it is observed that some Kubernetes logs and configuration files are still present on the ``kube_control_plane``. For more information about this, `click here <../../Troubleshooting/KnownIssues/Common/Kubernetes.html>`_. \ No newline at end of file diff --git a/docs/source/InstallationGuides/Benchmarks/AutomatingOneAPI.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/AutomatingOneAPI.rst similarity index 92% rename from docs/source/InstallationGuides/Benchmarks/AutomatingOneAPI.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/AutomatingOneAPI.rst index 07e3f0662..ad566dd52 100644 --- a/docs/source/InstallationGuides/Benchmarks/AutomatingOneAPI.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/AutomatingOneAPI.rst @@ -7,10 +7,10 @@ This topic explains how to automatically update servers for MPI jobs. **Pre-requisites** -* A local repository has been set up by listing ``{"name": "intel_benchmarks", "version": "2024.1.0"},`` in ``input/software_config.json`` and running ``local_repo.yml``. For more information, `click here. 
<../LocalRepo/index.html>`_ +* A local repository has been set up by listing ``{"name": "intel_benchmarks", "version": "2024.1.0"},`` in ``input/software_config.json`` and running ``local_repo.yml``. For more information, `click here. <../CreateLocalRepo/index.html>`_ * ``discovery_provision.yml`` playbook has been executed. -* Verify that the target nodes are in the ``booted`` state. For more information, `click here <../InstallingProvisionTool/ViewingDB.html>`_. -* The cluster has been set up with Slurm. For more information, `click here <../BuildingClusters/install_slurm.html>`_. +* Verify that the target nodes are in the ``booted`` state. For more information, `click here <../Provision/ViewingDB.html>`_. +* The cluster has been set up with Slurm. For more information, `click here <../OmniaCluster/BuildingCluster/install_slurm.html>`_. * An Omnia **slurm** cluster has been set up by ``omnia.yml`` with at least 2 nodes: 1 ``slurm_control_node`` and 1 ``slurm_node``. **Sample inventory** diff --git a/docs/source/InstallationGuides/Benchmarks/AutomatingOpenMPI.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/AutomatingOpenMPI.rst similarity index 94% rename from docs/source/InstallationGuides/Benchmarks/AutomatingOpenMPI.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/AutomatingOpenMPI.rst index ad4df3cc7..9ae77d8a2 100644 --- a/docs/source/InstallationGuides/Benchmarks/AutomatingOpenMPI.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/AutomatingOpenMPI.rst @@ -7,11 +7,11 @@ This topic explains how to automatically update AMD servers for MPI jobs. **Pre-requisites** -* A local repository has been set up by listing ``{"name": "amd_benchmarks"},`` in ``input/software_config.json`` and running ``local_repo.yml``. For more information, `click here. <../LocalRepo/index.html>`_ +* A local repository has been set up by listing ``{"name": "amd_benchmarks"},`` in ``input/software_config.json`` and running ``local_repo.yml``. For more information, `click here. <../CreateLocalRepo/index.html>`_ * ``discovery_provision.yml`` playbook has been executed. -* Verify that the target nodes are in the ``booted`` state. For more information, `click here <../InstallingProvisionTool/ViewingDB.html>`_. +* Verify that the target nodes are in the ``booted`` state. For more information, `click here <../Provision/ViewingDB.html>`_. * An Omnia **slurm** cluster has been set up by ``omnia.yml`` with at least 2 nodes: 1 ``slurm_control_node`` and 1 ``slurm_node``. -* A local OpenMPI repository has been created. For more information, `click here <../LocalRepo/localrepos.html>`_. +* A local OpenMPI repository has been created. For more information, `click here <../CreateLocalRepo/localrepos.html>`_. 
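Before proceeding, the state of the Slurm cluster can be verified from the ``slurm_control_node``; this is a generic check, not an Omnia-specific command: ::

    # List every node and its Slurm state; at least one control node and one compute node are expected
    sinfo -N -l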
**Sample inventory** :: diff --git a/docs/source/InstallationGuides/ConfiguringStorage/index.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringStorage/index.rst similarity index 98% rename from docs/source/InstallationGuides/ConfiguringStorage/index.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringStorage/index.rst index e65b55b91..d49346ec9 100644 --- a/docs/source/InstallationGuides/ConfiguringStorage/index.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringStorage/index.rst @@ -1,9 +1,10 @@ -Configuring PowerVault +Configuring Storage ======================= -**Configuring Powervault storage** +Configuring PowerVault storage +-------------------------------- -To configure powervault ME4 and ME5 storage arrays, follow the below steps: +To configure PowerVault ME4 and ME5 storage arrays, follow the below steps: Fill out all required parameters in ``storage/powervault_input.yml``: @@ -105,7 +106,8 @@ Run the playbook: :: * ``powervault_username`` and ``powervault_password`` are the credentials used to administrate the array. -**Configuring NFS servers** +Configuring NFS server connected to PowerVault +------------------------------------------------------ To configure an NFS server, enter the following parameters in ``storage/nfs_server_input.yml`` @@ -126,7 +128,7 @@ To configure an NFS server, enter the following parameters in ``storage/nfs_serv | | | | | Must specify atleast 1 volume | | | | -| | **Default values**: `` - { name: omnia_home, server_share_path: /home/omnia_home, server_export_options: }`` | +| | **Default values**: ``- { name: omnia_home, server_share_path: /home/omnia_home, server_export_options: }`` | +--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ Run the playbook: :: @@ -134,8 +136,8 @@ Run the playbook: :: cd storage ansible-playbook nfs_sas.yml -i /root/inventory -e powervault_username="xxxxx" -e powervault_password="xxxxxx" -* Where the ``inventory`` refers to a list of all nodes in the format of `NFS server inventory file <../../samplefiles.html#nfs-server-inventory-file>`_ -* To set up NFS client services, `click here <../BuildingClusters/NFS.html>`_ +* Where the ``inventory`` refers to a list of all nodes separated by a newline. +* To set up NFS client services, `click here <../../OmniaCluster/BuildingCluster/Storage/NFS.html>`_ diff --git a/docs/source/InstallationGuides/ConfiguringSwitches/ethernet-Z.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-Z.rst similarity index 99% rename from docs/source/InstallationGuides/ConfiguringSwitches/ethernet-Z.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-Z.rst index 744e0df5f..fdaf8db2a 100644 --- a/docs/source/InstallationGuides/ConfiguringSwitches/ethernet-Z.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-Z.rst @@ -77,7 +77,7 @@ Configuring ethernet switches (Z series) * Where ``ethernet_switch_username`` is the username used to authenticate into the switch. 
-* The inventory file should be a list of IPs separated by newlines. Check out the switch_inventory section in `Sample Files `_ +* The inventory file should be a list of IPs separated by newlines. Check out the switch_inventory section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ * Where ``ethernet_switch_password`` is the password used to authenticate into the switch. diff --git a/docs/source/InstallationGuides/ConfiguringSwitches/ethernet-s3_s4.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-s3_s4.rst similarity index 98% rename from docs/source/InstallationGuides/ConfiguringSwitches/ethernet-s3_s4.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-s3_s4.rst index 7aeaee954..71fe42848 100644 --- a/docs/source/InstallationGuides/ConfiguringSwitches/ethernet-s3_s4.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-s3_s4.rst @@ -71,7 +71,7 @@ Configuring ethernet switches (S3 and S4 series) * Where ``ethernet_switch_username`` is the username used to authenticate into the switch. -* The inventory file should be a list of IPs separated by newlines. Check out the switch_inventory section in `Sample Files `_ +* The inventory file should be a list of IPs separated by newlines. Check out the switch_inventory section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ * Where ``ethernet_switch_password`` is the password used to authenticate into the switch. diff --git a/docs/source/InstallationGuides/ConfiguringSwitches/ethernet-s5.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-s5.rst similarity index 98% rename from docs/source/InstallationGuides/ConfiguringSwitches/ethernet-s5.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-s5.rst index b8c782ca4..3539ea828 100644 --- a/docs/source/InstallationGuides/ConfiguringSwitches/ethernet-s5.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/ethernet-s5.rst @@ -73,7 +73,7 @@ Configuring ethernet switches (S5 series) * Where ``ethernet_switch_username`` is the username used to authenticate into the switch. -* The inventory file should be a list of IPs separated by newlines. Check out the switch_inventory section in `Sample Files `_ +* The inventory file should be a list of IPs separated by newlines. Check out the switch_inventory section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ * Where ``ethernet_switch_password`` is the password used to authenticate into the switch. 
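+
+For illustration, a hypothetical switch inventory file (the IPs below are placeholders for your own switch addresses) could look like this: ::
+
+    10.5.1.191
+    10.5.1.192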
diff --git a/docs/source/InstallationGuides/ConfiguringSwitches/index.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/index.rst similarity index 100% rename from docs/source/InstallationGuides/ConfiguringSwitches/index.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/index.rst diff --git a/docs/source/InstallationGuides/ConfiguringSwitches/infiniband.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/infiniband.rst similarity index 99% rename from docs/source/InstallationGuides/ConfiguringSwitches/infiniband.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/infiniband.rst index a2907c915..b8a01d792 100644 --- a/docs/source/InstallationGuides/ConfiguringSwitches/infiniband.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ConfiguringSwitches/infiniband.rst @@ -120,7 +120,7 @@ If ``enable_split_port`` is **false**, run:: * Passwords should contain at least one of each: Lowercase, uppercase and digits. - * The inventory file should be a list of IPs separated by newlines. Check out the ``switch_inventory`` section in `Sample Files `_ + * The inventory file should be a list of IPs separated by newlines. Check out the ``switch_inventory`` section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ * Where ``ib_default_password`` is the password used to authenticate into factory reset/fresh-install switches. diff --git a/docs/source/InstallationGuides/LocalRepo/CustomLocalRepo.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/CustomLocalRepo.rst similarity index 68% rename from docs/source/InstallationGuides/LocalRepo/CustomLocalRepo.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/CustomLocalRepo.rst index 115e46976..a16bfb68f 100644 --- a/docs/source/InstallationGuides/LocalRepo/CustomLocalRepo.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/CustomLocalRepo.rst @@ -1,53 +1,59 @@ Configuring custom repositories ------------------------------- -Use the local repository feature to create a customized set of local repositories on the control plane for the cluster nodes to access. +Use the local repository feature to create a customized set of local repositories on the OIM for the cluster nodes to access. 1. Ensure the ``custom`` entry is included in the ``software_config.json`` file. 
:: { - "cluster_os_type": "ubuntu", - "cluster_os_version": "22.04", + "cluster_os_type": "rhel", + "cluster_os_version": "8.8", "repo_config": "partial", "softwares": [ - {"name": "k8s", "version":"1.26.12"}, - {"name": "jupyter", "version": "3.2.0"}, - {"name": "kubeflow", "version": "1.8"}, + {"name": "amdgpu", "version": "6.2.2"}, + {"name": "cuda", "version": "12.3.2"}, + {"name": "ofed", "version": "24.01-0.3.3.1"}, + {"name": "freeipa"}, {"name": "openldap"}, - {"name": "beegfs", "version": "7.2.6"}, + {"name": "secure_login_node"}, {"name": "nfs"}, + {"name": "beegfs", "version": "7.4.2"}, + {"name": "slurm"}, + {"name": "k8s", "version":"1.29.5"}, + {"name": "jupyter"}, + {"name": "kubeflow"}, {"name": "kserve"}, - {"name": "custom"}, - {"name": "amdgpu", "version": "6.0"}, - {"name": "cuda", "version": "12.3.2"}, - {"name": "ofed", "version": "24.01-0.3.3.1"}, + {"name": "pytorch"}, + {"name": "tensorflow"}, + {"name": "vllm"}, {"name": "telemetry"}, + {"name": "intel_benchmarks", "version": "2024.1.0"}, + {"name": "amd_benchmarks"}, {"name": "utils"}, - {"name": "vllm"}, - {"name": "pytorch"}, - {"name": "tensorflow"} + {"name": "ucx", "version": "1.15.0"}, + {"name": "openmpi", "version": "4.1.6"}, + {"name": "csi_driver_powerscale", "version":"v2.11.0"} ], - "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } ], - "vllm": [ - {"name": "vllm_amd", "version":"vllm-v0.2.4"}, - {"name": "vllm_nvidia", "version": "latest"} - ], - "pytorch": [ - {"name": "pytorch_cpu", "version":"latest"}, - {"name": "pytorch_amd", "version":"latest"}, - {"name": "pytorch_nvidia", "version": "23.12-py3"} - ], - "tensorflow": [ - {"name": "tensorflow_cpu", "version":"latest"}, - {"name": "tensorflow_amd", "version":"latest"}, - {"name": "tensorflow_nvidia", "version": "23.12-tf2-py3"} - ] - + "vllm": [ + {"name": "vllm_amd"}, + {"name": "vllm_nvidia"} + ], + "pytorch": [ + {"name": "pytorch_cpu"}, + {"name": "pytorch_amd"}, + {"name": "pytorch_nvidia"} + ], + "tensorflow": [ + {"name": "tensorflow_cpu"}, + {"name": "tensorflow_amd"}, + {"name": "tensorflow_nvidia"} + ] } + 2. Create a ``custom.json`` file in the following directory: ``input/config//`` to define the repositories. For example, For a cluster running RHEL 8.8, go to ``input/config/rhel/8.8/`` and create the file there. The file is a JSON list consisting of the package name, repository type, URL (optional), and version (optional). Below is a sample version of the file: :: { @@ -110,7 +116,7 @@ Use the local repository feature to create a customized set of local repositorie } } -2. Enter the parameters required in ``input/local_repo_config.yml`` as explained `here `_. +2. Enter the parameters required in ``input/local_repo_config.yml`` as explained `here <../CreateLocalRepo/InputParameters.html#id2>`_. 3. 
Run the following commands: ::
 
diff --git a/docs/source/InstallationGuides/BuildingClusters/KubernetesAccess.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/KubernetesAccess.rst
similarity index 99%
rename from docs/source/InstallationGuides/BuildingClusters/KubernetesAccess.rst
rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/KubernetesAccess.rst
index 4b746545a..c2a3a8f86 100644
--- a/docs/source/InstallationGuides/BuildingClusters/KubernetesAccess.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/KubernetesAccess.rst
@@ -3,7 +3,7 @@ Granting Kubernetes access
 
 Omnia grants Kubernetes node access to users defined on the ``kube_control_plane`` using the ``k8s_access.yml`` playbook.
 
-**Prerequisites**
+**Prerequisite**
 
 * Ensure that the Kubernetes cluster is up and running.
 
diff --git a/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/PowerScale_CSI.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/PowerScale_CSI.rst
new file mode 100644
index 000000000..806f41763
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/PowerScale_CSI.rst
@@ -0,0 +1,293 @@
+Deploy CSI drivers for Dell PowerScale storage solutions
+===========================================================
+
+Dell PowerScale is a flexible and secure scale-out NAS (network attached storage) solution designed to simplify storage requirements for AI and HPC workloads. To enable the PowerScale storage solution on Kubernetes clusters, Omnia installs the Dell CSI PowerScale driver (version 2.11.0) on the nodes using helm charts. Once the PowerScale CSI driver is installed, the PowerScale nodes can be connected to the Kubernetes clusters for storage requirements.
+To know more about the CSI PowerScale driver, `click here `_.
+
+.. caution:: PowerScale CSI driver installation is only supported on RHEL 8.8, Rocky Linux 8.8, and Ubuntu 22.04 clusters.
+
+.. note:: Omnia doesn't configure any PowerScale device via OneFS (the operating system for PowerScale). Omnia configures the deployed Kubernetes cluster to interact with the PowerScale storage.
+
+PowerScale SmartConnect [Optional]
+-------------------------------------
+
+* To utilize the PowerScale SmartConnect hostname, the user must have an upstream DNS server that includes delegation mappings of hostnames to PowerScale IP addresses. During the provisioning of cluster nodes, users can specify the IP of the upstream ``DNS`` server in the ``input/network_spec.yml`` file. This ensures that the Omnia cluster recognizes and is aware of the upstream DNS server, enabling the use of the PowerScale SmartConnect hostname functionality. For example: ::
+
+    ---
+    Networks:
+    - admin_network:
+        nic_name:
+        netmask_bits: "16"
+        static_range:
+        dynamic_range:
+        correlation_to_admin: true
+        admin_uncorrelated_node_start_ip: ""
+        network_gateway: ""
+        DNS:
+        MTU: "1500"
+
+* If the user did not specify the upstream DNS server during the provisioning process and wishes to utilize PowerScale SmartConnect afterwards, then the user must first add the upstream DNS server IP to the ``DNS`` entry in ``input/network_spec.yml`` and then re-run the ``discovery_provision.yml`` playbook.
+
+Prerequisites
+--------------
+
+1. Download the ``secret.yaml`` file template from this `link `_.
+
+2. Update the following parameters in the ``secret.yaml`` file as per your cluster details and keep the rest as default values. 
For example:
+
+   * clusterName:
+   * username:
+   * password:
+   * endpoint:
+   .. note:: If the PowerScale SmartConnect hostname is configured, users can provide the PowerScale hostname for ``endpoint``. Otherwise, the PowerScale IP address can be provided as well.
+   * endpointPort:
+   * isDefault: true
+   * isiPath: "/ifs/data/csi"
+
+   *Reference values from OneFS portal:*
+
+   .. image:: ../../../images/CSI_1.png
+
+3. Download the ``values.yaml`` file template using the following command: ::
+
+    wget https://raw.githubusercontent.com/dell/helm-charts/csi-isilon-2.11.0/charts/csi-isilon/values.yaml
+
+4. Update the following parameters in the ``values.yaml`` file and keep the rest as default values. Refer to the below sample values:
+
+   * controllerCount: 1
+
+   * replication:
+
+       enabled: false
+
+   * snapshot:
+
+       enabled: true
+
+   * resizer:
+
+       enabled: false
+
+   * healthMonitor:
+
+       enabled: false
+
+   * endpointPort: 8080
+
+   * skipCertificateValidation: true
+
+   * isiAccessZone: System
+
+   * isiPath: /ifs/data/csi
+
+
+.. note:: To integrate the PowerScale solution into the deployed Kubernetes cluster, Omnia 1.7 requires the following fixed parameter values in the ``values.yaml`` file:
+
+    * controllerCount: 1
+    * Replication: false
+    * Snapshot: true
+    * skipCertificateValidation: true
+
+.. note:: Once the PowerScale CSI driver has been deployed, the parameters in the ``values.yaml`` file can't be changed. If the user wants to modify the ``values.yaml`` file, they must first uninstall the PowerScale CSI driver from the cluster and then re-install it with the updated parameters.
+
+Installation Process
+---------------------
+
+1. Once ``secret.yaml`` and ``values.yaml`` are filled in with the necessary details, copy both files to any directory on the OIM. For example, ``/tmp/secret.yaml`` and ``/tmp/values.yaml``.
+
+2. Add the ``csi_driver_powerscale`` entry along with the driver version to the ``omnia/input/software_config.json`` file: ::
+
+    {"name": "csi_driver_powerscale", "version":"v2.11.0"}
+
+   .. note:: By default, the ``csi_driver_powerscale`` entry is not present in the ``input/software_config.json``.
+
+3. Execute the ``local_repo.yml`` playbook to download the required artifacts to the OIM: ::
+
+    cd local_repo
+    ansible-playbook local_repo.yml
+
+4. Add the filepaths of the ``secret.yaml`` and ``values.yaml`` files to the ``csi_powerscale_driver_secret_file_path`` and ``csi_powerscale_driver_values_file_path`` variables respectively, present in the ``omnia/input/omnia_config.yml`` file.
+
+5. Execute the ``omnia.yml`` playbook to install the PowerScale CSI driver: ::
+
+    cd omnia
+    ansible-playbook omnia.yml -i 
+
+.. note::
+    * There isn't a separate playbook to run for the PowerScale CSI driver installation. Running ``omnia.yml`` with the necessary inputs installs the driver. If Kubernetes is already deployed on the cluster, users can also run the ``scheduler.yml`` playbook to install the PowerScale CSI driver.
+    * After running the ``omnia.yml`` playbook, the ``secret.yaml`` file will be encrypted. Users can use the below command to decrypt and edit it if required: ::
+
+        cd omnia
+        ansible-vault edit --vault-password-file scheduler/roles/k8s_csi_powerscale_plugin/files/.csi_powerscale_secret_vault
+
+.. caution:: Do not delete the vault key file ``.csi_powerscale_secret_vault``, otherwise users will not be able to decrypt the ``secret.yaml`` file anymore.
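+
+If you only want to inspect the encrypted ``secret.yaml`` without editing it, ``ansible-vault view`` works with the same vault key file. This is a minimal sketch; it assumes the secret was copied to ``/tmp/secret.yaml`` as in the example above: ::
+
+    cd omnia
+    ansible-vault view --vault-password-file scheduler/roles/k8s_csi_powerscale_plugin/files/.csi_powerscale_secret_vault /tmp/secret.yaml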
+
+Expected Results
+------------------
+
+* After the successful execution of the ``omnia.yml`` playbook, the PowerScale CSI driver is deployed in the ``isilon`` namespace.
+* Along with the PowerScale driver installation, a storage class named **ps01** is also created. The details of the storage class are as follows: ::
+
+    apiVersion: storage.k8s.io/v1
+    kind: StorageClass
+    metadata:
+      name: ps01
+    provisioner: csi-isilon.dellemc.com
+    reclaimPolicy: Delete
+    allowVolumeExpansion: true
+    volumeBindingMode: Immediate
+    parameters:
+      AccessZone: < access zone mentioned in values.yaml file >
+      Isipath: < isipath mentioned in values.yaml file >
+      RootClientEnabled: "true"
+      csi.storage.k8s.io/fstype: "nfs"
+
+* If there are errors during the CSI driver installation, the whole ``omnia.yml`` playbook execution does not stop or fail. It pauses for 30 seconds with a CSI driver installation failure error message and then proceeds with the rest of the playbook execution.
+* For an unsuccessful driver installation scenario, the user first needs to follow the manual removal steps mentioned below from the ``kube_control_plane``, and then re-run the ``omnia.yml`` playbook for the CSI driver installation.
+
+Post installation
+-------------------
+
+**[Optional] Create custom storage class**
+
+If users want to create a custom storage class, they can do so by following the sample storage class `template `_.
+
+*Sample storage class template*: ::
+
+    apiVersion: storage.k8s.io/v1
+    kind: StorageClass
+    metadata:
+      name:
+    provisioner: csi-isilon.dellemc.com
+    reclaimPolicy: Delete
+    allowVolumeExpansion: true
+    volumeBindingMode: Immediate
+    parameters:
+      clusterName:  #optional
+      AccessZone: System
+      AzServiceIP:  #optional
+      Isipath:  #sample: /ifs/data/csi/
+      RootClientEnabled: "true"
+      csi.storage.k8s.io/fstype: "nfs"
+
+.. note::
+
+    * If the PowerScale SmartConnect hostname is configured and the delegated host list is set up in the external DNS server, then the user can provide the PowerScale hostname for ``AzServiceIP``. Otherwise, the PowerScale IP address can be provided as well.
+    * If there are any changes to the storage class parameters in a PowerScale cluster, the user must update the existing storage class or create a new one as needed.
+
+**Apply storage class**
+
+Use the following command to apply the storage class: ::
+
+    kubectl apply -f 
+
+**Create Persistent Volume Claim (PVC)**
+
+Once the storage class is created, it can be used to create a PVC.
+
+*Sample deployment with PVC*: ::
+
+    apiVersion: v1
+    kind: PersistentVolumeClaim
+    metadata:
+      name: pvc-powerscale
+    spec:
+      accessModes:
+      - ReadWriteMany
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: ps01
+    ---
+    apiVersion: apps/v1
+    kind: Deployment
+    metadata:
+      name: deploy-busybox-01
+    spec:
+      strategy:
+        type: Recreate
+      replicas: 1
+      selector:
+        matchLabels:
+          app: deploy-busybox-01
+      template:
+        metadata:
+          labels:
+            app: deploy-busybox-01
+        spec:
+          containers:
+          - name: busybox
+            image: registry.k8s.io/busybox
+            command: ["sh", "-c"]
+            args: ["while true; do touch /data/datafile; rm -f /data/datafile; done"]
+            volumeMounts:
+            - name: data
+              mountPath: /data
+            env:
+            - name: http_proxy
+              value: "http://:3128"
+            - name: https_proxy
+              value: "http://:3128"
+          volumes:
+          - name: data
+            persistentVolumeClaim:
+              claimName: pvc-powerscale
+
+**Apply the deployment manifest along with PVC**
+
+Use the following command to apply the manifest: ::
+
+    kubectl apply -f 
+
+*Expected Result*:
+
+* Once the above manifest is applied, a PVC named ``pvc-powerscale`` is created and is in the ``Bound`` status. Use the ``kubectl get pvc -A`` command to bring up the PVC information. For example: ::
+
+    root@node001:/opt/omnia/csi-driver-powerscale/csi-powerscale/dell-csi-helm-installer# kubectl get pvc -A
+    NAMESPACE   NAME             STATUS   VOLUME           CAPACITY   ACCESS MODES   STORAGECLASS   VOLUMEATTRIBUTESCLASS   AGE
+    default     pvc-powerscale   Bound    k8s-b00f77b817   1Gi        RWX            ps01                                   27h
+
+* Users can also verify the same information from the OneFS portal. In the sample image below, it is mapped with the ``VOLUME`` entry from the above example: ``k8s-b00f77b817``:
+
+.. image:: ../../../images/CSI_OneFS.png
+
+Removal
+--------
+
+To remove the PowerScale driver manually, do the following:
+
+1. Log in to the ``kube_control_plane``.
+
+2. Execute the following command to switch to the ``dell-csi-helm-installer`` directory: ::
+
+    cd /opt/omnia/csi-driver-powerscale/csi-powerscale/dell-csi-helm-installer
+
+3. Once you're inside the ``dell-csi-helm-installer`` directory, use the following command to trigger the ``csi-uninstall`` script: ::
+
+    ./csi-uninstall.sh --namespace isilon
+
+4. After running the previous command, the PowerScale driver is removed. However, the secret and the created PVC are not removed. If users want to remove them, they need to do it manually from the "isilon" namespace.
+
+5. If users don't want to use PowerScale anymore, they can remove the following as well:
+
+    a. Remove the PowerScale secret by executing the following commands one after the other:
+
+        i. ``kubectl delete secret isilon-creds -n isilon``
+
+        ii. ``kubectl delete secret isilon-certs-0 -n isilon``
+
+    b. Remove any custom user deployment and PVC that was using the PowerScale storage class.
+
+    c. Remove the PowerScale storage class.
+
+.. note:: In case the OneFS portal credentials change, users need to perform the following steps to update the ``secret.yaml`` file manually:
+
+    1. Update the ``secret.yaml`` file with the changed credentials.
+    2. Log in and copy the ``secret.yaml`` file to the ``kube_control_plane``.
+    3. Delete the existing secret by executing the following command: ::
+
+        kubectl delete secret isilon-creds -n isilon
+
+    4. 
Create the new secret from the updated ``secret.yaml`` file by executing the following command: ::
+
+        kubectl create secret generic isilon-creds -n isilon --from-file=config=
\ No newline at end of file
diff --git a/docs/source/Roles/Accelerator/index.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ROCm_accelerator.rst
similarity index 73%
rename from docs/source/Roles/Accelerator/index.rst
rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ROCm_accelerator.rst
index d7658f68a..fcc1c475a 100644
--- a/docs/source/Roles/Accelerator/index.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/ROCm_accelerator.rst
@@ -1,28 +1,32 @@
 Alternate method to install the AMD ROCm platform
---------------------------------------------------
+=====================================================
 
 The accelerator role allows users to set up the `AMD ROCm `_ platform. These tools allow users to unlock the potential of installed AMD GPUs.
 
-Ensure that the ROCm local repositories are configured using the `local_repo.yml <../../InstallationGuides/LocalRepo/index.html>`_ script.
+**Prerequisites**
 
-Ensure that the ``input/software_config.json`` contains valid amdgpu and rocm version. See `input parameters <../../InstallationGuides/LocalRepo/InputParameters.html>`_ for more information.
+* Ensure that the ROCm local repositories are configured using the `local_repo.yml <../CreateLocalRepo/index.html>`_ script.
+* Ensure that the ``input/software_config.json`` contains valid amdgpu and rocm versions. See `input parameters <../CreateLocalRepo/InputParameters.html>`_ for more information.
 
 .. note::
     * Nodes provisioned using the Omnia provision tool do not require a RedHat subscription to run ``accelerator.yml`` on RHEL target nodes.
     * For RHEL target nodes not provisioned by Omnia, ensure that RedHat subscription is enabled on all target nodes. Every target node will require a RedHat subscription.
-    * AMD ROCm driver installation is not supported by Omnia on Rocky Linux cluster nodes.
-
-To install all the latest GPU drivers and toolkits, run: ::
-
-    cd accelerator
-    ansible-playbook accelerator.yml -i inventory
+    * AMD ROCm driver installation is not supported by Omnia on Rocky Linux cluster nodes.
 
+**Playbook configurations**
 
-The following configurations take place when running ``accelerator.yml``
+The following configurations take place while running the ``accelerator.yml`` playbook:
 
 i. Servers with AMD GPUs are identified and the latest GPU drivers and ROCm platforms are downloaded and installed.
 
 ii. Servers with no GPU are skipped.
 
+**Executing the playbook**
+
+To install all the latest GPU drivers and toolkits, run: ::
+
+    cd accelerator
+    ansible-playbook accelerator.yml -i inventory
+
 User permissions for ROCm platforms
 ------------------------------------
 
@@ -33,12 +37,12 @@ User permissions for ROCm platforms
 .. note::
     * is the system name of the end user.
     * This command must be run with ``root`` permissions.
-    * If the root user wants to provide access to other users and their individual GPU nodes, the previous command needs to be run on all of them separately. ::
+    * If the root user wants to provide access to other users and their individual GPU nodes, the previous command needs to be run on all of them separately.
 
     * To enable users to use rocm tools, use the following command as shown in the sample image below: ::
 
        /opt/rocm/bin/
 
-.. image:: ../../images/ROCm_user_permissions.png
+.. 
image:: ../../../images/ROCm_user_permissions.png For any configuration changes, check out ROCm's official documentation `here. `_ \ No newline at end of file diff --git a/docs/source/InstallationGuides/Benchmarks/hpcsoftwarestack.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/hpcsoftwarestack.rst similarity index 100% rename from docs/source/InstallationGuides/Benchmarks/hpcsoftwarestack.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/hpcsoftwarestack.rst diff --git a/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/index.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/index.rst new file mode 100644 index 000000000..420bb9cde --- /dev/null +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/index.rst @@ -0,0 +1,16 @@ +Advanced configurations for RHEL/Rocky Linux clusters +======================================================== + +.. toctree:: + :maxdepth: 2 + + CustomLocalRepo + install_ucx_openmpi + KubernetesAccess + ConfiguringStorage/index + ConfiguringSwitches/index + AdditionalNIC + ROCm_accelerator + PowerScale_CSI + AutomatingOneAPI + AutomatingOpenMPI diff --git a/docs/source/InstallationGuides/BuildingClusters/install_ucx_openmpi.rst b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/install_ucx_openmpi.rst similarity index 89% rename from docs/source/InstallationGuides/BuildingClusters/install_ucx_openmpi.rst rename to docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/install_ucx_openmpi.rst index fb95a1c23..c3e7f5737 100644 --- a/docs/source/InstallationGuides/BuildingClusters/install_ucx_openmpi.rst +++ b/docs/source/OmniaInstallGuide/RHEL/AdvancedConfigurationsRHEL/install_ucx_openmpi.rst @@ -13,7 +13,7 @@ Configuring UCX and OpenMPI on the cluster * Ensure to run ``local_repo.yml`` with the ``ucx`` and ``openmpi`` entry present in ``software_config.json``, to download all required UCX and OpenMPI packages. -* To install any benchmarking software like UCX or OpenMPI, at least ``slurm_share`` or ``k8s_share`` is set to ``true`` in `storage_config.yml `_, for one of the entries in ``nfs_client_params``. If both are set to true, a higher precedence is given to ``slurm_share``. +* To install any benchmarking software like UCX or OpenMPI, at least ``slurm_share`` or ``k8s_share`` is set to ``true`` in `storage_config.yml <../OmniaCluster/schedulerinputparams.html#storage-config-yml>`_, for one of the entries in ``nfs_client_params``. If both are set to true, a higher precedence is given to ``slurm_share``. **Inventory details** @@ -42,4 +42,4 @@ Run either of the following commands: * All corresponding compiled UCX and OpenMPI files will be saved to the ``/compile`` directory on the nfs share. * All corresponding UCX and OpenMPI executables will be saved to the ``/benchmarks/`` directory on the nfs share. * The default OpenMPI version for Omnia is 4.1.6. If you change the version in the ``software.json`` file, make sure to update it in the ``openmpi.json`` file in the ``input/config`` directory as well. - * To add new nodes in an existing cluster, click `here <../addinganewnode.html>`_. \ No newline at end of file + * To add new nodes in an existing cluster, click `here <../../Maintenance/addnode.html>`_. 
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/RHEL/Benchmarks/hpcsoftwarestack.rst b/docs/source/OmniaInstallGuide/RHEL/Benchmarks/hpcsoftwarestack.rst
new file mode 100644
index 000000000..25fdbf4f5
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Benchmarks/hpcsoftwarestack.rst
@@ -0,0 +1,51 @@
+Containerized HPC benchmark execution
+--------------------------------------
+
+Use this playbook to download docker images and pull them onto cluster nodes using `apptainer `_.
+
+1. Ensure that the cluster has been `provisioned by the provision tool. <../../InstallationGuides/InstallingProvisionTool/index.html>`_ and the `cluster has been set up using omnia.yml. <../../InstallationGuides/BuildingClusters/index.html>`_
+
+2. Enter the following variables in ``utils/hpc_apptainer_job_execution/hpc_apptainer_job_execution_config.yml``:
+
++-------------------------+---------------------------------------------------------------------------------------------------------------+
+| Parameter               | Details                                                                                                       |
++=========================+===============================================================================================================+
+| **hpc_apptainer_image** | * Docker image details to be downloaded to cluster nodes using apptainer to create a sif file.               |
+|                         |                                                                                                               |
+| ``JSON list``           | * Example (for single image): ::                                                                              |
+|                         |                                                                                                               |
+| Required                |       hpc_apptainer_image:                                                                                    |
+|                         |                                                                                                               |
+|                         |         - { image_url: "docker.io/intel/oneapi-hpckit:latest" }                                               |
+|                         |                                                                                                               |
+|                         | * Example (for multiple images): ::                                                                           |
+|                         |                                                                                                               |
+|                         |       hpc_apptainer_image:                                                                                    |
+|                         |                                                                                                               |
+|                         |         - { image_url: "docker.io/intel/oneapi-hpckit:latest" }                                               |
+|                         |                                                                                                               |
+|                         |         - { image_url: "docker.io/tensorflow/tensorflow:latest" }                                             |
+|                         |                                                                                                               |
+|                         | * If docker credentials are provided in ``omnia_config.yml``, they will be used for downloading docker images.|
+|                         |                                                                                                               |
++-------------------------+---------------------------------------------------------------------------------------------------------------+
+| **hpc_apptainer_path**  | * Directory path for storing apptainer sif files on cluster nodes.                                           |
+|                         |                                                                                                               |
+| ``string``              | * It is recommended to use a directory inside a shared path that is accessible to all cluster nodes.         |
+|                         |                                                                                                               |
+| Required                | * **Default value:** ``"/home/omnia-share/softwares/apptainer"``                                              |
++-------------------------+---------------------------------------------------------------------------------------------------------------+
+
+To run the playbook: ::
+
+    cd utils/hpc_apptainer_job_execution
+
+    ansible-playbook hpc_apptainer_job_execution.yml -i inventory
+
+.. note:: Use the inventory file format specified under `Sample Files. 
<../../samplefiles.html>`_ + +HPC apptainer jobs can be initiated on a slurm cluster using the following sample command: :: + + srun -N 3 --mpi=pmi2 --ntasks=4 apptainer run /home/omnia-share/softwares/apptainer/oneapi-hpckit_latest.sif hostname + diff --git a/docs/source/InstallationGuides/Benchmarks/index.rst b/docs/source/OmniaInstallGuide/RHEL/Benchmarks/index.rst similarity index 100% rename from docs/source/InstallationGuides/Benchmarks/index.rst rename to docs/source/OmniaInstallGuide/RHEL/Benchmarks/index.rst diff --git a/docs/source/InstallationGuides/LocalRepo/InputParameters.rst b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/InputParameters.rst similarity index 58% rename from docs/source/InstallationGuides/LocalRepo/InputParameters.rst rename to docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/InputParameters.rst index 8ad1e6e59..d989f6e8f 100644 --- a/docs/source/InstallationGuides/LocalRepo/InputParameters.rst +++ b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/InputParameters.rst @@ -4,66 +4,12 @@ Input parameters for Local Repositories * Input all required values in ``input/software_config.json``. .. csv-table:: Parameters for Software Configuration - :file: ../../Tables/software_config.csv + :file: ../../../Tables/software_config_rhel.csv :header-rows: 1 :keepspace: :class: longtable -Sample version of the file: - -* For Ubuntu: - - :: - - { - "cluster_os_type": "ubuntu", - "cluster_os_version": "22.04", - "repo_config": "partial", - "softwares": [ - {"name": "amdgpu", "version": "6.0"}, - {"name": "cuda", "version": "12.3.2"}, - {"name": "bcm_roce", "version": "229.2.61.0"}, - {"name": "ofed", "version": "24.01-0.3.3.1"}, - {"name": "openldap"}, - {"name": "secure_login_node"}, - {"name": "nfs"}, - {"name": "beegfs", "version": "7.4.2"}, - {"name": "k8s", "version":"1.26.12"}, - {"name": "roce_plugin"}, - {"name": "jupyter"}, - {"name": "kubeflow"}, - {"name": "kserve"}, - {"name": "pytorch"}, - {"name": "tensorflow"}, - {"name": "vllm"}, - {"name": "telemetry"}, - {"name": "ucx", "version": "1.15.0"}, - {"name": "openmpi", "version": "4.1.6"} - ], - - "bcm_roce": [ - {"name": "bcm_roce_libraries", "version": "229.2.61.0"} - ], - "amdgpu": [ - {"name": "rocm", "version": "6.0" } - ], - "vllm": [ - {"name": "vllm_amd"}, - {"name": "vllm_nvidia"} - ], - "pytorch": [ - {"name": "pytorch_cpu"}, - {"name": "pytorch_amd"}, - {"name": "pytorch_nvidia"} - ], - "tensorflow": [ - {"name": "tensorflow_cpu"}, - {"name": "tensorflow_amd"}, - {"name": "tensorflow_nvidia"} - ] - } - -* For RHEL/Rocky Linux OS: +* Sample version for RHEL/Rocky Linux: .. note:: For Rocky Linux OS, the ``cluster_os_type`` in the below sample will be ``rocky``. 
@@ -74,7 +20,7 @@ Sample version of the file: "cluster_os_version": "8.8", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, + {"name": "amdgpu", "version": "6.2.2"}, {"name": "cuda", "version": "12.3.2"}, {"name": "ofed", "version": "24.01-0.3.3.1"}, {"name": "freeipa"}, @@ -83,7 +29,7 @@ Sample version of the file: {"name": "nfs"}, {"name": "beegfs", "version": "7.4.2"}, {"name": "slurm"}, - {"name": "k8s", "version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "jupyter"}, {"name": "kubeflow"}, {"name": "kserve"}, @@ -95,11 +41,12 @@ Sample version of the file: {"name": "amd_benchmarks"}, {"name": "utils"}, {"name": "ucx", "version": "1.15.0"}, - {"name": "openmpi", "version": "4.1.6"} + {"name": "openmpi", "version": "4.1.6"}, + {"name": "csi_driver_powerscale", "version":"v2.11.0"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } ], "vllm": [ {"name": "vllm_amd"}, @@ -119,7 +66,7 @@ Sample version of the file: } -For a list of accepted values in ``softwares``, go to ``input/config//`` and view the list of JSON files available. The filenames present in this location (without the * .json extension) are a list of accepted software names. The repositories to be downloaded for each software are listed the corresponding JSON file. For example, for a cluster running Ubuntu 22.04, go to ``input/config/ubuntu/22.04/`` and view the file list: +For a list of accepted values in ``softwares``, go to ``input/config//`` and view the list of JSON files available. The filenames present in this location (without the * .json extension) are a list of accepted software names. The repositories to be downloaded for each software are listed the corresponding JSON file. For example, for a cluster running RHEL 8.8, go to ``input/config/rhel/8.8/`` and view the file list: :: @@ -131,7 +78,6 @@ For a list of accepted values in ``softwares``, go to ``input/config/`_. +.. note:: To configure a locally available repository that does not have a pre-defined json file, `click here <../AdvancedConfigurationsRHEL/CustomLocalRepo.html>`_. * Input the required values in ``input/local_repo_config.yml``. .. csv-table:: Parameters for Local Repository Configuration - :file: ../../Tables/local_repo_config.csv + :file: ../../../Tables/local_repo_config_rhel.csv :header-rows: 1 :widths: auto -* Input ``docker_username`` and ``docker_password`` in ``input/provision_config_credentials.yml`` to avoid image pullback errors. \ No newline at end of file +* Input ``docker_username`` and ``docker_password`` in ``input/provision_config_credentials.yml`` to avoid image pullback errors. \ No newline at end of file diff --git a/docs/source/InstallationGuides/LocalRepo/Prerequisite.rst b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/Prerequisite.rst similarity index 69% rename from docs/source/InstallationGuides/LocalRepo/Prerequisite.rst rename to docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/Prerequisite.rst index 346e41b81..3f2869e69 100644 --- a/docs/source/InstallationGuides/LocalRepo/Prerequisite.rst +++ b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/Prerequisite.rst @@ -1,35 +1,25 @@ -Before you create local repositories -------------------------------------- +Prerequisites +=============== -**Space considerations** +1. Set the hostname of the OIM in the "hostname.domain name" format. -If all available software stacks are configured, the free space required on the control plane is as below: +.. 
include:: ../../../Appendices/hostnamereqs.rst
 
-    * For packages: 50GB
-    * For images (in ``/var``): 500GB
-    * For storing repositories (the file path should be specified in ``repo_store_path`` in ``input/local_repo_config.yml``): 50GB
+For example, ``controlplane.omnia.test`` is acceptable. ::
 
-**On Ubuntu clusters**
+    hostnamectl set-hostname controlplane.omnia.test
 
-For persistent offline local repositories, (If the parameter ``repo_config`` in ``input/software_config`` is set to ``always``), `click here `_ to set up the required repositories.
+2. Creating user registries
 
-.. note:: This link explains how to build a mirror on an Ubuntu 20.04 server. Adapt the steps and scripts as required for any other version of Ubuntu.
-
-**Set the hostname of the control plane in the "hostname.domain name" format.**
-
-    .. include:: ../../Appendices/hostnamereqs.rst
-
-    For example, ``controlplane.omnia.test`` is acceptable. ::
-
-        hostnamectl set-hostname controlplane.omnia.test
-
-**When creating user registries**
+.. note::
 
-.. note:: Omnia supports only nerdctl and docker registries as ``user_registry``.
+    * The ``user_registry`` in ``input/local_repo_config.yml`` supports only nerdctl and docker registries.
+    * If you define the ``cert_path`` variable, ensure that it points to the absolute path of the user registry certificate present on the Omnia OIM.
+    * To avoid docker pull limits, provide docker credentials (``docker_username``, ``docker_password``) in ``input/provision_config_credentials.yml``.
 
-To avoid docker pull limits, provide docker credentials (``docker_username``, ``docker_password``) in ``input/provision_config_credentials.yml``.
+.. caution:: In order to download the software images from a user registry, the user needs to ensure that the ``user_registry`` address provided in ``input/local_repo_config.yml`` is accessible from the Omnia OIM. If the ``user_registry`` is not accessible from the OIM, Omnia will download all the software images listed in ``input/software_config.json`` to the Omnia-registry. Use the ``curl -k `` command to check.
 
-Images listed in ``user_registry`` in ``input/local_repo_config.yml`` are accessed from user defined registries. To ensure that the control plane can correctly access the registry, ensure that the following naming convention is used to save the image: ::
+Images listed in ``user_registry`` in ``input/local_repo_config.yml`` are accessed from user-defined registries. To ensure that the OIM can correctly access the registry, ensure that the following naming convention is used to save the image: ::
- * If ``repo_config`` in ``local_repo_config.yml`` is set to ``always`` or ``partial``, all images present in the ``input/config//`` folder will be downloaded to the control plane. + * If ``repo_config`` in ``local_repo_config.yml`` is set to ``always`` or ``partial``, all images present in the ``input/config//`` folder will be downloaded to the OIM. - * If the image is defined using a tag, the image will be tagged using :5001/: and pushed to the Omnia local registry. + * If the image is defined using a tag, the image will be tagged using :5001/: and pushed to the Omnia local registry. - * If the image is defined using a digest, the image will be tagged using :5001/:omnia and pushed to the Omnia local registry.repositories + * If the image is defined using a digest, the image will be tagged using :5001/:omnia and pushed to the Omnia local registry.repositories - * When ``repo_config`` in ``local_repo_config.yml`` is set to ``always``, the control plane is set as the default registry mirror. + * When ``repo_config`` in ``local_repo_config.yml`` is set to ``always``, the OIM is set as the default registry mirror. - * When ``repo_config`` in ``local_repo_config`` is set to ``partial``, the ``user_registry`` (if defined) and the control plane are set as default registry mirrors. + * When ``repo_config`` in ``local_repo_config`` is set to ``partial``, the ``user_registry`` (if defined) and the OIM are set as default registry mirrors. To create local repositories, run the following commands: :: cd local_repo ansible-playbook local_repo.yml -.. caution:: During the execution of ``local_repo.yml``, Omnia 1.6.1 will remove packages such as ``podman``, ``containers-common``, and ``buildah`` (if they are already installed), as they conflict with the installation of ``containerd.io`` on RHEL/Rocky Linux OS control plane. +.. caution:: During the execution of ``local_repo.yml``, Omnia 1.7 will remove packages such as ``podman``, ``containers-common``, and ``buildah`` (if they are already installed), as they conflict with the installation of ``containerd.io`` on RHEL/Rocky Linux OS OIM. Verify changes made by the playbook by running ``cat /etc/containerd/certs.d/_default/hosts.toml`` on compute nodes. .. note:: - * View the status of packages for the current run of ``local_repo.yml`` in ``/opt/omnia/offline/download_package_status.csv``. Packages which are already a part of AppStream or BaseOS repositories (for RHEL or Rocky Linux OS) and Focal or Jammy repositories (for Ubuntu) show up as ``Skipped``. - * ``local_repo.yml`` playbook execution fails if any software package download fails. Packages that fail are marked with a "Failed" status. In such a scenario, the user needs to re-run the ``local_repo.yml`` playbook. For more information, `click here <../../Troubleshooting/FAQ.html>`_. + * View the status of packages for the current run of ``local_repo.yml`` in ``/opt/omnia/offline/download_package_status.csv``. Packages which are already a part of AppStream or BaseOS repositories show up as ``Skipped``. + * ``local_repo.yml`` playbook execution fails if any software package download fails. Packages that fail are marked with a "Failed" status. In such a scenario, the user needs to re-run the ``local_repo.yml`` playbook. For more information, `click here <../../../Troubleshooting/FAQ/Common/LocalRepo.html>`_. 
    * If ``repo_config`` is set to ``partial``, packages which are part of the ``user_repo_url`` or images which are part of ``user_registry`` have a ``Skipped`` status in ``/opt/omnia/offline/download_package_status.csv``.
 
     * If any software packages failed to download during the execution of this script, scripts that rely on the package for their working (that is, scripts that install the software) may fail.
 
@@ -42,7 +42,7 @@ To fetch images from the ``user_registry`` or the Omnia local registry, run the
 
 .. note::
 
-    * After ``local_repo.yml`` has run, the value of ``repo_config`` in ``input/software_config.json`` cannot be updated without running the `control_plane_cleanup.yml <../CleanUpScript.html>`_ script first.
+    * After ``local_repo.yml`` has run, the value of ``repo_config`` in ``input/software_config.json`` cannot be updated without running the `oim_cleanup.yml <../../Maintenance/cleanup.html>`_ playbook first.
 
     * To configure additional local repositories after running ``local_repo.yml``, update ``software_config.json`` and re-run ``local_repo.yml``.
 
diff --git a/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/index.rst b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/index.rst
new file mode 100644
index 000000000..cbd617d65
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/index.rst
@@ -0,0 +1,13 @@
+Step 2: Create Local repositories for the cluster
+==================================================
+
+The ``local_repo.yml`` playbook creates offline repositories on the OIM server, which all the cluster nodes will access. This playbook execution requires inputs from ``input/software_config.json`` and ``input/local_repo_config.yml``.
+
+.. caution:: If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+
+.. toctree::
+    Prerequisite
+    InputParameters
+    localrepos
+    RunningLocalRepo
+
diff --git a/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/localrepos.rst b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/localrepos.rst
new file mode 100644
index 000000000..40f6afad1
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/CreateLocalRepo/localrepos.rst
@@ -0,0 +1,373 @@
+Configuring specific local repositories
+-----------------------------------------
+
+**AMD GPU ROCm**
+
+    To install ROCm, do the following:
+
+    * Include the following line under ``softwares`` in ``input/software_config.json``:
+
+      ::
+
+        {"name": "amdgpu", "version": "6.2.2"},
+
+    * Add the following lines below the ``softwares`` section:
+
+      ::
+
+        "amdgpu": [
+            {"name": "rocm", "version": "6.2.2" }
+        ]
+
+    * A sample format is available `here. `_
+
+.. note:: If the ``amdgpu`` group and ``rocm`` subgroup are provided, the AMD GPU drivers are installed during the cluster provisioning process and the AMD ROCm software stack is installed during ``omnia.yml`` playbook execution.
+
+**CUDA**
+
+    To install CUDA, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "cuda", "version": "12.3.2"},
+
+
+    For a list of repositories (and their types) configured for CUDA, view the ``input/config///cuda.json`` file. To customize your CUDA installation, update the file. 
URLs for different versions can be found `here `_:
+
+    For RHEL or Rocky Linux: ::
+
+        {
+          "cuda": {
+            "cluster": [
+              { "package": "cuda",
+                "type": "iso",
+                "url": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda-repo-rhel8-12-3-local-12.3.2_545.23.08-1.x86_64.rpm",
+                "path": ""
+              },
+              { "package": "dkms",
+                "type": "rpm",
+                "repo_name": "epel"
+              }
+            ]
+          }
+        }
+
+
+.. note::
+    * If the package version is customized, ensure that the ``version`` value is updated in ``software_config.json``.
+    * If the target cluster runs on RHEL or Rocky Linux, ensure the "dkms" package is included in ``input/config//8.x/cuda.json`` as illustrated above.
+
+
+**OFED**
+
+    To install OFED, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "ofed", "version": "24.01-0.3.3.1"},
+
+
+    For a list of repositories (and their types) configured for OFED, view the ``input/config///ofed.json`` file. To customize your OFED installation, update the file:
+
+    For RHEL or Rocky Linux: ::
+
+        {
+          "ofed": {
+            "cluster": [
+              { "package": "ofed",
+                "type": "iso",
+                "url": "https://content.mellanox.com/ofed/MLNX_OFED-24.01-0.3.3.1/MLNX_OFED_LINUX-24.01-0.3.3.1-rhel8.7-x86_64.iso",
+                "path": ""
+              }
+            ]
+          }
+        }
+
+.. note:: If the package version is customized, ensure that the ``version`` value is updated in ``software_config.json``.
+
+**BeeGFS**
+
+    To install BeeGFS, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "beegfs", "version": "7.4.2"},
+
+    For information on deploying BeeGFS after setting up the cluster, `click here <../OmniaCluster/BuildingCluster/Storage/BeeGFS.html>`_.
+
+**NFS**
+
+    To install NFS, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "nfs"},
+
+    For information on deploying NFS after setting up the cluster, `click here <../OmniaCluster/BuildingCluster/Storage/NFS.html>`_.
+
+**Kubernetes**
+
+    To install Kubernetes, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "k8s", "version":"1.29.5"},
+
+    For more information about installing Kubernetes, `click here <../OmniaCluster/BuildingCluster/install_kubernetes.html>`_.
+
+.. note:: The version of the software provided above is the only version Omnia supports.
+
+**Slurm**
+
+    To install Slurm, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "slurm"},
+
+    For more information about installing Slurm, `click here <../OmniaCluster/BuildingCluster/install_slurm.html>`_.
+
+.. note:: Omnia recommends installing Slurm with the ``always`` and ``partial`` scenarios of ``repo_config`` in ``input/software_config.json``. This is due to intermittent connectivity issues with the EPEL8 repositories.
+
+**FreeIPA**
+
+    To install FreeIPA, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "freeipa"},
+
+    For more information on FreeIPA, `click here <../OmniaCluster/BuildingCluster/Authentication.html#configuring-freeipa-openldap-security>`_.
+
+
+**OpenLDAP**
+
+    To install OpenLDAP, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "openldap"},
+
+    For more information on OpenLDAP, `click here <../OmniaCluster/BuildingCluster/Authentication.html#configuring-freeipa-openldap-security>`_. 
+
+
+**Secure Login Node**
+
+    To secure the login node, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "secure_login_node"},
+
+    For more information on configuring login node security, `click here <../OmniaCluster/BuildingCluster/Authentication.html#configuring-login-node-security>`_.
+
+
+**Telemetry**
+
+    To install Telemetry, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "telemetry"},
+
+    For information on deploying Telemetry after setting up the cluster, `click here <../../../Telemetry/index.html>`_.
+
+**PowerScale CSI driver**
+
+    To install the PowerScale CSI driver, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "csi_driver_powerscale", "version":"v2.11.0"},
+
+    For information on the PowerScale CSI driver, `click here <../AdvancedConfigurationsRHEL/PowerScale_CSI.html>`_.
+
+**Jupyterhub**
+
+    To install Jupyterhub, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "jupyter"},
+
+    For information on deploying Jupyterhub after setting up the cluster, `click here <../InstallAITools/InstallJupyterhub.html>`_.
+
+
+**Kserve**
+
+    To install Kserve, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "kserve"},
+
+    For information on deploying Kserve after setting up the cluster, `click here <../InstallAITools/kserve.html>`_.
+
+
+**Kubeflow**
+
+    To install Kubeflow, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "kubeflow"},
+
+    For information on deploying Kubeflow after setting up the cluster, `click here <../InstallAITools/kubeflow.html>`_.
+
+
+**PyTorch**
+
+    To install PyTorch, do the following:
+
+    * Include the following line under ``softwares`` in ``input/software_config.json``:
+
+      ::
+
+        {"name": "pytorch"},
+
+    * Add the following lines below the ``softwares`` section:
+
+      ::
+
+        "pytorch": [
+            {"name": "pytorch_cpu"},
+            {"name": "pytorch_amd"},
+            {"name": "pytorch_nvidia"}
+        ],
+
+    * A sample format is available `here. `_
+
+For information on deploying PyTorch after setting up the cluster, `click here. <../InstallAITools/Pytorch.html>`_
+
+
+**TensorFlow**
+
+    To install TensorFlow, do the following:
+
+    * Include the following line under ``softwares`` in ``input/software_config.json``:
+
+      ::
+
+        {"name": "tensorflow"},
+
+    * Add the following lines below the ``softwares`` section:
+
+      ::
+
+        "tensorflow": [
+            {"name": "tensorflow_cpu"},
+            {"name": "tensorflow_amd"},
+            {"name": "tensorflow_nvidia"}
+        ]
+
+    * A sample format is available `here. `_
+
+For information on deploying TensorFlow after setting up the cluster, `click here <../InstallAITools/TensorFlow.html>`_.
+
+
+**vLLM**
+
+    To install vLLM, do the following:
+
+    * Include the following line under ``softwares`` in ``input/software_config.json``:
+
+      ::
+
+        {"name": "vllm"},
+
+    * Add the following lines below the ``softwares`` section:
+
+      ::
+
+        "vllm": [
+            {"name": "vllm_amd"},
+            {"name": "vllm_nvidia"}
+        ],
+
+    * A sample format is available `here. `_
+
+For information on deploying vLLM after setting up the cluster, `click here <../InstallAITools/vLLM/index.html>`_. 
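+
+Putting the two pieces together, the relevant fragment of ``input/software_config.json`` would look like the following sketch (showing only the vLLM-related entries; a real file also contains the other softwares and subgroups): ::
+
+    {
+        "softwares": [
+            {"name": "vllm"}
+        ],
+        "vllm": [
+            {"name": "vllm_amd"},
+            {"name": "vllm_nvidia"}
+        ]
+    }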
+
+
+**OpenMPI**
+
+    To install OpenMPI, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "openmpi", "version":"4.1.6"},
+
+OpenMPI is deployed on the cluster when the above configurations are complete and the `omnia.yml <../OmniaCluster/BuildingCluster/installscheduler.html>`_ playbook is executed.
+
+For more information on OpenMPI configurations, `click here <../AdvancedConfigurationsRHEL/install_ucx_openmpi.html>`_.
+
+.. note:: The default OpenMPI version for Omnia is 4.1.6. If you change the version in the ``software_config.json`` file, make sure to update it in the ``openmpi.json`` file in the ``input/config`` directory as well.
+
+
+**Unified Communication X**
+
+    To install UCX, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "ucx", "version":"1.15.0"},
+
+UCX is deployed on the cluster when the ``local_repo.yml`` playbook is executed, followed by the execution of `omnia.yml <../OmniaCluster/BuildingCluster/installscheduler.html>`_.
+
+For more information on UCX configurations, `click here <../AdvancedConfigurationsRHEL/install_ucx_openmpi.html>`_.
+
+
+**Intel benchmarks**
+
+    To install Intel benchmarks, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "intel_benchmarks", "version": "2024.1.0"},
+
+For more information on Intel benchmarks, `click here <../AdvancedConfigurationsRHEL/AutomatingOneAPI.html>`_.
+
+
+**AMD benchmarks**
+
+    To install AMD benchmarks, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "amd_benchmarks"},
+
+For more information on AMD benchmarks, `click here <../AdvancedConfigurationsRHEL/AutomatingOpenMPI.html>`_.
+
+
+**Custom repositories**
+
+    Include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "custom"},
+
+    Create a ``custom.json`` file in the following directory: ``input/config//`` to define the repositories. For example, for a cluster running RHEL 8.8, go to ``input/config/rhel/8.8/`` and create the file there. The file is a JSON list consisting of the package name, repository type, URL (optional), and version (optional). 
Below is a sample version of the file: ::
+
+    {
+        "custom": {
+            "cluster": [
+                {
+                    "package": "ansible==5.3.2",
+                    "type": "pip_module"
+                },
+                {
+                    "package": "docker-ce-24.0.4",
+                    "type": "rpm",
+                    "repo_name": "docker-ce-repo"
+                },
+                {
+                    "package": "gcc",
+                    "type": "rpm",
+                    "repo_name": "appstream"
+                },
+                {
+                    "package": "community.general",
+                    "type": "ansible_galaxy_collection",
+                    "version": "4.4.0"
+                },
+                {
+                    "package": "perl-Switch",
+                    "type": "rpm",
+                    "repo_name": "codeready-builder"
+                },
+                {
+                    "package": "prometheus-slurm-exporter",
+                    "type": "git",
+                    "url": "https://github.com/vpenso/prometheus-slurm-exporter.git",
+                    "version": "master"
+                },
+                {
+                    "package": "ansible.utils",
+                    "type": "ansible_galaxy_collection",
+                    "version": "2.5.2"
+                },
+                {
+                    "package": "prometheus-2.23.0.linux-amd64",
+                    "type": "tarball",
+                    "url": "https://github.com/prometheus/prometheus/releases/download/v2.23.0/prometheus-2.23.0.linux-amd64.tar.gz"
+                },
+                {
+                    "package": "metallb-native",
+                    "type": "manifest",
+                    "url": "https://raw.githubusercontent.com/metallb/metallb/v0.13.4/config/manifests/metallb-native.yaml"
+                },
+                {
+                    "package": "registry.k8s.io/pause",
+                    "version": "3.9",
+                    "type": "image"
+                }
+            ]
+        }
+    }
+
diff --git a/docs/source/InstallationGuides/Platform/InstallJupyterhub.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/InstallJupyterhub.rst
similarity index 82%
rename from docs/source/InstallationGuides/Platform/InstallJupyterhub.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/InstallJupyterhub.rst
index f11f69efa..050b958c5 100644
--- a/docs/source/InstallationGuides/Platform/InstallJupyterhub.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/InstallJupyterhub.rst
@@ -1,16 +1,16 @@
 Setup Jupyterhub
 -----------------

-Using Jupyterhub helm chart (version 3.2.0), Omnia installs Jupyterhub (version 4.0.2) on Kubernetes clusters. Once Jupyterhub is deployed, log into the GUI to create your own Jupyter notebook. For more information, `click here `_.
+Omnia installs Jupyterhub (version 3.2.0) on Kubernetes clusters. Once Jupyterhub is deployed, log into the GUI to create your own Jupyter notebook. For more information, `click here `_.

 **Prerequisites**

 * Ensure that Kubernetes is deployed and all pods are running on the cluster.
 * MetalLB pod is up and running to provide external IP to jupyterhub service.
 * Ensure the passed inventory file includes ``kube_control_plane`` and ``kube_node`` groups. `Click here <../../samplefiles.html>`_ for a sample file.
-* Review the ``tools/jupyter_config.yml`` file to ensure that the deployment meets your requirements. If not, modify the file.
+* Review the ``tools/jupyterhub_config.yml`` file to ensure that the deployment meets your requirements. If not, modify the file.
 * Run ``local_repo.yml`` with ``jupyter`` entry in ``software_config.json``.
-* Omnia deploys the ``quay.io/jupyterhub/k8s-singleuser-sample:3.2.0`` image irrespective of whether the intended notebooks are CPU-only, NVIDIA GPU, or AMD GPU. To use a custom image, modify the ``omnia/tools/roles/jupyter_config.yml`` file.
+* Omnia deploys the ``quay.io/jupyterhub/k8s-singleuser-sample:3.2.0`` image irrespective of whether the intended notebooks are CPU-only, NVIDIA GPU, or AMD GPU. To use a custom image, modify the ``omnia/tools/jupyterhub_config.yml`` file.
 * Ensure that NFS storage provisioner has been deployed on the cluster using ``storage.yml`` followed by ``scheduler.yml`` or ``omnia.yml``.
Verify that the required NFS storage provisioner is deployed using the below command: ::

    [root@node3 ~]# kubectl get pod -A

@@ -39,7 +39,7 @@ Using Jupyterhub helm chart (version 3.2.0), Omnia installs Jupyterhub (version

 **Accessing the Jupyterhub GUI**

-1. Login to kube control plane and verify that the Jupyterhub service is running.
+1. Log in to the ``kube_control_plane`` and verify that the Jupyterhub service is running.

 2. Find the IP address of the Jupyterhub service using: ::

@@ -55,11 +55,11 @@ Using Jupyterhub helm chart (version 3.2.0), Omnia installs Jupyterhub (version

 3. The Jupyterhub GUI should be accessible from the ``kube_control_plane`` via the external IP mentioned above. Use any browser to log in. Currently Jupyterhub authentication is not linked with openLDAP.

-.. image:: ../../images/Jupyterhub_Login.png
+.. image:: ../../../images/Jupyterhub_Login.png

 4. Choose your preferred notebook server option and click **Start**. A pod will be created for the user.

-.. image:: ../../images/Jupyterhub_UI.png
+.. image:: ../../../images/Jupyterhub_UI.png

 .. role:: raw-role(raw)

@@ -67,7 +67,7 @@ Using Jupyterhub helm chart (version 3.2.0), Omnia installs Jupyterhub (version

 :raw-role:`
`

-.. image:: ../../images/Jupyterhub_UI_2.png
+.. image:: ../../../images/Jupyterhub_UI_2.png

 **Stopping the Notebook server**

@@ -78,7 +78,7 @@ Using Jupyterhub helm chart (version 3.2.0), Omnia installs Jupyterhub (version

 **Redeploy Jupyterhub with new configurations**

-1. Update the ``tools/jupyter_config.yml`` file with the new configuration.
+1. Update the ``tools/jupyterhub_config.yml`` file with the new configuration.

 2. Re-run the ``jupyterhub.yml`` playbook. ::

    cd tools

diff --git a/docs/source/InstallationGuides/Platform/Pytorch.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/Pytorch.rst
similarity index 92%
rename from docs/source/InstallationGuides/Platform/Pytorch.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/Pytorch.rst
index 4f5eb4a4c..2aec1bab1 100644
--- a/docs/source/InstallationGuides/Platform/Pytorch.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/Pytorch.rst
@@ -37,7 +37,7 @@ PyTorch is a popular open-source deep learning framework, renowned for its dynam

 **Accessing PyTorch (CPU)**

-1. Verify that the PyTorch image present in container engine images: ::
+1. Verify that the PyTorch image is present in container engine images: ::

    nerdctl images

@@ -50,7 +50,7 @@ For more information, `click here `_.

+    Omnia deploys the desired AI tool on every node listed in the Kubernetes inventory. Ensure that all the desired nodes are listed in the Kubernetes inventory file when deploying the AI tools via their respective playbooks. For more information on how to set up Kubernetes, `click here <../OmniaCluster/BuildingCluster/install_kubernetes.html>`_.

 .. toctree::
+    InstallJupyterhub
     kubeflow
     vLLM/index

diff --git a/docs/source/InstallationGuides/Platform/kserve.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/kserve.rst
similarity index 89%
rename from docs/source/InstallationGuides/Platform/kserve.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/kserve.rst
index d95e35ac3..ccb6ce147 100644
--- a/docs/source/InstallationGuides/Platform/kserve.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/kserve.rst
@@ -1,11 +1,11 @@
 Setup Kserve
 --------------

-Kserve is an open-source serving platform that simplifies the deployment, scaling, and management of machine learning models in production environments, ensuring efficient and reliable inference capabilities. For more information, `click here. `_ Omnia deploys Kserve (v0.11.2) on the kubernetes cluster. Once Kserve is deployed, any inference service can be installed on the kubernetes cluster.
+Kserve is an open-source serving platform that simplifies the deployment, scaling, and management of machine learning models in production environments, ensuring efficient and reliable inference capabilities. For more information, `click here. `_ Omnia deploys Kserve (v0.13.0) on the kubernetes cluster. Once Kserve is deployed, any inference service can be installed on the kubernetes cluster.

-.. note:: Omnia 1.6.1 does not support deploying both Kserve and Kubeflow in the same Kubernetes cluster. If Kubeflow is already deployed on the cluster and you wish to deploy Kserve, you must first remove Kubeflow by following the steps `here `_.
+.. note:: Omnia 1.7 does not support deploying both Kserve and Kubeflow in the same Kubernetes cluster.
If Kubeflow is already deployed on the cluster and you wish to deploy Kserve, you must first remove Kubeflow by following the steps `here `_.

-.. caution:: Kserve deployment occasionally fails on RHEL 8.8 clusters. `Reprovision the cluster <../reprovisioningthecluster.html>`_ and re-deploy Kserve. For more information, refer to the `known issues <../../Troubleshooting/knownissues.html>`_ section.
+.. caution:: Kserve deployment occasionally fails on RHEL 8.8 clusters. `Reprovision the cluster <../../Maintenance/reprovision.html>`_ and re-deploy Kserve. For more information, refer to the `known issues <../../../Troubleshooting/KnownIssues/RHEL/AITools.html>`_ section.

 **Prerequisites**

@@ -35,9 +35,9 @@ Kserve is an open-source serving platform that simplifies the deployment, scalin

 Post deployment, the following dependencies are installed along with Kserve:

-    * Istio (version: 1.17.0)
-    * Certificate manager (version: 1.13.0)
-    * Knative (version: 1.11.0)
+    * Istio (version: 1.20.4)
+    * Certificate manager (version: 1.14.5)
+    * Knative (version: 1.13.1)

 To verify the installation, run ``kubectl get pod -A`` and look for the namespaces: ``cert-manager``, ``istio-system``, ``knative-serving``, and ``kserve``. ::

@@ -84,9 +84,9 @@ Kserve is an open-source serving platform that simplifies the deployment, scalin

 2. Add ``docker.io`` and ``index.docker.io`` as part of ``registries-skipping-tag-resolving``

-    .. image:: ../../images/kserve_config_map.png
+    .. image:: ../../../images/kserve_config_map.png

-    For more information, `click here. <../../Troubleshooting/knownissues.html>`_
+    For more information, `click here. <../../../Troubleshooting/KnownIssues/Common/AITools.html>`_

 **Access the inference service**

@@ -105,7 +105,7 @@ Kserve is an open-source serving platform that simplifies the deployment, scalin

    istiod                 ClusterIP   10.233.18.185   15010/TCP,15012/TCP,443/TCP,15014/TCP   44h
    knative-local-gateway  ClusterIP   10.233.37.248   80/TCP                                  44h

-3. To access inferencing from the ingressgateway with HOST header, run the below command from the kube_control_plane or kube_node: ::
+3. To access inferencing from the ingressgateway with HOST header, run the below command from the ``kube_control_plane`` or ``kube_node``: ::

    curl -v -H "Host: <service_hostname>" -H "Content-Type: application/json" "http://<ingress_external_ip>:<port>/v1/models/<model_name>:predict" -d @./iris-input.json

@@ -132,7 +132,7 @@ For example: ::

    * Connection #0 to host 10.20.0.101 left intact
    {"predictions":[1,1]}

-.. note:: Refer to `image pull <../../Roles/Utils/pullimagestonodes.html>`_ in case of ImagePullBackOff issue while deploying inference service.
+.. note:: Refer to `image pull <../pullimagestonodes.html>`_ in case of ImagePullBackOff issue while deploying inference service.

 **Remove Kserve**

diff --git a/docs/source/InstallationGuides/Platform/kubeflow.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/kubeflow.rst
similarity index 88%
rename from docs/source/InstallationGuides/Platform/kubeflow.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/kubeflow.rst
index 67c18f2dd..527dcca3d 100644
--- a/docs/source/InstallationGuides/Platform/kubeflow.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/kubeflow.rst
@@ -2,7 +2,7 @@ Setup Kubeflow
 ---------------
 Kubeflow is an open-source platform for machine learning and MLOps on Kubernetes introduced by Google.

-.. note:: Omnia 1.6.1 does not support deploying both Kserve and Kubeflow in the same Kubernetes cluster.
If Kserve is already deployed on the cluster and you wish to deploy Kubeflow, you must first remove Kserve by following the steps `here `_.
+.. note:: Omnia 1.7 does not support deploying both Kserve and Kubeflow in the same Kubernetes cluster. If Kserve is already deployed on the cluster and you wish to deploy Kubeflow, you must first remove Kserve by following the steps `here `_.

 **Prerequisite**

@@ -12,7 +12,7 @@ Ensure that you have executed ``local_repo.yml`` with Kubeflow specified in the

 First, ensure that you have a Kubernetes cluster deployed on your compute node.

-For instructions to set up Kubernetes, `click here <../BuildingClusters/install_kubernetes.html>`_.
+For instructions to set up Kubernetes, `click here <../OmniaCluster/BuildingCluster/install_kubernetes.html>`_.

 .. note:: The playbooks automate the process, ensuring consistency across deployments.

@@ -61,7 +61,7 @@ After obtaining the external IP address of the ingress gateway, you can access t

 * Open any browser of your choice and go to ``http://external_ip:80``.
 * You will be redirected to the Dex login page. You can find a sample image below.

-    .. image:: ../../images/dex_login.png
+    .. image:: ../../../images/dex_login.png

 **Login to the Kubeflow dashboard**

diff --git a/docs/source/InstallationGuides/Platform/vLLM/HuggingFace.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/HuggingFace.rst
similarity index 96%
rename from docs/source/InstallationGuides/Platform/vLLM/HuggingFace.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/HuggingFace.rst
index 4bedef390..f173baa56 100644
--- a/docs/source/InstallationGuides/Platform/vLLM/HuggingFace.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/HuggingFace.rst
@@ -69,7 +69,7 @@ For a complete list of quick start examples, `click here `_

    sh -c 'export http_proxy=http://<proxy_server_ip>:3128 && export https_proxy=http://<proxy_server_ip>:3128 && python -m vllm.entrypoints.api_server --model facebook/opt-125m'

 * Once the above command is executed, vLLM is enabled on port 8000. The user can now use the endpoint to communicate with the model.

@@ -109,7 +109,7 @@ For a complete list of quick start examples, `click here `_

    sh -c 'export http_proxy=http://<proxy_server_ip>:3128 && export https_proxy=http://<proxy_server_ip>:3128 && python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m'

 Expected output:

diff --git a/docs/source/InstallationGuides/Platform/vLLM/benchmarktesting.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/benchmarktesting.rst
similarity index 100%
rename from docs/source/InstallationGuides/Platform/vLLM/benchmarktesting.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/benchmarktesting.rst
diff --git a/docs/source/InstallationGuides/Platform/vLLM/index.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/index.rst
similarity index 64%
rename from docs/source/InstallationGuides/Platform/vLLM/index.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/index.rst
index 9d4d5519e..99518dee6 100644
--- a/docs/source/InstallationGuides/Platform/vLLM/index.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/index.rst
@@ -3,35 +3,35 @@ Setup vLLM

 vLLM is a fast and easy-to-use library for LLM inference and serving. It is seamlessly integrated with popular HuggingFace models. It is also compatible with OpenAI API servers and GPUs (both NVIDIA and AMD). vLLM 0.2.4 and above supports model inferencing and serving on AMD GPUs with ROCm. At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
Data types currently supported in ROCm are FP16 and BF16.

-For NVidia, vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
+For NVIDIA, vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.

-With an Ansible script, deploy vLLM on both the kube_node and kube_control_plane. After the deployment of vLLM, access the vllm container (AMD GPU) and import the vLLM Python package (NVIDIA GPU). For more information, `click here `_
+Omnia deploys vLLM on both the ``kube_node`` and ``kube_control_plane``, using an Ansible script. After the deployment of vLLM, access the vLLM container (AMD GPU) and import the vLLM Python package (NVIDIA GPU). For more information, `click here `_

-.. note:: This playbook is supported on Ubuntu 22.04 and RHEL 8.8.
+.. note:: This playbook has been tested on the RHEL 8.8 OS platform.

 **Prerequisites**

-* Ensure nerdctl is available on all cluster nodes.
+* Ensure that the nerdctl registry is available on all cluster nodes.

-* Only AMD GPUs from the MI200s (gfx90a) are supported.
+* Only AMD MI200s (gfx90a) and newer GPUs are supported.

-* For nodes using NVidia, ensure that the GPU has a compute capacity that is higher than 7 (Eg: V100, T4, RTX20xx, A100, L4, H100, etc).
+* For nodes with NVIDIA GPUs, ensure that the GPU has a minimum compute capability of 7.0 (Volta architecture). A few examples of such NVIDIA GPUs are T4, A100, L4, and H100.

-* Ensure the ``kube_node``, ``kube_control_plane`` is setup and working. If NVidia or AMD GPU acceleration is required for the task, install the NVidia (with containerd) or AMD ROCm GPU drivers during provisioning.
+* Ensure that the ``kube_node`` and ``kube_control_plane`` are set up and working. If NVIDIA or AMD GPU acceleration is required for the task, install the NVIDIA (with containerd) or AMD ROCm GPU drivers during provisioning.

-* Use ``local_repo.yml`` to create an offline vLLM repository. For more information, `click here. <../../LocalRepo/localrepos.html>`_
+* Use ``local_repo.yml`` to create an offline vLLM repository. For more information, `click here. <../../CreateLocalRepo/localrepos.html>`_

 **[Optional prerequisites]**

-* Ensure the system has enough available space. (Approximately 100GiB is required for the vLLM image. Any additional scripting will take disk capacity outside the image.)
+* Ensure the system has enough available space. (Approximately 100GB is required for the vLLM image. Any additional scripting will take disk capacity outside the image.)

-* Ensure the passed inventory file has a ``kube_control_plane`` and ``kube_node`` listing all cluster nodes.
+* Ensure that the provided inventory file has one ``kube_control_plane``, and that all cluster nodes are listed under ``kube_node``.

-* Update the ``/input/software_config.json`` file with the correct vLLM version required. The default value is ``vllm-v0.2.4`` for AMD container and ``vllm latest`` for NVidia.
+* Update the ``/input/software_config.json`` file with the correct vLLM version required. The default value is ``vllm-v0.2.4`` for the AMD container and ``vllm latest`` for NVIDIA.

-* Omnia deploys the vLLM pip installation for NVidia GPU, or ``embeddedllminfo/vllm-rocm:vllm-v0.2.4`` container image for AMD GPU.
+* Omnia deploys the vLLM pip installation for NVIDIA GPUs, or the ``embeddedllminfo/vllm-rocm:vllm-v0.2.4`` container image for AMD GPUs.

-* Nerdctl does not support mounting directories as devices because it is not a feature of containerd (The runtime that nerdctl uses). Individual files need to be attached while running nerdctl.
+* **nerdctl** does not support mounting directories as devices because that is not a feature of containerd (the runtime that nerdctl uses). Individual files need to be attached while running nerdctl.
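+  As an illustration, attaching individual GPU device files while running the AMD vLLM container might look like the sketch below (the device paths shown are assumptions — adjust them to the node's actual GPU setup): ::
+
+      nerdctl run --rm -it --device /dev/kfd --device /dev/dri/renderD128 embeddedllminfo/vllm-rocm:vllm-v0.2.4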
 **Deploying vLLM**

@@ -46,7 +46,7 @@ With an Ansible script, deploy vLLM on both the kube_node and kube_control_plane

 The default namespace for deployment is ``vLLM``.

-.. note:: During the ``vllm.yml`` playbook execution, nodes with AMD or Nvidia GPUs and drivers will install and test either the ``vllm-AMD`` or ``vllm-Nvidia`` containers, respectively.
+.. note:: During the ``vllm.yml`` playbook execution, nodes with AMD or NVIDIA GPUs and drivers will install and test either the ``vllm-AMD`` or ``vllm-Nvidia`` containers, respectively.

 **Accessing the vLLM (AMD)**

@@ -60,11 +60,11 @@ The default namespace is for deployment is ``vLLM``.

 3. To enable an endpoint, `click here `_.

-**Accessing the vLLM (NVidia)**
+**Accessing the vLLM (NVIDIA)**

 1. Verify that the vLLM package is installed: ::

-    python3.9 -c "import vllm; print(vllm.__version__)"
+    python3.11 -c "import vllm; print(vllm.__version__)"

 2. Use the package within a python script as demonstrated in the sample below: ::

diff --git a/docs/source/InstallationGuides/Platform/vLLM/vllmInternet.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/vllmInternet.rst
similarity index 68%
rename from docs/source/InstallationGuides/Platform/vLLM/vllmInternet.rst
rename to docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/vllmInternet.rst
index 54eb8b9de..a5198c306 100644
--- a/docs/source/InstallationGuides/Platform/vLLM/vllmInternet.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/vllmInternet.rst
@@ -5,5 +5,5 @@ To enable internet access within the container, user needs to export ``http_prox

 ::

-   export http_proxy=http://<proxy_server_ip>:3128
-   export https_proxy=http://<proxy_server_ip>:3128
\ No newline at end of file
+    export http_proxy=http://<proxy_server_ip>:3128
+    export https_proxy=http://<proxy_server_ip>:3128
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/vllmMI300.rst b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/vllmMI300.rst
new file mode 100644
index 000000000..bfab57218
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/InstallAITools/vLLM/vllmMI300.rst
@@ -0,0 +1,69 @@
+vLLM enablement for AMD MI300 GPU
+----------------------------------
+
+.. note:: This whole execution will take approximately 3-4 hours.
+
+* MI300 support is enabled with vLLM version 0.3.2
+* The ``vllm_build.yml`` file is located inside ``omnia/utils/vllm_build``.
+
+Follow the below steps to set up vLLM:
+
+1. **Build vLLM**
+
+    * Update the ``admin-nic-IP`` in the ``vllm_k8s_config.yml`` file located inside the ``omnia/utils/vllm_build`` directory.
+
+    * Run the ``vllm_build.yml`` playbook using: ::
+
+        ansible-playbook vllm_build.yml
+
+2. **Verify vLLM**
+
+Once the playbook is executed, run the following command to verify whether vLLM image generation was successful.
+
+::
+
+    nerdctl images | grep vllm
+
+3. Update the "package" and "tag" details in the ``vllm.json`` file located at ``omnia/tools/input/config/rhel/8.8/vllm.json``, as shown below.
+
+::
+
+    "vllm_amd": {
+        "cluster": [
+            {
+                "package": "vllm-rocm",
+                "tag": "latest",
+                "type": "image"
+            }
+        ]
+    }
+
+4. Finally, deploy the latest vLLM using the ``vllm.yml`` playbook located at ``omnia/tools/vllm.yml``.
Use the following command:
+
+::
+
+    ansible-playbook vllm.yml -i inv.ini
+
+A sample inventory is provided below:
+
+::
+
+    inv.ini
+
+    [kube_node]
+    10.5.x.a
+    10.5.x.b
\ No newline at end of file
diff --git a/docs/source/InstallationGuides/BuildingClusters/AMD_ROCm.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/AMD_ROCm.rst
similarity index 86%
rename from docs/source/InstallationGuides/BuildingClusters/AMD_ROCm.rst
rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/AMD_ROCm.rst
index aca37833b..8f7f69fb7 100644
--- a/docs/source/InstallationGuides/BuildingClusters/AMD_ROCm.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/AMD_ROCm.rst
@@ -3,9 +3,9 @@ Install the ROCm platform for AMD GPUs

 This playbook sets up the `AMD ROCm `_ platform on the clusters. This tool allows users to unlock the full potential of installed AMD GPUs.

-Ensure that the ROCm local repositories are configured using the `local_repo.yml <../../InstallationGuides/LocalRepo/index.html>`_ script.
+Ensure that the ROCm local repositories are configured using the `local_repo.yml <../../CreateLocalRepo/localrepos.html#configure-specific-local-repositories>`_ script.

-Ensure that the ``input/software_config.json`` contains valid amdgpu and rocm versions. See `input parameters <../../InstallationGuides/LocalRepo/InputParameters.html>`_ for more information.
+Ensure that the ``input/software_config.json`` contains valid amdgpu and rocm versions. See `input parameters <../../CreateLocalRepo/InputParameters.html>`_ for more information.

 .. note:: AMD ROCm driver installation is not supported by Omnia on Rocky Linux cluster nodes.

@@ -35,7 +35,7 @@ User permissions for ROCm platforms

    /opt/rocm/bin/

-.. image:: ../../images/ROCm_user_permissions.png
+.. image:: ../../../../images/ROCm_user_permissions.png

 For any configuration changes, check out ROCm's official documentation `here. `_

diff --git a/docs/source/InstallationGuides/BuildingClusters/Authentication.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Authentication.rst
similarity index 81%
rename from docs/source/InstallationGuides/BuildingClusters/Authentication.rst
rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Authentication.rst
index d482b254a..0c684fe87 100644
--- a/docs/source/InstallationGuides/BuildingClusters/Authentication.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Authentication.rst
@@ -10,28 +10,35 @@ ______________________________________

 .. note:: FreeIPA configuration is not supported on Ubuntu.

-**Pre requisites**
+**Prerequisites**
+
+* To set up FreeIPA, ensure that the following entry is present in the ``input/software_config.json``: ::
+
+    {"name": "freeipa"}

-* Run ``local_repo.yml`` to create offline repositories of FreeIPA or OpenLDAP. If both were downloaded, ensure that the non-required system is removed from ``input/software_config.json`` before running ``security.yml``. For more information, `click here <../../InstallationGuides/LocalRepo/index.html>`_.
+* To set up OpenLDAP, ensure that the following entry is present in the ``input/software_config.json``: ::

+    {"name": "openldap"}
+
+* Run ``local_repo.yml`` to create offline repositories of FreeIPA or OpenLDAP. If both were downloaded, ensure that the non-required system is removed from ``input/software_config.json`` before running ``security.yml``. For more information, `click here <../../CreateLocalRepo/index.html>`_.
+
+* Enter the following parameters in ``input/security_config.yml``:

 .. csv-table:: Parameters for Authentication
-   :file: ../../Tables/security_config.csv
+   :file: ../../../../Tables/security_config.csv
    :header-rows: 1
    :keepspace:

 .. csv-table:: Parameters for OpenLDAP configuration
-   :file: ../../Tables/security_config_ldap.csv
+   :file: ../../../../Tables/security_config_ldap.csv
    :header-rows: 1
    :keepspace:

 .. csv-table:: Parameters for FreeIPA configuration
-   :file: ../../Tables/security_config_freeipa.csv
+   :file: ../../../../Tables/security_config_freeipa.csv
    :header-rows: 1
    :keepspace:

-.. [1] Boolean parameters do not need to be passed with double or single quotes.

 Running the security role
 --------------------------

@@ -41,13 +48,13 @@ The wrapper playbook ``omnia.yml`` handles execution of the security or authenti

    cd security
    ansible-playbook security.yml -i inventory

-The inventory should contain auth_server as per the inventory file in `samplefiles <../../samplefiles.html#inventory-file>`_. The inventory file is case-sensitive. Follow the format provided in the sample file link.
+The provided inventory should contain ``auth_server`` and ``login`` [optional] groups. The inventory file is case-sensitive. Follow the format provided in the `sample files <../../../samplefiles.html#inventory-file>`_.
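+For reference, an inventory of this shape might look like the minimal sketch below (the IP addresses are placeholders): ::
+
+    [auth_server]
+    10.5.1.105
+
+    [login]
+    10.5.1.106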
-    * Do not include the IP of the control plane or local host as the ``auth_server group`` in the inventory file.
-    * To customize the security features on the login node, update the desired parameters in ``input/login_node_security_config.yml``.
+    * Do not include the IP of the OIM or local host in the ``auth_server`` group of the inventory file.
+    * For `secure login node functionality `_, ensure that the ``login`` group is added in the provided inventory file. To customize the security features on the login node, update the desired parameters in ``input/login_node_security_config.yml``.
     * If a subsequent run of ``security.yml`` fails, the ``security_config.yml`` file will be unencrypted.

-.. note:: Installation of OpenLDAP server or FreeIPA server on Control Plane is not supported.
+.. note:: Installation of OpenLDAP server or FreeIPA server on OIM is not supported.

 .. caution:: No users will be created by Omnia.

@@ -98,7 +105,7 @@ Once user accounts are created, admins can enable passwordless SSH for users to

 .. note:: Once user accounts are created on the auth server, use the accounts to log in to the cluster nodes to reset the password and create a corresponding home directory.

-To customize your setup of passwordless ssh, input parameters in ``input/passwordless_ssh_config.yml``.
+To customize your setup of passwordless SSH, input custom parameters in ``input/passwordless_ssh_config.yml``:

 +-----------------------+--------------------------------------------------------------------------------------------------------------------+
 | Parameter             | Details                                                                                                            |
 +-----------------------+--------------------------------------------------------------------------------------------------------------------+

 Use the below command to enable passwordless SSH: ::

    ansible-playbook user_passwordless_ssh.yml -i inventory

-Where inventory follows the format defined under inventory file in the provided `Sample Files. <../../sample files.html>`_ The inventory file is case-sensitive. Follow the format provided in the sample file link.
+Where inventory follows the format defined under inventory file in the provided `sample files <../../../samplefiles.html>`_. The inventory file is case-sensitive. Follow the format provided in the sample file link.
.. caution:: Do not run ssh-keygen commands after passwordless SSH is set up on the nodes.

@@ -130,7 +137,13 @@
 ________________________________

 **Prerequisites**

-* Run ``local_repo.yml`` to create an offline repository of all utilities used to secure the login node. For more information, `click here. <../../InstallationGuides/LocalRepo/index.html>`_
+* Ensure that the following entry is present in the ``input/software_config.json``: ::
+
+    {"name": "secure_login_node"}
+
+* Run ``local_repo.yml`` to create an offline repository of all utilities used to secure the login node. For more information, `click here. <../../CreateLocalRepo/index.html>`_
+
+* For secure login node functionality, ensure that the ``login`` group is added in the provided inventory file.

 Enter the following parameters in ``input/login_node_security_config.yml``.

@@ -153,9 +166,17 @@ Enter the following parameters in ``input/login_node_security_config.yml``.

 | ``integer``                 |                                                                                                                                                |
 | Optional                    | **Default values**: ``180``                                                                                                                    |
 +-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
-|**alert_email_address**      | Email address used for sending alerts in case of authentication failure. When blank, authentication failure alerts are disabled. Currently, only one email ID is accepted. |
-| ``string``                  |                                                                                                                                                |
-| Optional                    |                                                                                                                                                |
+|**alert_email_address**      | Email address used for sending alerts in case of authentication failure. When blank, authentication failure alerts are disabled.              |
+| ``string``                  | User can mention multiple comma-separated alert email addresses.                                                                              |
+| Optional                    | **Example**: ::                                                                                                                                |
+|                             |     alert_email_address: "user1@domain.com,user2@domain.com"                                                                                   |
++-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
+|**smtp_server**              | This parameter will be applicable only when ``alert_email_address`` is provided.                                                               |
+| ``string``                  | This variable contains the SMTP server details configured on the cluster, from where the email alerts would be sent in case of authentication failures. |
+| Optional                    | Currently, Omnia only supports configuration of a single SMTP server on the cluster. The SMTP server should be reachable from the ``login_node`` to receive the email alerts. |
+|                             | **Example**: ::                                                                                                                                |
+|                             |     smtp_server:                                                                                                                               |
+|                             |       - { host: "smtp-server.domain.com", port: "25", sender_address: "alert@domain.com" }                                                     |
 +-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
 |**user**                     | Access control list of users. Accepted formats are username@ip (root@1.2.3.4) or username (root). Multiple users can be separated using whitespaces. |
 | ``string``                  |                                                                                                                                                |

@@ -189,6 +210,8 @@ Enter the following parameters in ``input/login_node_security_config.yml``.

 Advanced Settings
 ------------------

-* To install FreeIPA server on the NFS node, `click here <../../Roles/Utils/freeipa_installation.html>`_.
+* To install FreeIPA server on the NFS node, `click here <../../../../Utils/freeipa_installation.html>`_.
+
+* To replicate the OpenLDAP server, `click here <../ReplicatingLDAP.html>`_.

-* To replicate the OpenLDAP server `click here `_.
\ No newline at end of file
+* To set up the internal OpenLDAP server as a proxy, `click here <../OpenLDAP_proxy.html>`_.
\ No newline at end of file
diff --git a/docs/source/InstallationGuides/BuildingClusters/BeeGFS.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/BeeGFS.rst
similarity index 87%
rename from docs/source/InstallationGuides/BuildingClusters/BeeGFS.rst
rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/BeeGFS.rst
index 434563212..fbcbdf043 100644
--- a/docs/source/InstallationGuides/BuildingClusters/BeeGFS.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/BeeGFS.rst
@@ -3,13 +3,13 @@ BeeGFS bolt on

 BeeGFS is a hardware-independent POSIX parallel file system (a.k.a. Software-defined Parallel Storage) developed with a strong focus on performance and designed for ease of use, simple installation, and management.

-.. image:: ../../images/BeeGFS_Structure.jpg
+.. image:: ../../../../../images/BeeGFS_Structure.jpg

 **Prerequisites before installing the BeeGFS client**

-* Ensure that the BeeGFS server is set up using the `linked steps <../../Appendices/BeeGFSServer.html>`_.
-* Ensure that a ``connAuthFile`` is configured on the server as explained `here <../../Appendices/BeeGFSServer.html>`_
+* Ensure that the BeeGFS server is set up using the `linked steps <../../../../../Appendices/BeeGFSServer.html>`_.
+* Ensure that a ``connAuthFile`` is configured on the server as explained `here <../../../../../Appendices/BeeGFSServer.html>`_

 .. caution:: Configuring a ``connAuthFile`` is now mandatory. Services will no longer start if a ``connAuthFile`` is not configured.

@@ -29,9 +29,7 @@ BeeGFS is a hardware-independent POSIX parallel file system (a.k.a. Software-def

 | 8006 | Helper service (beegfs-helperd)   |
 +------+-----------------------------------+

-
-
-To open the ports required, use the following steps:
+ To open the ports required, use the following steps:

 1. ``firewall-cmd --permanent --zone=public --add-port=<port>/tcp``

@@ -41,29 +39,25 @@ To open the ports required, use the following steps:

 4. ``systemctl status firewalld``
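+ For example, opening the helper service port (8006) listed in the table above would look like the following (repeat for each required port): ::
+
+    firewall-cmd --permanent --zone=public --add-port=8006/tcp
+    firewall-cmd --reload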
-
-
-
 .. note:: BeeGFS services over RDMA are only supported on RHEL 8.3 and above due to limitations of BeeGFS. When setting up your cluster with RDMA support, check the BeeGFS documentation to provide appropriate values in ``input/storage_config.yml``.

-* If the cluster runs Rocky Linux, ensure that the running versions are compatible by checking our `support matrix <../../Overview/SupportMatrix/OperatingSystems/Rocky.html>`_.
+* If the cluster runs Rocky Linux, ensure that the running versions are compatible by checking our `support matrix <../../../../../Overview/SupportMatrix/OperatingSystems/Rocky.html>`_.

 **Installing the BeeGFS client via Omnia**

-
 After the required parameters are filled in ``input/storage_config.yml``, Omnia installs BeeGFS on all nodes while executing the ``storage.yml`` playbook.

 .. caution:: Do not remove or comment any lines in the ``input/storage_config.yml`` file.

 .. csv-table:: Parameters for storage
-   :file: ../../Tables/storage_config.csv
+   :file: ../../../Tables/storage_config.csv
    :header-rows: 1
    :keepspace:

 .. note:: * BeeGFS client-server communication can take place over TCP or RDMA. If RDMA support is required, ``beegfs_rdma_support`` should be set to true.
           * For BeeGFS communication happening over RDMA, the ``beegfs_mgmt_server`` should be provided with the Infiniband IP of the management server.
-          * The parameter inventory refers to the `inventory file <../../samplefiles.html>`_ listing all relevant nodes.
+          * The parameter inventory refers to the `inventory file <../../../../samplefiles.html>`_ listing all relevant nodes.

 If ``input/storage_config.yml`` is populated before running ``omnia.yml``, BeeGFS client will be set up during the execution of ``omnia.yml``.

diff --git a/docs/source/InstallationGuides/BuildingClusters/NFS.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/NFS.rst
similarity index 93%
rename from docs/source/InstallationGuides/BuildingClusters/NFS.rst
rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/NFS.rst
index a2db90186..990dc89e9 100644
--- a/docs/source/InstallationGuides/BuildingClusters/NFS.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/NFS.rst
@@ -1,11 +1,11 @@
 NFS
-____
+=====

 Network File System (NFS) is a networking protocol for distributed file sharing. A file system defines the way data in the form of files is stored and retrieved from storage devices, such as hard disk drives, solid-state drives and tape drives. NFS is a network file sharing protocol that defines the way files are stored and retrieved from storage devices across networks.

 .. note:: NFS is a mandatory feature for all clusters set up by Omnia. Omnia sets up the NFS server and mounts the NFS client when the ``nfs_server`` value is true.

-**Pre requisites**
+**Prerequisites**

 * NFS is set up on Omnia clusters based on the inputs provided in ``input/storage_config.yml``.

@@ -20,13 +20,13 @@ Network File System (NFS) is a networking protocol for distributed file sharing.

 +-----------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+

-   .. image:: ../../images/nfs_flowchart.png
+   .. image:: ../../../../../images/nfs_flowchart.png

 * The fields listed in ``nfs_client_params`` are:

-    - **server_ip**: IP of the intended NFS server. To set up an NFS server on the control plane, use the value ``localhost``. Use an IP address to configure access anywhere else.
+    - **server_ip**: IP of the intended NFS server. To set up an NFS server on the OIM, use the value ``localhost``. Use an IP address to configure access anywhere else.

     - **server_share_path**: Folder on which the NFS server is mounted.

@@ -46,7 +46,7 @@ Network File System (NFS) is a networking protocol for distributed file sharing.
     - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: true, k8s_share: true }

-   To configure the cluster nodes to access a new NFS server on the control plane as well as an external NFS server, use the below example: ::
+   To configure the cluster nodes to access a new NFS server on the OIM as well as an external NFS server, use the below example: ::

     - { server_ip: localhost, server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: true, k8s_share: true }
     - { server_ip: 198.168.0.1, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false, slurm_share: true, k8s_share: true }

@@ -57,7 +57,7 @@ Network File System (NFS) is a networking protocol for distributed file sharing.

     - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false, slurm_share: true, k8s_share: true }

-* Ensure that an NFS local repository is created by including ``{"name": "nfs"},`` in ``input/software_config.json``. For more information, `click here <../LocalRepo/index.html>`_.
+* Ensure that an NFS local repository is created by including ``{"name": "nfs"},`` in ``input/software_config.json``. For more information, `click here <../../../CreateLocalRepo/index.html>`_.
 * If the intended cluster will run Slurm, set the value of ``slurm_installation_type`` in ``input/omnia_config.yml`` to ``nfs_share``.
 * If an external NFS share is used, make sure that ``/etc/exports`` on the NFS server is populated with the same paths listed as ``server_share_path`` in the ``nfs_client_params`` in ``input/storage_config.yml``.
 * Omnia supports all NFS mount options. Without user input, the default mount options are ``nosuid,rw,sync,hard,intr``.

 Run the ``storage.yml`` playbook: ::

    cd storage
    ansible-playbook storage.yml -i inventory

-Use the linked `inventory file <../../samplefiles.html#inventory-file>`_ for the above playbook.
+Use the linked `inventory file <../../../../samplefiles.html#inventory-file>`_ for the above playbook.

 Post configuration, enable the following services (using this command: ``firewall-cmd --permanent --add-service=<service_name>``) and then reload the firewall (using this command: ``firewall-cmd --reload``).

diff --git a/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/index.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/index.rst
new file mode 100644
index 000000000..9020a0418
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/Storage/index.rst
@@ -0,0 +1,7 @@
+Storage configurations
+========================
+
+.. toctree::
+
+   NFS
+   BeeGFS
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/index.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/index.rst
new file mode 100644
index 000000000..87126e5d6
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/index.rst
@@ -0,0 +1,12 @@
+Building an Omnia Cluster
+===========================
+
+.. toctree::
+   :maxdepth: 2
+
+   AMD_ROCm
+   Authentication
+   Storage/index
+   install_kubernetes
+   install_slurm
+   installscheduler
diff --git a/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/install_kubernetes.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/install_kubernetes.rst
new file mode 100644
index 000000000..c1c0310bf
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/install_kubernetes.rst
@@ -0,0 +1,119 @@
+Set up Kubernetes
+===================
+
+**Prerequisites**
+
+* Ensure that the ``k8s`` entry is present in the ``softwares`` list in ``software_config.json``, as mentioned below:
+  ::
+
+    "softwares": [
+        {"name": "k8s", "version":"1.29.5"},
+    ]
+
+* Ensure that ``local_repo.yml`` is run with the ``k8s`` entry present in ``software_config.json``, to download all required Kubernetes packages and images.
+
+* Once all the required parameters in `omnia_config.yml <../schedulerinputparams.html#id12>`_ are filled in, ``omnia.yml`` can be used to set up Kubernetes.
+
+* Ensure that ``k8s_share`` is set to ``true`` in `storage_config.yml <../schedulerinputparams.html#storage-config-yml>`_, for one of the entries in ``nfs_client_params``.
+
+**Inventory details**
+
+* For Kubernetes, all the applicable inventory groups are ``kube_control_plane``, ``kube_node``, and ``etcd``.
+
+* The inventory file must contain:
+
+    1. Exactly 1 ``kube_control_plane``.
+    2. At least 1 ``kube_node``.
+    3. An odd number of ``etcd`` nodes.
+
+.. note:: Ensure that the inventory includes an ``[etcd]`` node. etcd is a consistent and highly-available key value store used as Kubernetes' backing store for all cluster data. For more information, `click here. `_
+
+**Sample inventory**
+::
+
+    [kube_control_plane]
+    10.5.1.101
+
+    [kube_node]
+    10.5.1.102
+
+    [etcd]
+    10.5.1.101
+
+.. note::
+    If a node has more than one network interface (that is, an additional NIC apart from the admin NIC), update the inventory with the ``ip`` argument set to the required admin IP for that node. For example, if the ``kube_control_plane`` has two interfaces ``eno1`` and ``eno2`` with IPs ``eno1=10.5.0.3`` and ``eno2=198.168.0.19``, the inventory should have the following format: ::
+
+        [kube_control_plane]
+        10.5.0.3 ip=10.5.0.3
+
+        [kube_node]
+        10.5.0.4 ip=10.5.0.4
+
+        [etcd]
+        10.5.0.3 ip=10.5.0.3
+
+**To install Kubernetes**
+
+Run either of the following commands:
+
+    1. ::
+
+        ansible-playbook omnia.yml -i inventory
+
+    2. ::
+
+        ansible-playbook scheduler.yml -i inventory
+
+.. note:: To add new nodes to an existing cluster, click `here. <../../../Maintenance/addnode.html>`_
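+Once the playbook completes, a quick sanity check from the ``kube_control_plane`` might look like the sketch below (the output varies per cluster): ::
+
+    kubectl get nodes
+    kubectl get pod -A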
+
+**Additional installations**
+
+Omnia installs the following packages on top of the Kubernetes stack:
+
+1. *amdgpu-device-plugin (ROCm device plugin)*
+
+    This is a Kubernetes device plugin implementation that enables the registration of AMD GPUs in a container cluster for compute workloads.
+    Click `here `_ for more information.
+
+2. *mpi-operator*
+
+    The MPI Operator makes it easy to run allreduce-style distributed training on Kubernetes.
+    Click `here `_ for more information.
+
+3. *xilinx device plugin*
+
+    The Xilinx FPGA device plugin for Kubernetes is a Daemonset deployed on the Kubernetes (k8s) cluster which allows you to:
+
+    i. Discover the FPGAs inserted in each node of the cluster and expose information about each FPGA, such as the number of FPGAs and the Shell (Target Platform) type.
+
+    ii. Run FPGA accessible containers in the k8s cluster.
+
+    Click `here `_ for more information.
+
+4. *nfs-client-provisioner*
+
+    * NFS subdir external provisioner is an automatic provisioner that uses your existing and already configured NFS server to support dynamic provisioning of Kubernetes Persistent Volumes via Persistent Volume Claims.
+    * The NFS server utilised here is the one mentioned in ``storage_config.yml``.
+    * The server IP is the ``server_ip`` and the path is the ``server_share_path`` of the entry where ``k8s_share`` is set to ``true``.
+
+    Click `here `_ for more information.
+
+5. *nvidia-device-plugin*
+
+    For the NVIDIA device plugin to function seamlessly, Omnia installs the "nvidia-container-toolkit" as part of the ``omnia.yml`` or ``scheduler.yml`` playbook execution. The NVIDIA device plugin for Kubernetes is a "DaemonSet" that allows you to automatically:
+
+    i. Expose the number of GPUs on each node of your cluster
+    ii. Keep track of the health of your GPUs
+    iii. Run GPU enabled containers in your Kubernetes cluster
+
+    Click `here `_ for more information.
+
+**Optional installation**
+
+* `PowerScale CSI drivers <../../AdvancedConfigurationsRHEL/PowerScale_CSI.html>`_
\ No newline at end of file
diff --git a/docs/source/InstallationGuides/BuildingClusters/install_slurm.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/install_slurm.rst
similarity index 85%
rename from docs/source/InstallationGuides/BuildingClusters/install_slurm.rst
rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/install_slurm.rst
index 0a53f6947..bc9551537 100644
--- a/docs/source/InstallationGuides/BuildingClusters/install_slurm.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/install_slurm.rst
@@ -1,4 +1,4 @@
-Install Slurm
+Set up Slurm
 ==============

 .. note:: Omnia supports slurm installation only on RHEL and Rocky Linux (not supported on Ubuntu).

@@ -14,9 +14,9 @@ Install Slurm

 * Ensure that ``local_repo.yml`` is run with the ``slurm`` entry present in ``software_config.json``, to download all required slurm packages.

-* Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up Slurm.
+* Once all the required parameters in `omnia_config.yml <../schedulerinputparams.html#id13>`_ are filled in, ``omnia.yml`` can be used to set up Slurm.

-* When ``slurm_installation_type`` is ``nfs_share`` in ``omnia_config.yml``, ensure that ``slurm_share`` is set to ``true`` in `storage_config.yml `_, for one of the entries in ``nfs_client_params``.
+* When ``slurm_installation_type`` is ``nfs_share`` in ``omnia_config.yml``, ensure that ``slurm_share`` is set to ``true`` in `storage_config.yml <../schedulerinputparams.html#id17>`_, for one of the entries in ``nfs_client_params``.

 **Inventory details**

@@ -56,7 +56,7 @@

    ansible-playbook scheduler.yml -i inventory

-.. note:: To add new nodes to an existing cluster, click `here. <../addinganewnode.html>`_
+.. note:: To add new nodes to an existing cluster, click `here. <../../../Maintenance/addnode.html>`_
 **Slurm job based user access**

diff --git a/docs/source/InstallationGuides/BuildingClusters/installscheduler.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/installscheduler.rst
similarity index 70%
rename from docs/source/InstallationGuides/BuildingClusters/installscheduler.rst
rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/installscheduler.rst
index 2450e53db..b6fab88b9 100644
--- a/docs/source/InstallationGuides/BuildingClusters/installscheduler.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/BuildingCluster/installscheduler.rst
@@ -1,10 +1,9 @@
-Building clusters
-------------------
+Cluster formation
+=====================

-1. In the ``input/omnia_config.yml``, ``input/security_config.yml``, ``input/telemetry_config.yml`` and [optional] ``input/storage_config.yml`` files, provide the `required details `_.
+1. In the ``input/omnia_config.yml``, ``input/security_config.yml``, and ``input/storage_config.yml`` files, provide the `required details <../schedulerinputparams.html>`_. For ``input/telemetry_config.yml``, the details can be found `here <../../../../Telemetry/index.html#id13>`_.

-
-2. Create an inventory file in the *omnia* folder. Check out the `sample inventory <../../samplefiles.html>`_ for more information. If a hostname is used to refer to the target nodes, ensure that the domain name is included in the entry. IP addresses are also accepted in the inventory file.
+2. Create an inventory file in the *omnia* folder. Check out the `sample inventory <../../../samplefiles.html>`_ for more information. If a hostname is used to refer to the target nodes, ensure that the domain name is included in the entry. IP addresses are also accepted in the inventory file.

 .. include:: ../../Appendices/hostnamereqs.rst

@@ -16,10 +15,10 @@ Building clusters

 3. ``omnia.yml`` is a wrapper playbook comprising:

     i. ``security.yml``: This playbook sets up centralized authentication (LDAP/FreeIPA) on the cluster. For more information, `click here. `_
-    ii. ``storage.yml``: This playbook sets up storage tools like `BeeGFS `_ and `NFS `_.
+    ii. ``storage.yml``: This playbook sets up storage tools like `BeeGFS `_ and `NFS `_.
     iii. ``scheduler.yml``: This playbook sets up job schedulers (`Slurm `_ or `Kubernetes `_) on the cluster.
-    iv. ``telemetry.yml``: This playbook sets up `Omnia telemetry and/or iDRAC telemetry <../../Roles/Telemetry/index.html>`_. It also installs `Grafana `_ and `Loki `_ as Kubernetes pods.
-    v. ``rocm_installation.yml``: This playbook sets up the `ROCm platform for AMD GPU accelerators <../../BuildingClusters/AMD_ROCm.html>`_.
+    iv. ``telemetry.yml``: This playbook sets up `Omnia telemetry and/or iDRAC telemetry <../../../../Telemetry/index.html>`_. It also installs `Grafana `_ and `Loki `_ as Kubernetes pods.
+    v. ``rocm_installation.yml``: This playbook sets up the `ROCm platform for AMD GPU accelerators `_.

 To run ``omnia.yml``: ::

@@ -35,5 +34,5 @@ To run ``omnia.yml``: ::

 * Use the ansible-vault view or edit commands and not the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permission to the parameter files.

-4. Once ``omnia.yml`` playbook is successfully executed, the cluster is up and running with the required application stack. Now you can install `AI tools <../Platform/index.html>`_ or utilize the cluster for job execution.
+4.
Once the ``omnia.yml`` playbook is successfully executed, the cluster is up and running with the required application stack. Now you can install `AI tools <../InstallAITools/index.html>`_ or utilize the cluster for job execution.

diff --git a/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/OpenLDAP_proxy.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/OpenLDAP_proxy.rst
new file mode 100644
index 000000000..a2fca258c
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/OpenLDAP_proxy.rst
@@ -0,0 +1,170 @@
+Setting up OpenLDAP as a proxy server
+=======================================
+
+Omnia allows the internal OpenLDAP server to be configured as a proxy, where it utilizes the external LDAP servers as a backend database to store user data and acts as an authentication entity to allow or deny them access to the cluster. The OpenLDAP client is configured through the proxy server, which means that there is no direct communication between the OpenLDAP client and the external LDAP server.
+
+.. note:: If the OpenLDAP server is set up as a proxy, the user database is not replicated onto the server.
+
+Perform the following steps to configure OpenLDAP as a proxy server:
+
+1. Before proceeding with the new configuration, remove the existing LDAP configuration by deleting the ``/usr/local/openldap/etc/openldap/slapd.d/`` folder and then recreating the same directory using the ``mkdir`` command. Execute the following commands to perform these operations: ::
+
+    rm -rf /usr/local/openldap/etc/openldap/slapd.d/
+    mkdir /usr/local/openldap/etc/openldap/slapd.d/
+
+2. Now, locate the ``slapd.conf`` config file present in ``/usr/local/openldap/etc/openldap/`` and modify the file to add the new LDAP configurations.
Add the following lines to the config file based on the operating system running on the cluster:
+
+    For RHEL/Rocky Linux: ::
+
+        include /usr/local/openldap/etc/openldap/schema/core.schema
+        include /usr/local/openldap/etc/openldap/schema/cosine.schema
+        include /usr/local/openldap/etc/openldap/schema/nis.schema
+        include /usr/local/openldap/etc/openldap/schema/inetorgperson.schema
+
+
+        pidfile /usr/local/openldap/var/run/slapd.pid
+        argsfile /usr/local/openldap/var/run/slapd.args
+
+        # Load dynamic backend modules:
+        modulepath /usr/local/openldap/libexec/openldap
+        moduleload back_ldap.la
+        moduleload back_meta.la
+
+        #######################################################################
+        # Meta database definitions
+        #######################################################################
+        database meta
+        suffix "dc=phantom,dc=test"
+        rootdn cn=admin,dc=phantom,dc=test
+        rootpw Dell1234
+
+        uri "ldap://10.5.0.104:389/dc=phantom,dc=test"
+        suffixmassage "dc=phantom,dc=test" "dc=perf,dc=test"
+        idassert-bind
+          bindmethod=simple
+          binddn="cn=admin,dc=perf,dc=test"
+          credentials="Dell1234"
+          flags=override
+          mode=none
+        TLSCACertificateFile /etc/openldap/certs/ldapserver.crt
+        TLSCertificateFile /etc/openldap/certs/ldapserver.crt
+        TLSCertificateKeyFile /etc/pki/tls/certs/ldapserver.key
+
+    For Ubuntu: ::
+
+        include /usr/local/openldap/etc/openldap/schema/core.schema
+        include /usr/local/openldap/etc/openldap/schema/cosine.schema
+        include /usr/local/openldap/etc/openldap/schema/nis.schema
+        include /usr/local/openldap/etc/openldap/schema/inetorgperson.schema
+
+
+        pidfile /usr/local/openldap/var/run/slapd.pid
+        argsfile /usr/local/openldap/var/run/slapd.args
+
+        # Load dynamic backend modules:
+        modulepath /usr/local/openldap/libexec/openldap
+        moduleload back_ldap.la
+        moduleload back_meta.la
+
+        #######################################################################
+        # Meta database definitions
+        #######################################################################
+        database meta
+        suffix "dc=phantom,dc=test"
+        rootdn cn=admin,dc=phantom,dc=test
+        rootpw Dell1234
+
+        uri "ldap://10.5.0.104:389/dc=phantom,dc=test"
+        suffixmassage "dc=phantom,dc=test" "dc=perf,dc=test"
+        idassert-bind
+          bindmethod=simple
+          binddn="cn=admin,dc=perf,dc=test"
+          credentials="Dell1234"
+          flags=override
+          mode=none
+        TLSCACertificateFile /etc/ssl/certs/ca-certificates.crt
+        TLSCertificateFile /etc/ssl/certs/ssl-cert-snakeoil.pem
+        TLSCertificateKeyFile /etc/ssl/private/ssl-cert-snakeoil.key
+
+Change the following values in the config file, as described below:
+
+* **database**: Database used in the ``slapd.conf`` file, that captures the details of the external LDAP server to be used. For example, ``meta``.
+* **suffix**: Captures the domain name of the internal OpenLDAP user, to refine the user search while attempting to authenticate the user. For example, ``"dc=omnia,dc=test"``.
+* **rootdn**: Admin or root username of the internal OpenLDAP server set up by Omnia. For example, ``cn=admin,dc=omnia,dc=test``.
+* **rootpw**: Admin password for the internal OpenLDAP server. For example, ``Dell1234``.
+
+* **uri**: Captures the IP of the external LDAP server along with the port and the domain of the user in the ``"ldap://<ldap_server_ip>:<port>/<domain>"`` format. For example, ``"ldap://10.5.0.104:389/dc=omnia,dc=test"``.
+* **suffixmassage**: ``suffixmassage`` allows you to dynamically move the LDAP client information from the existing internal OpenLDAP server to the external LDAP server that you want to configure as a proxy.
This is provided in the ``suffixmassage "<internal_suffix>" "<external_suffix>"`` format.
+
+    * ``<internal_suffix>`` is the internal OpenLDAP server suffix (base DN).
+    * ``<external_suffix>`` is the external LDAP server suffix (base DN).
+
+* **binddn**: Admin username and domain of the external LDAP server.
+* **credentials**: Admin password for the external LDAP server.
+
+* **TLSCACertificateFile**: Omnia, by default, creates the TLS CA certificate in ``/etc/openldap/certs/ldapserver.crt`` for RHEL/Rocky Linux or in ``/etc/ssl/certs/ca-certificates.crt`` for Ubuntu.
+* **TLSCertificateFile**: Omnia, by default, creates the TLS certificate in ``/etc/openldap/certs/ldapserver.crt`` for RHEL/Rocky Linux or in ``/etc/ssl/certs/ssl-cert-snakeoil.pem`` for Ubuntu.
+* **TLSCertificateKeyFile**: Omnia, by default, creates the certificate key file in ``/etc/pki/tls/certs/ldapserver.key`` for RHEL/Rocky Linux or in ``/etc/ssl/private/ssl-cert-snakeoil.key`` for Ubuntu.
+
+.. note::
+    * The values for the ``suffix`` and ``rootdn`` parameters in the ``slapd.conf`` file must be the same as those provided in the ``input/security_config.yml`` file.
+
+    * If you have your own set of TLS certificates and keys that you want to utilize instead of the default ones created by Omnia, you can provide the path to them in the ``input/security_config.yml`` file. During ``omnia.yml`` execution, the user-provided certificates and key files are copied from the OIM to the ``auth_server`` (OpenLDAP). An example of the certificate and key entries in the ``input/security_config.yml`` file for the proxy OpenLDAP server is provided below: ::
+
+        # Certificate Authority(CA) issued certificate file path
+        tls_ca_certificate: "/root/certificates/omnia_ca_cert.crt"
+        # OpenLDAP Certificate file path
+        tls_certificate: "/root/certificates/omnia_cert.pem"
+        # OpenLDAP Certificate key file path
+        tls_certificate_key: "/root/certificates/omnia_cert_key.key"
+
+      Use the same certificates and keys in the ``slapd.conf`` file, as shown below:
+
+      Ubuntu: ::
+
+        TLSCACertificateFile /etc/ssl/certs/omnia_ca_cert.crt
+        TLSCertificateFile /etc/ssl/certs/omnia_cert.pem
+        TLSCertificateKeyFile /etc/ssl/private/omnia_cert_key.key
+
+      RHEL/ROCKY LINUX: ::
+
+        TLSCACertificateFile /etc/pki/tls/certs/omnia_ca_cert.crt
+        TLSCertificateFile /etc/pki/tls/certs/omnia_cert.pem
+        TLSCertificateKeyFile /etc/pki/tls/certs/omnia_cert_key.key
+
+    * Multiple external LDAP servers can also be configured on the proxy server. The OpenLDAP proxy server allows users from multiple external LDAP servers to authenticate onto the cluster. You can provide two sets of external LDAP server details as shown below: ::
+
+        uri "ldap://10.5.0.104:389/dc=omnia1,dc=test"
+        idassert-bind
+        bindmethod=simple
+        binddn="cn=admin,dc=omnia,dc=test"
+        credentials="Dell1234"
+        flags=override
+        mode=none
+
+        uri "ldap://10.5.0.105:389/dc=omnia2,dc=test"
+        idassert-bind
+        bindmethod=simple
+        binddn="cn=admin,dc=omnia,dc=test"
+        credentials="Dell12345"
+        flags=override
+        mode=none
+
+3. Once the new configurations are present in the ``slapd.conf`` file, execute the following OpenLDAP ``slaptest`` command to apply the configurations: ::
+
+    slaptest -f /usr/local/openldap/etc/openldap/slapd.conf -F /usr/local/openldap/etc/openldap/slapd.d
+
+
+4. Change the ownership of the generated configuration to the ``ldap`` user and set the necessary file permissions. 
Execute the following commands to do so: ::
+
+    chown -R ldap:ldap /usr/local/openldap/etc/openldap/slapd.d/
+    chown root:ldap /usr/local/openldap/etc/openldap/slapd.d/
+    chmod -R 754 /usr/local/openldap/etc/openldap/slapd.d/
+    chmod 770 /usr/local/openldap/etc/openldap/slapd.d/
+
+5. Restart the internal OpenLDAP server to apply the new configuration. Execute the following command to restart the server: ::
+
+    systemctl restart slapd-ltb.service
+
+
+Once these configurations are applied on the internal OpenLDAP server, the external LDAP server acts as the authentication server for the cluster. The internal OpenLDAP server doesn't store any user data, and no users can be created or modified from it.
\ No newline at end of file
diff --git a/docs/source/InstallationGuides/BuildingClusters/ReplicatingLDAP.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/ReplicatingLDAP.rst
similarity index 100%
rename from docs/source/InstallationGuides/BuildingClusters/ReplicatingLDAP.rst
rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/ReplicatingLDAP.rst
diff --git a/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/index.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/index.rst
new file mode 100644
index 000000000..2009165cd
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/index.rst
@@ -0,0 +1,27 @@
+Step 5: Configure the cluster
+================================
+
+**Features enabled by omnia.yml**:
+
+    * **Centralized authentication**: Once all the required parameters in `security_config.yml `_ are filled in, ``omnia.yml`` can be used to set up FreeIPA/OpenLDAP.
+
+    * **Slurm**: Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up Slurm.
+
+    * **Kubernetes**: Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up Kubernetes.
+
+    * **Login Node (Additionally secure login node)**
+
+.. caution:: If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+
+.. toctree::
+   :maxdepth: 2
+
+   schedulerprereqs
+   schedulerinputparams
+   BuildingCluster/index
+
+
+
+
+
+
diff --git a/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/schedulerinputparams.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/schedulerinputparams.rst
new file mode 100644
index 000000000..329368b6a
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/schedulerinputparams.rst
@@ -0,0 +1,61 @@
+Input parameters for the cluster
+===================================
+
+These parameters are located in ``input/omnia_config.yml``, ``input/security_config.yml``, and ``input/storage_config.yml``. To initiate telemetry support, fill out `these parameters <../../../Telemetry/index.html#id13>`_ in ``input/telemetry_config.yml``.
+
+.. caution:: Do not remove or comment any lines in the ``input/omnia_config.yml``, ``input/security_config.yml``, ``input/telemetry_config.yml``, and ``input/storage_config.yml`` files.
+
+omnia_config.yml
+-------------------
+
+.. csv-table:: Parameters for Kubernetes setup
+   :file: ../../../Tables/scheduler_k8s_rhel.csv
+   :header-rows: 1
+   :keepspace:
+
+.. csv-table:: Parameters for Slurm setup
+   :file: ../../../Tables/scheduler_slurm.csv
+   :header-rows: 1
+   :keepspace:
+
+security_config.yml
+---------------------
+
+.. 
csv-table:: Parameters for Authentication + :file: ../../../Tables/security_config.csv + :header-rows: 1 + :keepspace: + +.. csv-table:: Parameters for OpenLDAP configuration + :file: ../../../Tables/security_config_ldap.csv + :header-rows: 1 + :keepspace: + +.. csv-table:: Parameters for FreeIPA configuration + :file: ../../../Tables/security_config_freeipa.csv + :header-rows: 1 + :keepspace: + + +storage_config.yml +-------------------- + +.. csv-table:: Parameters for Storage + :file: ../../../Tables/storage_config.csv + :header-rows: 1 + :keepspace: + + +Click here for more information on `OpenLDAP, FreeIPA `_, `BeeGFS `_, or `NFS `_. + +.. note:: + + * The ``input/omnia_config.yml`` and ``input/security_config.yml`` files are encrypted during the execution of ``omnia.yml`` playbook. Use the below commands to edit the encrypted input files: + + * ``omnia_config.yml``: :: + + ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key + + * ``security_config.yml``: :: + + ansible-vault edit security_config.yml --vault-password-file .security_vault.key \ No newline at end of file diff --git a/docs/source/InstallationGuides/BuildingClusters/schedulerprereqs.rst b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/schedulerprereqs.rst similarity index 71% rename from docs/source/InstallationGuides/BuildingClusters/schedulerprereqs.rst rename to docs/source/OmniaInstallGuide/RHEL/OmniaCluster/schedulerprereqs.rst index 4ae7f5c7d..62c3be11a 100644 --- a/docs/source/InstallationGuides/BuildingClusters/schedulerprereqs.rst +++ b/docs/source/OmniaInstallGuide/RHEL/OmniaCluster/schedulerprereqs.rst @@ -1,20 +1,20 @@ Before you build clusters -------------------------- -* `Ensure that all cluster nodes are up and running <../InstallingProvisionTool/ViewingDB.html>`_. +* `Ensure that all cluster nodes are up and running <../Provision/ViewingDB.html>`_. * Verify that the inventory file is updated as mentioned in the `inventory sample file <../../samplefiles.html>`_. * For Slurm, all the applicable inventory groups are ``slurm_control_node``, ``slurm_node``, and ``login``. * For Kubernetes, all the applicable groups are ``kube_control_plane``, ``kube_node``, and ``etcd``. - * For Slurm, all the applicable inventory groups are ``slurm_control_node``, ``slurm_node``, and ``login``. * The centralized authentication server inventory group, that is ``auth_server``, is common for both Slurm and Kubernetes. + * For secure login node functionality, ensure to add the ``login`` group in the provided inventory file. * Verify that all nodes are assigned a group. The inventory file is case-sensitive. Follow the format provided in the `sample file link <../../samplefiles.html>`_. .. note:: * The inventory file accepts both IPs and FQDNs as long as they can be resolved by DNS. - * In a multi-node setup, an IP cannot be listed as a control plane and a compute node simultaneously. That is, don't include the ``kube_control_plane`` IP address in the compute node group. In a single node setup, the compute node and the ``kube_control_plane`` must be the same. + * In a multi-node setup, an IP cannot be listed as a control node and a compute node simultaneously. That is, don't include the ``kube_control_plane`` IP address in the compute node group. In a single node setup, the compute node and the ``kube_control_plane`` must be the same. * Users should also ensure that all repositories are available on the cluster nodes. 
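+
+For quick reference, a minimal inventory laid out with these groups might look like the following sketch (the hostnames are placeholders; the `sample file <../../samplefiles.html>`_ remains the authoritative format): ::
+
+    [slurm_control_node]
+    node001.omnia.test
+
+    [slurm_node]
+    node002.omnia.test
+    node003.omnia.test
+
+    [login]
+    node004.omnia.test
+
+    [kube_control_plane]
+    node001.omnia.test
+
+    [kube_node]
+    node002.omnia.test
+    node003.omnia.test
+
+    [etcd]
+    node001.omnia.test
+
+    [auth_server]
+    node005.omnia.test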
diff --git a/docs/source/OmniaInstallGuide/RHEL/Prereq.sh/index.rst b/docs/source/OmniaInstallGuide/RHEL/Prereq.sh/index.rst
new file mode 100644
index 000000000..da14d0903
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Prereq.sh/index.rst
@@ -0,0 +1,37 @@
+Step 1: Execute prereq.sh
+===========================
+
+Starting from version 1.7, Omnia is executed within a Python virtual environment, which is set up by the ``prereq.sh`` script. This script installs Python 3.11, creates the Python virtual environment, and installs Ansible 9.5.1 along with the other software packages required by Omnia on the OIM. The predefined path for this virtual environment is ``/opt/omnia/omnia17_venv``. This approach ensures that Omnia has the correct dependencies and runs smoothly within a controlled and isolated environment.
+
+.. caution::
+
+    * To run Omnia, it is crucial to use the Python virtual environment created by the ``prereq.sh`` script. Do not delete the virtual environment directory (``/opt/omnia/omnia17_venv/``) as it is necessary for the proper functioning of Omnia.
+    * If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+    * Ensure that you execute the Omnia playbooks from inside the cloned Omnia repository folder. Executing the playbooks from outside it leads to playbook execution failures.
+
+* Use the following command to execute the ``prereq.sh`` script on the OIM: ::
+
+    cd omnia
+    ./prereq.sh
+
+* To activate the virtual environment, use the following command: ::
+
+    source /opt/omnia/omnia17_venv/bin/activate
+
+* To verify that the virtual environment is active, check if the following prompt is displayed: ::
+
+    (omnia) [root@ omnia]#
+
+.. note::
+    * Omnia recommends disabling SELinux before proceeding with the installation. If SELinux is not disabled, the script disables it and you are prompted to reboot the OIM.
+    * The file ``input/software_config.json`` is overwritten with the default values (based on the operating system) when ``prereq.sh`` is executed.
+
+
+.. note::
+
+    If you want to deactivate the virtual environment set up by the ``prereq.sh`` script, use the following command from within the activated virtual environment: ::
+
+        deactivate
+
+.. caution:: If you want to delete and recreate the Omnia-created virtual environment, ensure that you back up the pip packages before doing so. To back up the packages, run the ``pip freeze >> omnia_venv_pip_reqs.txt`` command from within the activated virtual environment. This command creates a backup file called ``omnia_venv_pip_reqs.txt`` in the current directory. After you have recreated the virtual environment using the ``prereq.sh`` script, restore the pip packages from the activated virtual environment using the ``pip install -r omnia_venv_pip_reqs.txt`` command. 
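+
+For reference, the complete backup-and-restore flow described above might look like the following sketch (assuming the default virtual environment path): ::
+
+    # Back up the pip packages from the currently active virtual environment
+    source /opt/omnia/omnia17_venv/bin/activate
+    pip freeze >> omnia_venv_pip_reqs.txt
+    deactivate
+
+    # After deleting the old virtual environment, recreate it via prereq.sh,
+    # activate it again, and restore the backed-up packages
+    ./prereq.sh
+    source /opt/omnia/omnia17_venv/bin/activate
+    pip install -r omnia_venv_pip_reqs.txt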
+
diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/bmc.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/bmc.rst
similarity index 64%
rename from docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/bmc.rst
rename to docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/bmc.rst
index bc1c8ca27..37dce5aee 100644
--- a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/bmc.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/bmc.rst
@@ -5,8 +5,8 @@ For automatic provisioning of servers and discovery, the BMC method can be used.
 
 **Pre requisites**
 
-* Set the IP address of the control plane. The control plane NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid set up. In the case dedicated network topology, a single IP (admin IP) is required.
-.. image:: ../../../images/ControlPlaneNic.png
+* Set the IP address of the OIM. The OIM NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid setup. In the case of a dedicated network topology, a single IP (admin IP) is required.
+.. image:: ../../../../images/ControlPlaneNic.png
 
 * To assign IPs on the BMC network while discovering servers using a BMC details, target servers should be in DHCP mode or switch details should be provided.
 
@@ -14,7 +14,7 @@ For automatic provisioning of servers and discovery, the BMC method can be used.
 
 * Target servers should be configured to boot in PXE mode with the appropriate NIC as the first boot device.
 
-* If the ``discovery_ranges`` provided are outside the ``bmc_subnet``, ensure the target nodes can reach the control plane.
+* If the ``discovery_ranges`` provided are outside the ``bmc_subnet``, ensure the target nodes can reach the OIM.
 
 * IPMI over LAN needs to be enabled for the BMC. ::
 
@@ -22,14 +22,14 @@ For automatic provisioning of servers and discovery, the BMC method can be used.
     racadm set iDRAC.IPMILan.Enable 1
     racadm get iDRAC.IPMILan
 
-
-.. caution:: If you are re-provisioning your cluster (that is, re-running the ``discovery_provision.yml`` playbook) after a `clean-up <../../CleanUpScript.html>`_, ensure to use a different ``static_range`` against ``bmc_network`` in ``input/network_spec.yml`` to avoid a conflict with newly assigned servers. Alternatively, disable any OS available in the ``Boot Option Enable/Disable`` section of your BIOS settings (**BIOS Settings > Boot Settings > UEFI Boot Settings**) on all target nodes.
+.. caution:: If you are re-provisioning your cluster (that is, re-running the ``discovery_provision.yml`` playbook) after a `clean-up <../../../Maintenance/cleanup.html>`_, ensure to use a different ``static_range`` against ``bmc_network`` in ``input/network_spec.yml`` to avoid a conflict with newly assigned servers. Alternatively, disable any OS available in the ``Boot Option Enable/Disable`` section of your BIOS settings (**BIOS Settings > Boot Settings > UEFI Boot Settings**) on all target nodes.
 
 - All target servers should be reachable from the ``admin_network`` specified in ``input/network_spec.yml``.
 
 * BMC network details should be provided in the ``input/network_spec.yml`` file.
 
-When entering details in ``input/network_spec.yml``:
+A few things to keep in mind while entering details in ``input/network_spec.yml``:
+
+    * Ensure that the netmask bits for the BMC network and the admin network are the same. 
 * The static and dynamic ranges for the BMC network accepts multiple comma-separated ranges.
@@ -38,6 +38,6 @@ When entering details in ``input/network_spec.yml``:
 
 .. note:: If the value of ``enable_switch_based`` is set to true, nodes will not be discovered via BMC irrespective of the contents in ``input/network_spec.yml``.
 
-To continue to the next steps:
+Next step:
 
 * `Provisioning the cluster <../installprovisiontool.html>`_
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/index.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/index.rst
new file mode 100644
index 000000000..63456bf61
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/index.rst
@@ -0,0 +1,80 @@
+Discovery Mechanisms
+=====================
+
+Depending on the values provided in ``input/provision_config.yml``, target nodes can be discovered in one of three ways:
+
+.. toctree::
+   :hidden:
+
+   switch-based
+   mappingfile
+   bmc
+
+
+switch_based
+------------
+
+Omnia can query known switches (by SNMPv3 username/password) for information on target node MAC IDs.
+
++---------------------------------------------------------+------------------------------------------------------+
+| Pros                                                    | Cons                                                 |
++=========================================================+======================================================+
+| The entire discovery process is totally automatic.      | Users need to enable IPMI on target servers.         |
++---------------------------------------------------------+------------------------------------------------------+
+| Admin IP, BMC IP and Infiniband IP address configuration| Servers require a manual PXE boot after the first run|
+| is automatic on the target nodes.                       | of the provision tool.                               |
++---------------------------------------------------------+------------------------------------------------------+
+| Re-provisioning of servers will be automatic.           |                                                      |
++---------------------------------------------------------+------------------------------------------------------+
+| PXE booting servers is supported via split ports on the |                                                      |
+| switch.                                                 |                                                      |
++---------------------------------------------------------+------------------------------------------------------+
+
+For more information regarding switch-based discovery, `click here `_
+
+mapping
+--------
+
+Manually collect PXE NIC information for target servers and define them to Omnia using a mapping file in the below format:
+
+**pxe_mapping_file.csv**
+::
+
+    SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_IP
+    XXXXXXXX,n1,xx:yy:zz:aa:bb:cc,10.5.0.101,10.3.0.101
+    XXXXXXXX,n2,aa:bb:cc:dd:ee:ff,10.5.0.102,10.3.0.102
+
++---------------------------------------------------------+------------------------------------------------------+
+| Pros                                                    | Cons                                                 |
++=========================================================+======================================================+
+| Easily customizable if the user maintains a list of     | The user needs to be aware of the MAC/IP mapping     |
+| MAC addresses.                                          | required in the network.                             |
++---------------------------------------------------------+------------------------------------------------------+
+|                                                         | Servers require a manual PXE boot if iDRAC IPs are   |
+|                                                         | not configured.                                      |
++---------------------------------------------------------+------------------------------------------------------+
+
+For more information regarding mapping files, `click here `_
+
+bmc
+----
+
+Omnia can also discover nodes via their iDRAC using IPMI.
+
++---------------------------------------------------------+------------------------------------------------------+
+| Pros                                                    | Cons                                                 |
++=========================================================+======================================================+
+| Discovery and provisioning of servers is automatic.     | For iDRACs that are not DHCP enabled (i.e., Static), |
+|                                                         | users need to enable IPMI manually.                  |
++---------------------------------------------------------+------------------------------------------------------+
+| Admin, BMC and Infiniband IP address configuration is   | Servers require a manual PXE boot after the first run|
+| automatic on the OIM.                                   | of the provision tool.                               |
++---------------------------------------------------------+------------------------------------------------------+
+| LOM architecture is supported                           |                                                      |
+| (including cloud enclosures: C6420, C6520, C6620).      |                                                      |
++---------------------------------------------------------+------------------------------------------------------+
+
+For more information regarding BMC, `click here `_
+
+
+
diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/mappingfile.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/mappingfile.rst
similarity index 80%
rename from docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/mappingfile.rst
rename to docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/mappingfile.rst
index a88c4607f..ac702912d 100644
--- a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/mappingfile.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/mappingfile.rst
@@ -1,6 +1,6 @@
 mapping
 --------------
-Manually collect PXE NIC information for target servers and define them to Omnia (using the ``pxe_mapping_file`` variable in ``input/provision_config.yml```) using a mapping file using the below format:
+Manually collect PXE NIC information for target servers and define them to Omnia (using the ``pxe_mapping_file`` variable in ``input/provision_config.yml``) using a mapping file in the below format:
 
 **pxe_mapping_file.csv**
 
@@ -21,8 +21,8 @@ Manually collect PXE NIC information for target servers and define them to Omnia
 
 * Target servers should be configured to boot in PXE mode with the appropriate NIC as the first boot device.
 * To assign IPs on the BMC network while discovering servers using a mapping file, target servers should be in DHCP mode or switch details should be provided.
 
-.. caution:: If incorrect details are provided in the mapping file and the same is passed on to the Omnia DB (this takes place when ``discovery.yml`` or ``discovery_provision.yml`` is run), delete the nodes with incorrect information using the `linked script. <../../deletenode.html#delete-provisioned-node>`_ After deletion, provide correct details in the mapping file and re-run ``discovery_provision.yml`` or ``discovery/discovery.yml``. If the ``bmc_ip`` alone is incorrect, manually PXE boot the target server to update the database.
+.. 
caution:: If incorrect details are provided in the mapping file and the same is passed on to the Omnia DB (this takes place when ``discovery.yml`` or ``discovery_provision.yml`` is run), delete the nodes with incorrect information using the `linked script <../../../Maintenance/deletenode.html>`_. After deletion, provide correct details in the mapping file and re-run ``discovery_provision.yml`` or ``discovery/discovery.yml``. If the ``bmc_ip`` alone is incorrect, manually PXE boot the target server to update the database.
 
-To continue to the next steps:
+Next step:
 
 * `Provisioning the cluster <../installprovisiontool.html>`_
\ No newline at end of file
diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/switch-based.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/switch-based.rst
similarity index 72%
rename from docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/switch-based.rst
rename to docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/switch-based.rst
index d8ba5e699..5c8ddb29c 100644
--- a/docs/source/InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/switch-based.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/DiscoveryMechanisms/switch-based.rst
@@ -30,28 +30,28 @@ switch_based
 
-* IPMI over LAN needs to be enabled for the control plane. ::
+* IPMI over LAN needs to be enabled for the OIM. ::
 
     racadm set iDRAC.IPMILan.Enable 1
     racadm get iDRAC.IPMILan
 
 * Target servers should be configured to boot in PXE mode with appropriate NIC as the first boot device.
 
-* Set the IP address of the control plane. The control plane NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid set up. In the case dedicated network topology, a single IP (admin IP) is required.
-.. image:: ../../../images/ControlPlaneNic.png
+* Set the IP address of the OIM. The OIM NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid setup. In the case of a dedicated network topology, a single IP (admin IP) is required.
+.. image:: ../../../../images/ControlPlaneNic.png
 
 .. caution::
-    * Do not use daisy chain ports or the port used to connect to the control plane in ``switch_based_details`` in ``input/provision_config.yml``. This can cause IP conflicts on servers attached to potential target ports.
+    * Do not use daisy chain ports or the port used to connect to the OIM in ``switch_based_details`` in ``input/provision_config.yml``. This can cause IP conflicts on servers attached to potential target ports.
     * Omnia does not validate SNMP switch credentials, if the provision tool is run with incorrect credentials, use the clean-up script and re-run the provision tool with the correct credentials.
-    * If you are re-provisioning your cluster (that is, re-running the ``discovery_provision.yml`` playbook) after a `clean-up <../../CleanUpScript.html>`_, ensure to use a different ``static_range`` against ``bmc_network`` in ``input/network_spec.yml`` to avoid a conflict with newly assigned servers. Alternatively, disable any OS available in the ``Boot Option Enable/Disable`` section of your BIOS settings (**BIOS Settings > Boot Settings > UEFI Boot Settings**) on all target nodes. 
+    * If you are re-provisioning your cluster (that is, re-running the ``discovery_provision.yml`` playbook) after a `clean-up <../../../Maintenance/cleanup.html>`_, ensure to use a different ``static_range`` against ``bmc_network`` in ``input/network_spec.yml`` to avoid a conflict with newly assigned servers. Alternatively, disable any OS available in the ``Boot Option Enable/Disable`` section of your BIOS settings (**BIOS Settings > Boot Settings > UEFI Boot Settings**) on all target nodes.
 
 .. note::
     * If any of the target nodes have a pre-provisioned BMC IP, ensure that these IPs are not part of the ``static_range`` specified in ``input/network_spec.yml`` under the ``bmc_network`` to avoid any bmc IP conflicts.
-    * In case of a duplicate node object, duplicate BMC nodes will be deleted automatically by the **duplicate_node_cleanup** service that runs every 30 minutes. When nodes are discovered via mapping and switch details, the nodes discovered via switch details will not be deleted. Delete the node manually `using the delete node playbook. <../../deletenode.html#delete-provisioned-node>`_
+    * In case of a duplicate node object, duplicate BMC nodes will be deleted automatically by the **duplicate_node_cleanup** service that runs every 30 minutes. When nodes are discovered via mapping and switch details, the nodes discovered via switch details will not be deleted. Delete the node manually `using the delete node playbook <../../../Maintenance/deletenode.html>`_.
 
-To clear the configuration on Omnia provisioned switches and ports, `click here <../../../Roles/Utils/portcleanup.html>`_.
+* [Optional] To clear the configuration on Omnia provisioned switches and ports, `click here <../../../../Utils/portcleanup.html>`_.
 
-To continue to the next steps:
+Next step:
 
 * `Provisioning the cluster <../installprovisiontool.html>`_
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/RHEL/Provision/ViewingDB.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/ViewingDB.rst
new file mode 100644
index 000000000..70ae488f8
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/ViewingDB.rst
@@ -0,0 +1,50 @@
+Checking node status
+----------------------
+
+Via CLI
+--------
+
+Run ``nodels all nodelist.status`` for a list of nodes and their statuses. Here's an example of this command's output: ::
+
+    omnia-node00001: installing
+    omnia-node00002: booted
+    omnia-node00003: powering-on
+    omnia-node00004: booted
+
+Possible values of node status are ``powering-off``, ``powering-on``, ``bmcready``, ``installing``, ``booting``, ``post-booting``, ``booted``, and ``failed``.
+
+.. caution:: Once xCAT is installed, restart your SSH session to the OIM to ensure that the newly set up environment variables come into effect. This will also allow the above command to work correctly. If the new environment variables still do not come into effect, enable them manually using:
+    ::
+        source /etc/profile.d/xcat.sh
+
+Via Omnia database [omniadb]
+-----------------------------
+
+1. To access the omniadb, execute: ::
+
+    psql -U postgres
+
+    \c omniadb
+
+
+2. To view the schema being used in the cluster: ``\dn``
+
+3. To view the tables in the database: ``\dt``
+
+4. 
To view the contents of the ``nodeinfo`` table: ``select * from cluster.nodeinfo;`` ::
+
+     id | service_tag |     node      |    hostname    |     admin_mac     |   admin_ip   |   bmc_ip   | status | discovery_mechanism | bmc_mode | switch_ip | switch_name | switch_port | cpu | gpu | cpu_count | gpu_count
+    ----+-------------+---------------+----------------+-------------------+--------------+------------+--------+---------------------+----------+-----------+-------------+-------------+-----+-----+-----------+------------
+      1 |             | oim           | newoim.new.dev | 00:0a:f7:dc:11:42 | 10.5.255.254 | 0.0.0.0    |        |                     |          |           |             |             |     |     |           |
+      2 | xxxxxxx     | node2         | node2.new.dev  | c4:cb:e1:b5:70:44 | 10.5.0.12    | 10.30.0.12 | booted | mapping             |          |           |             |             | amd |     |         1 |          0
+      3 | xxxxxxx     | node3         | node3.new.dev  | f4:02:70:b8:bc:2a | 10.5.0.10    | 10.30.0.10 | booted | mapping             |          |           |             |             | amd | amd |         2 |          1
+    (3 rows)
+
+
+Possible values of node status are ``powering-off``, ``powering-on``, ``bmcready``, ``installing``, ``booting``, ``post-booting``, ``booted``, ``failed``, ``ping``, ``noping``, and ``standingby``.
+
+.. note::
+    * The ``gpu_count`` in the database is only updated every time a cluster node is PXE booted.
+    * Nodes listed as "failed" can be diagnosed using the ``/var/log/xcat/xcat.log`` file on the target node. Correct any underlying issues and `re-provision the node <../../Maintenance/reprovision.html>`_.
+    * Information on debugging nodes stuck at ``powering-on``, ``bmcready``, or ``installing`` for longer than expected is available `here <../../../Troubleshooting/FAQ/Common/Provision.html>`_. Correct any underlying issue on the node and `re-provision the node <../../Maintenance/reprovision.html>`_.
+    * A blank node status indicates that no attempt to provision has taken place. Attempt a manual PXE boot on the node to initiate provisioning.
diff --git a/docs/source/OmniaInstallGuide/RHEL/Provision/index.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/index.rst
new file mode 100644
index 000000000..27cd932b0
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/index.rst
@@ -0,0 +1,19 @@
+Step 3: Discover and provision the cluster
+===========================================
+
+The ``discovery_provision.yml`` playbook achieves the following tasks:
+
+1. Installation and configuration of the provision tool.
+2. Discovery of potential cluster nodes.
+3. Provisioning of the minimal version of the RHEL/Rocky Linux OS on the discovered cluster nodes.
+
+.. caution:: If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+
+.. 
toctree::
+   :maxdepth: 2
+
+   provisionprereqs
+   DiscoveryMechanisms/index
+   provisionparams
+   installprovisiontool
+   ViewingDB
diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/installprovisiontool.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/installprovisiontool.rst
similarity index 61%
rename from docs/source/InstallationGuides/InstallingProvisionTool/installprovisiontool.rst
rename to docs/source/OmniaInstallGuide/RHEL/Provision/installprovisiontool.rst
index e72cf0e3f..6d1ac1367 100644
--- a/docs/source/InstallationGuides/InstallingProvisionTool/installprovisiontool.rst
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/installprovisiontool.rst
@@ -1,26 +1,23 @@
 Provisioning the cluster
---------------------------
+============================
 
 Edit the ``input/provision_config.yml``, ``input/provision_config_credentials.yml``, and ``input/network_spec.yml`` files to update the required variables. A list of the variables required is available by `discovery mechanism `_.
 
-    .. note:: The first PXE device on target nodes should be the designated active NIC for PXE booting.
+.. note:: The first PXE device on target nodes should be the designated active NIC for PXE booting.
 
-    .. image:: ../../images/BMC_PXE_Settings.png
+    .. image:: ../../../images/BMC_PXE_Settings.png
 
-Optional configurations managed by the provision tool
-+++++++++++++++++++++++++++++++++++++++++++++++++++++
+[Optional] Additional configurations handled by the provision tool
+-------------------------------------------------------------------------
 
 **Using multiple versions of a given OS**
 
-Omnia now supports deploying different versions of the same OS. With each run of ``discovery_provision.yml``, a new deployable OS image is created with a distinct type:
+Omnia now supports deploying different versions of the same OS. With each run of ``discovery_provision.yml``, a new deployable OS image is created with a distinct type depending on the values provided in ``input/software_config.json``. Supported RHEL/Rocky Linux OS versions are:
 
-    * Rocky Linux: 8.6, 8.7, 8.8
-    * RHEL: 8.6, 8.7, 8.8
-    * Ubuntu: 20.04, 22.04
+    * RHEL 8.6, 8.7, 8.8
+    * Rocky Linux 8.6, 8.7, 8.8
 
-depending on the values provided in ``input/software_config.json``.
-
-.. note:: While Omnia deploys the minimal version of the OS, the multiple version feature requires that the Rocky Linux full (DVD) version of the OS be provided.
+.. note:: While Omnia deploys the minimal version of the OS, the multiple version feature requires that the Rocky Linux full (DVD) version of the OS be provided.
 
 **Disk partitioning**
 
@@ -33,33 +30,35 @@ depending on the values provided in ``input/software_config.json``.
 
     - { mount_point: "swap", desired_capacity: "10240" }
 
 Running the provision tool
-++++++++++++++++++++++++++++
+-------------------------------
 
 To deploy the Omnia provision tool, ensure that ``input/provision_config.yml``, ``input/network_spec.yml``, and ``input/provision_config_credentials.yml`` are updated and then run::
 
     ansible-playbook discovery_provision.yml
 
+.. note:: If AMD ROCm and NVIDIA CUDA drivers are mentioned in ``input/software_config.json``, the AMD and NVIDIA accelerator drivers are installed on the nodes post provisioning.
 
-``discovery_provision.yml`` runs in three stages that can be called individually:
+Stages of the provision tool
+-----------------------------
 
-.. caution:: Always execute ``discovery_provision.yml`` within the ``omnia`` directory. That is, always change directories (``cd omnia``) to the path where the playbook resides before running the playbook.
+.. caution:: Always execute ``discovery_provision.yml`` within the ``omnia`` directory. That is, always change directories (using ``cd omnia``) to the path where the playbook resides before running the playbook.
 
+The provision tool, invoked by the ``discovery_provision.yml`` playbook, runs in three stages that can be called individually:
 
-**Preparing the control plane**
+**Stage 1: Preparing the OIM**
 
     * Installs required tool packages.
     * Verifies and updates firewall settings.
    * Installs xCAT.
    * Configures Omnia databases basis ``input/network_spec.yml``.
-    * Creates empty inventory files in the control plane at ``/opt/omnia/omnia_inventory/``. These inventory files will be filled with information of compute node service tag post provisioning based on type of CPUs and GPUs they have. The inventory files are:
+    * Creates empty inventory files on the OIM at ``/opt/omnia/omnia_inventory/``. These inventory files are populated with compute node service tag information post provisioning, based on the type of CPUs and GPUs the nodes have. The inventory files are:
 
        * ``compute_cpu_amd``
        * ``compute_cpu_intel``
        * ``compute_gpu_amd``
        * ``compute_gpu_nvidia``
-        * ``compute_servicetag_ip``
+        * ``compute_hostname_ip``
 
 .. note::
 
@@ -68,22 +67,22 @@ To deploy the Omnia provision tool, ensure that ``input/provision_config.yml``,
 
        * Node status must be "booted" in DB.
        * Node's service tag information is present in DB.
 
-    * Nodes are not removed from the inventory files even if they are physically disconnected. Ensure to run the `delete node playbook <../deletenode.html#delete-provisioned-node>`_ to remove the node.
+    * Nodes are not removed from the inventory files even if they are physically disconnected. Ensure to run the `delete node playbook <../../Maintenance/deletenode.html>`_ to remove the node.
    * To regenerate an inventory file, use the playbook ``omnia/utils/inventory_tagging.yml``.
 
 ::
 
-    cd prepare_cp
-    ansible-playbook prepare_cp.yml
+    cd prepare_oim
+    ansible-playbook prepare_oim.yml
 
-**Discovering the nodes**
+**Stage 2: Discovering the nodes**
 
    * Discovers all target servers.
    * PostgreSQL database is set up with all relevant cluster information such as MAC IDs, hostname, admin IP, BMC IPs etc.
-    * Configures the control plane with NTP services for cluster node synchronization.
+    * Configures the OIM with NTP services for cluster node synchronization.
 
 To call this playbook individually, run::
 
@@ -91,59 +90,55 @@ To deploy the Omnia provision tool, ensure that ``input/provision_config.yml``,
 
    cd discovery
    ansible-playbook discovery.yml
 
-**Provisioning the nodes**
+**Stage 3: Provisioning the nodes**
 
    * The intended operating system and version is provisioned on the primary disk partition on the nodes. If a BOSS Controller card is available on the target node, the operating system is provisioned on the boss controller disks.
 
-    To call this playbook individually, run::
+    To call this playbook individually, run: ::
 
        cd provision
       ansible-playbook provision.yml
 
 .. note::
-    * If you are using ``switch_based`` discovery mechanism, you do not need to run ``provision.yml`` playbook. Run ``prepare_cp.yml`` and ``discovery.yml`` and then manually boot the nodes in PXE mode.
-    * After executing ``discovery_provision.yml`` playbook, user can check the log file available at ``/var/log/omnia.log`` for more information. 
-
-----
-After successfully running ``discovery_provision.yml``, go to `Building Clusters <../BuildingClusters/index.html>`_ to setup Slurm, Kubernetes, NFS, BeeGFS and Authentication.
-----
+    * If you are using the ``switch_based`` discovery mechanism, you do not need to run the ``provision.yml`` playbook. Run ``prepare_oim.yml`` and ``discovery.yml`` and then manually boot the nodes in PXE mode.
 
-.. note::
-
-    * racadm and ipmitool are installed on all target nodes except Ubuntu 20.04.
+    * After executing the ``discovery_provision.yml`` playbook, users can check the log file available at ``/var/log/omnia.log`` for more information.
 
     * Ansible playbooks by default run concurrently on 5 nodes. To change this, update the ``forks`` value in ``ansible.cfg`` present in the respective playbook directory.
 
     * While the ``admin_nic`` on cluster nodes is configured by Omnia to be static, the public NIC IP address should be configured by user.
 
-    * If the target nodes were discovered using switch-based or mapping mechanisms, manually PXE boot the target servers after the ``discovery_provision.yml`` playbook is executed and the target node lists as **booted** `in the nodeinfo table `_.
+    * If the target nodes were discovered using switch-based or mapping mechanisms, manually PXE boot the target servers after the ``discovery_provision.yml`` playbook is executed and the target node is listed as **booted** in the `nodeinfo table `_.
 
-    * All ports required for xCAT to run will be opened (For a complete list, check out the `Security Configuration Document <../../SecurityConfigGuide/ProductSubsystemSecurity.html#firewall-settings>`_).
+    * All ports required for xCAT to run will be opened (For a complete list, check out the `Security Configuration Document <../../../SecurityConfigGuide/ProductSubsystemSecurity.html#firewall-settings>`_).
 
     * After running ``discovery_provision.yml``, the file ``input/provision_config_credentials.yml`` will be encrypted. To edit the file, use the command: ``ansible-vault edit provision_config_credentials.yml --vault-password-file .provision_credential_vault_key``
 
-    * Post execution of ``discovery_provision.yml``, IPs/hostnames cannot be re-assigned by changing the mapping file. However, the addition of new nodes is supported as explained `here <../addinganewnode.html>`_.
-
-    * Default Python is installed during provisioning on Ubuntu cluster nodes. For Ubuntu 22.04, Python 3.10 is installed. For Ubuntu 20.04, Python 3.8 is installed.
+    * Post execution of ``discovery_provision.yml``, IPs/hostnames cannot be re-assigned by changing the mapping file. However, the addition of new nodes is supported as explained `here <../../Maintenance/addnode.html>`_.
 
 .. caution::
-    * Once xCAT is installed, restart your SSH session to the control plane to ensure that the newly set up environment variables come into effect. If the new environment variables still do not come into effect, enable manually using: ::
+    * Once xCAT is installed, restart your SSH session to the OIM to ensure that the newly set up environment variables come into effect. If the new environment variables still do not come into effect, enable them manually using: ::
 
        source /etc/profile.d/xcat.sh
 
-    * To avoid breaking the passwordless SSH channel on the control plane, do not run ``ssh-keygen`` commands post execution of ``discovery_provision.yml`` to create a new key. 
    * Do not delete the following directories:
        - ``/root/xcat``
        - ``/root/xcat-dbback``
        - ``/docker-registry``
        - ``/opt/omnia``
        - ``/var/log/omnia``
+        - ``/opt/omnia17_venv/``
 
    * On subsequent runs of ``discovery_provision.yml``, if users are unable to log into the server, refresh the SSH key manually and retry. ::
 
        ssh-keygen -R
 
    * If a subsequent run of ``discovery_provision.yml`` fails, the ``input/provision_config.yml`` file will be unencrypted.
 
-To create a node inventory in ``/opt/omnia``, `click here <../PostProvisionScript.html>`_.
+**Next steps**:
+
+* After successfully running ``discovery_provision.yml``, go to `Building Clusters <../OmniaCluster/index.html>`_ to setup Kubernetes, NFS, BeeGFS, and Authentication.
+
+* To create a node inventory in ``/opt/omnia``, `click here <../ViewInventory.html>`_.
diff --git a/docs/source/OmniaInstallGuide/RHEL/Provision/provisionparams.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/provisionparams.rst
new file mode 100644
index 000000000..612ae7438
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/provisionparams.rst
@@ -0,0 +1,105 @@
+Input parameters for the provision tool
+-----------------------------------------
+
+Fill in all required parameters in ``input/provision_config.yml``, ``input/provision_config_credentials.yml``, ``input/software_config.json``, and ``input/network_spec.yml``.
+
+.. caution:: Do not remove or comment any lines in the above mentioned ``.yml`` files.
+
+.. csv-table:: provision_config.yml
+   :file: ../../../Tables/Provision_config.csv
+   :header-rows: 1
+   :keepspace:
+
+.. [1] Boolean parameters do not need to be passed with double or single quotes.
+
+.. csv-table:: provision_config_credentials.yml
+   :file: ../../../Tables/Provision_creds.csv
+   :header-rows: 1
+   :keepspace:
+
+.. note::
+
+    * The ``input/provision_config_credentials.yml`` file is encrypted on the first execution of the ``discovery_provision.yml`` or ``local_repo.yml`` playbooks.
+
+    * To view the encrypted parameters: ::
+
+        ansible-vault view provision_config_credentials.yml --vault-password-file .provision_credential_vault_key
+
+    * To edit the encrypted parameters: ::
+
+        ansible-vault edit provision_config_credentials.yml --vault-password-file .provision_credential_vault_key
+
+
+.. csv-table:: software_config.json
+   :file: ../../../Tables/software_config_rhel.csv
+   :header-rows: 1
+   :keepspace:
+
+
+.. csv-table:: network_spec.yml
+   :file: ../../../Tables/network_spec.csv
+   :header-rows: 1
+   :keepspace:
+
+.. note::
+
+    * If the ``nic_name`` is identical on both the ``admin_network`` and the ``bmc_network``, it indicates a LOM setup. Otherwise, it's a dedicated setup.
+    * BMC network details are not required when target nodes are discovered using a mapping file.
+    * If ``bmc_network`` properties are provided, target nodes will be discovered using the BMC method in addition to the methods whose details are explicitly provided in ``provision_config.yml``.
+    * The strings ``admin_network`` and ``bmc_network`` in the ``input/network_spec.yml`` file should not be edited. Also, the properties ``nic_name``, ``static_range``, and ``dynamic_range`` cannot be edited on subsequent runs of the provision tool.
+    * ``netmask_bits`` are mandatory and should be the same for both ``admin_network`` and ``bmc_network`` (that is, between 1 and 32; 1 and 32 are also acceptable values).
+
+.. caution::
+    * Do not assign the subnet 10.4.0.0/24 to any interfaces in the network as nerdctl uses it by default. 
+    * All provided network ranges and NIC IP addresses should be distinct with no overlap in the ``input/network_spec.yml``.
+    * Ensure that all the iDRACs are reachable from the OIM.
+
+A sample of the ``input/network_spec.yml`` where nodes are discovered using a mapping file is provided below: ::
+
+    ---
+    Networks:
+    - admin_network:
+        nic_name: "eno1"
+        netmask_bits: "16"
+        static_range: "10.5.0.1-10.5.0.200"
+        dynamic_range: "10.5.1.1-10.5.1.200"
+        correlation_to_admin: true
+        admin_uncorrelated_node_start_ip: "10.5.0.50"
+        network_gateway: ""
+        DNS: ""
+        MTU: "1500"
+
+    - bmc_network:
+        nic_name: ""
+        netmask_bits: ""
+        static_range: ""
+        dynamic_range: ""
+        reassignment_to_static: true
+        discover_ranges: ""
+        network_gateway: ""
+        MTU: "1500"
+
+A sample of the ``input/network_spec.yml`` where nodes are discovered using the BMC discovery mechanism is provided below: ::
+
+    ---
+    Networks:
+    - admin_network:
+        nic_name: ""
+        netmask_bits: ""
+        static_range: ""
+        dynamic_range: ""
+        correlation_to_admin: true
+        admin_uncorrelated_node_start_ip: ""
+        network_gateway: ""
+        DNS: ""
+        MTU: ""
+
+    - bmc_network:
+        nic_name: "eno1"
+        netmask_bits: "16"
+        static_range: "10.3.0.1-10.3.0.200"
+        dynamic_range: "10.3.1.1-10.3.1.200"
+        reassignment_to_static: true
+        discover_ranges: ""
+        network_gateway: ""
+        MTU: "1500"
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/RHEL/Provision/provisionprereqs.rst b/docs/source/OmniaInstallGuide/RHEL/Provision/provisionprereqs.rst
new file mode 100644
index 000000000..1af36caf5
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Provision/provisionprereqs.rst
@@ -0,0 +1,80 @@
+Before you run the provision tool
+---------------------------------
+
+* (Recommended) Run ``prereq.sh`` to get the system ready to deploy Omnia.
+
+* All target bare-metal servers (cluster nodes) should be reachable to the chosen OIM.
+
+* The UEFI boot setting should be configured in the BIOS settings before initiating PXE boot on the nodes.
+
+* Admin and BMC network switches should be configured before running the provision tool. For more information on configuring the switches, `click here <../AdvancedConfigurationsRHEL/ConfiguringSwitches/index.html>`_.
+
+* Set the IP address of the OIM. The OIM NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid setup. In the case of a dedicated network topology, a single IP (admin IP) is required.
+
+.. figure:: ../../../images/ControlPlaneNic.png
+
+    *OIM NIC IP configuration in a LOM setup*
+
+.. figure:: ../../../images/ControlPlane_DedicatedNIC.png
+
+    *OIM NIC IP configuration in a dedicated setup*
+
+
+* Set the hostname of the OIM in the ``hostname.domain_name`` format.
+
+    .. include:: ../../../Appendices/hostnamereqs.rst
+
+    For example, ``controlplane.omnia.test`` is acceptable. ::
+
+        hostnamectl set-hostname controlplane.omnia.test
+
+.. note:: The domain name specified for the OIM should be the same as the one specified under ``domain_name`` in ``input/provision_config.yml``.
+
+* To provision the bare metal servers, download one of the following ISOs to the OIM:
+
+    * `RHEL 8.x `_
+    * `Rocky Linux 8.x `_
+
+.. note:: Ensure that the ISO has downloaded completely, with no corruption. Verify the SHA checksum/download size of the ISO file before provisioning to avoid future failures. 
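+
+For example, one way to verify the integrity of a downloaded ISO is to compare its SHA256 checksum against the value published on the download page (the filename below is a placeholder; substitute the ISO you actually downloaded): ::
+
+    # Prints the SHA256 checksum of the ISO; compare it against the published value
+    sha256sum RHEL-8.8.0-x86_64-dvd.iso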
+
+Note the compatibility between cluster OS and OIM OS below:
+
+   +---------------------+--------------------+------------------+
+   |                     |                    |                  |
+   | OIM OS              | Cluster Node OS    | Compatibility    |
+   +=====================+====================+==================+
+   |                     |                    |                  |
+   | RHEL [1]_           | RHEL               | Yes              |
+   +---------------------+--------------------+------------------+
+   |                     |                    |                  |
+   | Rocky               | Rocky              | Yes              |
+   +---------------------+--------------------+------------------+
+
+.. [1] Ensure that OIMs running RHEL have an active subscription or are configured to access local repositories. The following repositories should be enabled on the OIM: **AppStream**, **BaseOS**.
+
+* Ensure that all connection names under the network manager match their corresponding device names.
+  To verify network connection names: ::
+
+    nmcli connection
+
+  To verify the device name: ::
+
+    ip link show
+
+In the event of a mismatch, edit the file ``/etc/sysconfig/network-scripts/ifcfg-`` using the vi editor for RHEL/Rocky Linux clusters.
+
+* When discovering nodes via a mapping file, all target nodes should be set up in PXE mode before running the playbook.
+
+.. note::
+
+    * After configuration and installation of the cluster, changing the OIM is not supported. If you need to change the OIM, you must redeploy the entire cluster.
+
+    * For servers with an existing OS being discovered via BMC, ensure that the first PXE device on target nodes is the designated active NIC for PXE booting.
+
+
+
+
+
+
+
+
diff --git a/docs/source/OmniaInstallGuide/RHEL/RHELSpace.rst b/docs/source/OmniaInstallGuide/RHEL/RHELSpace.rst
new file mode 100644
index 000000000..d83947771
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/RHELSpace.rst
@@ -0,0 +1,16 @@
+Space requirements for the OIM running on RHEL or Rocky Linux OS
+=====================================================================
+
+* For all available software packages that Omnia supports: 50GB
+* For the complete set of software images (in ``/`` or ``/var`` partition): 500GB
+* For nodes with limited storage space in the ``/`` or ``/var`` partition, Omnia suggests executing the ``local_repo.yml`` playbook with ``repo_config`` set to ``never`` in ``input/local_repo_config.yml``. In this scenario, all software packages are downloaded and stored in a pre-defined user registry.
+* For storing offline repositories (the file path should be specified in ``repo_store_path`` in ``input/local_repo_config.yml``): 50GB
+
+.. note:: Docker and nerdctl services operate from the ``/var/lib/`` directory. If the OIM has storage constraints, users can mount this directory to another drive of their choice that has more storage capacity. Alternatively, the user can mount any external NFS server on the OIM and use that to store the software packages.
+
+.. csv-table:: Space requirements for images and packages on OIM
+   :file: ../../Tables/RHEL_space_req.csv
+   :header-rows: 1
+   :keepspace:
+
+.. [1] Space allocated as part of OS repository (.iso). No extra space required.
diff --git a/docs/source/OmniaInstallGuide/RHEL/RHEL_prereq.rst b/docs/source/OmniaInstallGuide/RHEL/RHEL_prereq.rst
new file mode 100644
index 000000000..8217e1257
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/RHEL_prereq.rst
@@ -0,0 +1,23 @@
+Prerequisites
+=================
+
+1. Choose a server outside of your intended cluster with the mentioned `storage requirements `_ to function as your Omnia Infrastructure Manager (OIM).
+
+2. Ensure that the OIM has a full-featured RHEL operating system (OS) installed. 
For a complete list of supported OS versions, check out the `Support Matrix <../../Overview/SupportMatrix/OperatingSystems/index.html>`_.
+
+3. Enable the **AppStream** and **BaseOS** repositories via the RHEL subscription manager.
+
+4. Ensure that the OIM is internet-capable and has Git installed. If Git is not installed, use the below command to install it. ::
+
+    dnf install git -y
+
+.. note:: If the OIM server has an Infiniband NIC installed, run the below command to install the hardware drivers and Infiniband-related packages:
+    ::
+        yum groupinstall "Infiniband Support" -y
+
+5. Clone the Omnia repository from GitHub on to the OIM server using the following command: ::
+
+    git clone https://github.com/dell/omnia.git
+
+6. [Optional] `Set up a proxy server for the OIM `_.
+
diff --git a/docs/source/OmniaInstallGuide/RHEL/Setup_CP_proxy.rst b/docs/source/OmniaInstallGuide/RHEL/Setup_CP_proxy.rst
new file mode 100644
index 000000000..d0570d68b
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/Setup_CP_proxy.rst
@@ -0,0 +1,48 @@
+Configure a proxy server for the OIM
+=======================================
+
+.. note:: You can skip the proxy setup in the ``site_config.yml`` input file if you have direct internet access on the OIM.
+
+OIM proxy configuration is now available for Omnia users. This means that the OIM does not access the internet directly, but through a proxy server. To set up the OIM with a proxy server, do the following:
+
+1. Go to the ``omnia/input`` folder.
+
+2. Open the ``site_config.yml`` file and add the proxy server details to the ``proxy`` variable, as explained below:
+
++-----------------------------+-------------------------------------------------------------------------------------------------------------------------------+
+| Parameter                   | Description                                                                                                                   |
++=============================+===============================================================================================================================+
+| **http_proxy**              |                                                                                                                               |
+| (Mandatory)                 | * This variable points to the HTTP proxy server and the port associated with the proxy server.                               |
+|                             | * **Example:** ``"http://corporate-proxy:3128"``                                                                              |
++-----------------------------+-------------------------------------------------------------------------------------------------------------------------------+
+| **https_proxy**             |                                                                                                                               |
+| (Mandatory)                 | * This variable points to the HTTPS proxy server and the port associated with the proxy server.                              |
+|                             | * **Example:** ``"https://corporate-proxy:3128"``                                                                             |
++-----------------------------+-------------------------------------------------------------------------------------------------------------------------------+
+| **no_proxy**                |                                                                                                                               |
+| (Optional)                  | * This variable is configured with the OIM hostname, admin network IP or any internal cluster network.                       |
+|                             | * This value is required to exclude the internal cluster network from the proxy server.                                      |
+|                             | * **Example:** ``controlplane.omnia.test,10.5.0.1``                                                                           |
++-----------------------------+-------------------------------------------------------------------------------------------------------------------------------+
+
+   Sample input: ::
+
+       proxy:
+         - { http_proxy: "http://corporate-proxy:3128", https_proxy: "http://corporate-proxy:3128", no_proxy: "controlplane.omnia.test,10.5.0.1" }
+
+3. Configure the ``http_proxy``, ``https_proxy``, and ``no_proxy`` environment variables on the OIM server. 
+3. Configure the ``http_proxy``, ``https_proxy``, and ``no_proxy`` environment variables on the OIM server.
+
+   * Execute the following commands to temporarily update the proxy environment variables: ::
+
+        export http_proxy=http://:
+        export https_proxy=http://:
+        export no_proxy="",""
+
+   * For a persistent proxy configuration, update ``/etc/environment`` or ``/root/.bashrc`` with the proxy environment details. ::
+
+        http_proxy=http://:
+        https_proxy=http://:
+        no_proxy="",""
+
+.. caution:: You must configure the proxy environment variables on the OIM before running any Omnia playbooks.
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/RHEL/ViewInventory.rst b/docs/source/OmniaInstallGuide/RHEL/ViewInventory.rst
new file mode 100644
index 000000000..bfc21dd1d
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/ViewInventory.rst
@@ -0,0 +1,49 @@
+Step 4: View node inventory
+=================================
+
+When ``discovery_provision.yml``, ``prepare_oim.yml``, or ``utils/inventory_tagging.yml`` is run, a set of inventory files is created in ``/opt/omnia/omnia_inventory/`` based on `the Omnia database. `_ The inventories are created based on the type of CPUs and GPUs the nodes have. The inventory files are:
+
+    * ``compute_cpu_amd`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_cpu_amd]
+        node001.omnia.test
+
+    * ``compute_cpu_intel`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_cpu_intel]
+        node001.omnia.test
+
+    * ``compute_gpu_amd`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_gpu_amd]
+        node002.omnia.test
+        node003.omnia.test
+
+    * ``compute_gpu_nvidia`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_gpu_nvidia]
+        node001.omnia.test
+
+    * ``compute_hostname_ip`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_hostname_ip]
+        node001.omnia.test ansible_host=10.5.0.2
+        node002.omnia.test ansible_host=10.5.0.3
+        node003.omnia.test ansible_host=10.5.0.4
+
+.. note::
+
+    * Hostnames will only be written into the inventory files after the nodes are successfully PXE booted post provisioning.
+    * For a node's hostname to be listed in an inventory file, two conditions must be met:
+
+        * The node's status must be "booted" in the DB.
+        * The node's hostname information is present in the DB.
+    * To regenerate all the inventory files, use the playbook ``utils/inventory_tagging.yml``.
+
+
+
diff --git a/docs/source/OmniaInstallGuide/RHEL/index.rst b/docs/source/OmniaInstallGuide/RHEL/index.rst
new file mode 100644
index 000000000..9d5cfed98
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/RHEL/index.rst
@@ -0,0 +1,16 @@
+Install Omnia on Red Hat Enterprise Linux (RHEL) or Rocky Linux clusters
+==========================================================================
+
+..
toctree:: + :maxdepth: 2 + + RHEL_prereq + RHELSpace + Prereq.sh/index + CreateLocalRepo/index + Provision/index + ViewInventory + OmniaCluster/index + InstallAITools/index + AdvancedConfigurationsRHEL/index + diff --git a/docs/source/InstallationGuides/pullimagestonodes.rst b/docs/source/OmniaInstallGuide/RHEL/pullimagestonodes.rst similarity index 82% rename from docs/source/InstallationGuides/pullimagestonodes.rst rename to docs/source/OmniaInstallGuide/RHEL/pullimagestonodes.rst index 065edf9b9..7f270028a 100644 --- a/docs/source/InstallationGuides/pullimagestonodes.rst +++ b/docs/source/OmniaInstallGuide/RHEL/pullimagestonodes.rst @@ -1,9 +1,9 @@ Download custom packages/images to the cluster =============================================== -**Download packages/images to the control plane registry** +**Download packages/images to the OIM registry** -To download packages/images to the control plane registry/repository, ``local_repo.yml`` should be executed. +To download packages/images to the OIM registry/repository, ``local_repo.yml`` should be executed. Follow the steps below to download packages/images: @@ -54,7 +54,7 @@ Follow the steps below to download packages/images: ] } - 3. Execute the following command to download required images from internet to control plane: + 3. Execute the following command to download required images from internet to OIM: :: @@ -77,18 +77,18 @@ Follow the steps below to download packages/images: 10.8.0.2 10.8.0.3 - 2. Execute the following command to pull images from control plane to the desired nodes: + 2. Execute the following command to pull images from OIM to the desired nodes: :: cd utils ansible-playbook pull_images_to_nodes.yml -i imagepull_inventory.ini -.. note:: Since the nodes are behind the proxy, they don't have direct internet access. Only the control plane has direct access to the public internet. - Nodes can connect to the internet via the control plane by setting the ``http_proxy`` and ``https_proxy`` environment variables, in the following format: :: +.. note:: Since the nodes are behind the proxy, they don't have direct internet access. Only the OIM has direct access to the public internet. + Nodes can connect to the internet via the OIM by setting the ``http_proxy`` and ``https_proxy`` environment variables, in the following format: :: - export http_proxy=http://:3128 - export https_proxy=http://:3128 + export http_proxy=http://:3128 + export https_proxy=http://:3128 Example: :: diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringStorage/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringStorage/index.rst new file mode 100644 index 000000000..640225d69 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringStorage/index.rst @@ -0,0 +1,145 @@ +Configuring Storage +======================= + +Configuring PowerVault storage +-------------------------------- + +To configure PowerVault ME4 and ME5 storage arrays, follow the below steps: + +1. Fill out all required parameters in ``storage/powervault_input.yml``: + +.. caution:: Do not remove or comment any lines in the ``storage/powervault_input.yml`` file. 
+ ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Parameter | Details | ++================================+===========================================================================================================================================================================================================================================================+ +| powervault_protocol | This variable indicates the network protocol used for data connectivity . | +| ``string`` | | +| Required | **Default values**: ``sas`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_controller_mode | This variable indicates the number of controllers available on the target powervault. | +| ``string`` | | +| Required | Choices: | +| | | +| | * ``multi`` <- default | +| | * ``single`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_locale | Represents the selected language. Currently, only English is supported. | +| ``string`` | | +| Optional | **Default values**: ``English`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_system_name | The system name used to identify the PowerVault Storage device. The name should be less than 30 characters and must not contain spaces. | +| ``string`` | | +| Optional | **Default values**: ``Unintialized_Name`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_snmp_notify_level | Select the SNMP notification levels for PowerVault Storage devices. | +| ``string`` | | +| Required | **Default values**: ``none`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_pool_type | This variable indicates the kind of pool created on the target powervault. 
| +| ``string`` | | +| Required | Choices: | +| | | +| | * ``linear`` <- default | +| | * ``virtual`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_raid_levels | Enter the required RAID levels and the minimum / maximum number of disks for each RAID levels. | +| ``string`` | | +| Optional | Choices: | +| | | +| | * ``raid1`` <- default | +| | * ``raid5`` | +| | * ``raid6`` | +| | * ``raid10`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_disk_range | Enter the range of disks in the format enclosure-number.disk-range,enclosure-number.disk-range. For example, to select disks 3 to 12 in enclosure 1 and to select disks 5 to 23 in enclosure 2, you must enter 1.3-12, 2.5-23. | +| ``string`` | A RAID 10 or 50 disk group with disks in subgroups are separated by colons (with no spaces). RAID-10 example:1.1-2:1.3-4:1.7,1.10 | +| Required | Note: Ensure that the entered disk location is empty and the Usage column lists the range as AVAIL. The disk range specified must be of the same vendor and they must have the same description. | +| | | +| | **Default values**: ``0.0-1`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_disk_group_name | Specifies the disk group name | +| ``string`` | | +| Required | **Default values**: ``omnia`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_volumes | Specify the volume details for powervault and NFS Server node. Multiple volumes can be defined as comma-separated values. example: omnia_home1, omnia_home2. | +| ``string`` | | +| Required | **Default values**: ``omnia_home`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_volume_size | Enter the volume size in the format: SizeGB. | +| ``string`` | | +| Required | **Default values**: ``100GB`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_pool | Enter the pool for the volume. 
| +| ``string`` | | +| Required | Choices: | +| | | +| | * ``a`` <- default | +| | * ``A`` | +| | * ``B`` | +| | * ``b`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_disk_partition_size | Specify the disk partition size as a percentage of available disk space. | +| ``integer`` | | +| Optional | | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_server_nic | Enter the NIC of the server to which the PowerVault Storage is connected. Make sure the nfs server also has 3 nics (for internet, OS provision and powervault connection). The nic should be specified based on the provisioned OS on nfs server. | +| ``string`` | | +| Optional | | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_trap_destination | The trap destination IP address is the IP address of the SNMP Server where the trap will be sent. If this variable is left blank, SNMP will be disabled. Omnia will not validate this IP. | +| ``string`` | | +| Optional | | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_community_name | The SNMP community string used to access statistics, MAC addresses and IPs stored within a router or other device. | +| ``string`` | | +| Optional | **Default values**: ``public`` | ++--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +2. Run the playbook: :: + + cd storage + ansible-playbook powervault.yml -i inventory -e powervault_username="" -e powervault_password="" + +* The ``inventory`` refers to a list of all nodes separated by a newline. +* ``powervault_username`` and ``powervault_password`` are the credentials used to administrate the array. + + +Configuring NFS server server connected to PowerVault +------------------------------------------------------ + +1. 
To configure an NFS server, enter the following parameters in ``storage/nfs_server_input.yml`` + ++--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Parameter | Details | ++====================+=====================================================================================================================================================================================================================================================================================================================================================================================================================================================+ +| powervault_ip | Mandatory field when nfs group is defined with an IP and omnia is required to configure nfs server. IP of Powervault connected to NFS Server should be provided. In a single run of omnia, only one NFS Server is configured. To configure multiple NFS Servers, add one IP in nfs group in a single run of omnia.yml and give variable values accordingly. To configure another nfs node, update variables and run ``nfs_sas.yml`` | +| ``string`` | | +| Optional | | ++--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| powervault_volumes | Specify the volume details for powervault and NFS Server node | +| ``JSON list`` | For multiple volumes, list of json with volume details should be provided. | +| Required | | +| | * ``server_share_path``: The path at which volume is mounted on nfs node | +| | * ``server_export_options``: Default value is- rw,sync,no_root_squash (unless specified otherwise). For a list of accepted options, `click here `_ | +| | * ``client_shared_path``: The path at which volume is mounted on all nodes. This value is taken as ``server_share_path`` unless specified otherwise. | +| | * ``client_mount_options``: Default value is- nosuid,rw,sync,hard,intr (unless specified otherwise). For a list of accepted options, `click here `_ | +| | | +| | Must specify atleast 1 volume | +| | | +| | **Default values**: `` - { name: omnia_home, server_share_path: /home/omnia_home, server_export_options: }`` | ++--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +2. 
Run the playbook: :: + + cd storage + ansible-playbook nfs_sas.yml -i /root/inventory -e powervault_username="xxxxx" -e powervault_password="xxxxxx" + +* Where the ``inventory`` refers to a list of all nodes separated by a newline. +* To set up NFS client services, `click here <../../OmniaCluster/BuildingCluster/Storage/NFS.rst>`_ + + + + + + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-Z.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-Z.rst new file mode 100644 index 000000000..6013cf0dc --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-Z.rst @@ -0,0 +1,83 @@ +Configuring ethernet switches (Z series) +----------------------------------------- + +.. note:: Omnia is specifically designed to support the configuration of Ethernet switches that run on the Dell SmartFabric OS10 network operating system. + +* Edit the ``network/ethernet_zseries_input.yml`` file for all Z series PowerSwitches such as Z9332F-ON, Z9262-ON and Z9264F-ON. The default configuration is written for a Z9264F-ON switch. + +.. caution:: Do not remove or comment any lines in the ``network/ethernet_zseries_input.yml`` file. + ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Name | Details | ++============================+=====================================================================================================================================================================================+ +| os10_config | Global configurations for the switch. | +| ``string`` | | +| Required | Choices: | +| | | +| | * ``interface vlan1`` <- Default | +| | | +| | * ``exit`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| breakout_value | By default, all ports are configured in the 10g-4x breakout mode in which a QSFP28 or QSFP+ port is split into four 10G interfaces. | +| ``string`` | | +| Required | Choices: | +| | | +| | * ``10g-4x`` <- Default | +| | | +| | * ``25g-4x`` | +| | | +| | * ``40g-1x`` | +| | | +| | * ``50g-2x`` | +| | | +| | * ``100g-1x`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_trap_destination | The trap destination IP address is the IP address of the SNMP Server where the trap will be sent. Ensure that the SNMP IP is valid. | +| ``string`` | | +| Optional | | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_community_string | An SNMP community string is a means of accessing statistics stored within a router or other device. 
| +| ``string`` | | +| Optional | **Default values**: ``public`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ethernet 1/1/(1-63) config | By default: | +| ``string`` | | +| Required | * Port description is provided. | +| | * Each interface is set to "up" state. | +| | * The fanout/breakout mode for 1/1/1 to 1/1/63 is as per the value set in the breakout_value variable. | +| | * Update the individual interfaces of the Dell PowerSwitch S5232F-ON. | +| | * The interfaces are from ethernet 1/1/1 to ethernet 1/1/63. By default, the breakout mode is set for 1/1/1 to 1/1/63. | +| | * Note: The playbooks will fail if any invalid configurations are entered. | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| save_changes_to_startup | Change it to "true" only when you are certain that the updated configurations and commands are valid. | +| ``boolean`` [1]_ | | +| Required | WARNING: When set to "true", the startup configuration file is updated. If incorrect configurations or commands are entered, the Ethernet switches may not operate as expected. | +| | | +| | Choices: | +| | | +| | * ``false`` <- Default | +| | | +| | * ``true`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +.. [1] Boolean parameters do not need to be passed with double or single quotes. + +* When initializing a factory reset switch, the user needs to ensure DHCP is enabled and an IPv6 address is not assigned. + +* The 65th port on a Z series switch cannot be split. + + * Only odd ports support breakouts on Z9264F-ON. For more information, `click here `_. + +.. note:: The ``breakout_value`` of a port can only be changed after un-splitting the port. + +**Running the playbook**:: + + cd network + + ansible-playbook ethernet_switch_config.yml -i switch_inventory -e ethernet_switch_username=”” -e ethernet_switch_password=”” + +* The ``inventory`` file should be a list of switch IPs separated by newlines. Refer to the switch_inventory section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ for more information. + +* The ``ethernet_switch_username`` and ``ethernet_switch_password`` are the credentials used to authenticate and access the switch using the management port. + + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-s3_s4.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-s3_s4.rst new file mode 100644 index 000000000..94dbaa14d --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-s3_s4.rst @@ -0,0 +1,79 @@ +Configuring ethernet switches (S3 and S4 series) +------------------------------------------------ + +.. note:: Omnia is specifically designed to support the configuration of Ethernet switches that run on the Dell SmartFabric OS10 network operating system. + +* Edit the ``network/ethernet_tor_input.yml`` file for all S3* and S4* PowerSwitches such as S3048-ON, S4048T-ON, S4112F-ON, S4048-ON, S4048T-ON, S4112F-ON, S4112T-ON, and S4128F-ON. 
+ +.. caution:: Do not remove or comment any lines in the ``network/ethernet_tor_input.yml`` file. + ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Name | Details | ++============================+=====================================================================================================================================================================================+ +| os10_config | Global configurations for the switch. | +| ``string`` | | +| Required | Choices: | +| | | +| | * ``interface vlan1`` <- Default | +| | | +| | * ``exit`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| breakout_value | By default, all ports are configured in the 10g-4x breakout mode in which a QSFP28 or QSFP+ port is split into four 10G interfaces. | +| ``string`` | | +| Required | Choices: | +| | | +| | * ``10g-4x`` <- Default | +| | | +| | * ``25g-4x`` | +| | | +| | * ``40g-1x`` | +| | | +| | * ``50g-2x`` | +| | | +| | * ``100g-1x`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_trap_destination | The trap destination IP address is the IP address of the SNMP Server where the trap will be sent. Ensure that the SNMP IP is valid. | +| ``string`` | | +| Optional | | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_community_string | An SNMP community string is a means of accessing statistics stored within a router or other device. | +| ``string`` | | +| Optional | **Default values**: ``public`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ethernet 1/1/(1-52) config | By default: | +| ``string`` | | +| Required | * Port description is provided. | +| | * Each interface is set to "up" state. | +| | * The fanout/breakout mode for 1/1/1 to 1/1/52 is as per the value set in the breakout_value variable. | +| | * Update the individual interfaces of the Dell PowerSwitch S5232F-ON. | +| | * The interfaces are from ethernet 1/1/1 to ethernet 1/1/52. By default, the breakout mode is set for 1/1/1 to 1/1/52. | +| | * Note: The playbooks will fail if any invalid configurations are entered. | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| save_changes_to_startup | Change it to "true" only when you are certain that the updated configurations and commands are valid. | +| ``boolean`` [1]_ | | +| Required | WARNING: When set to "true", the startup configuration file is updated. If incorrect configurations or commands are entered, the Ethernet switches may not operate as expected. 
| +| | | +| | Choices: | +| | | +| | * ``false`` <- Default | +| | | +| | * ``true`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +.. [1] Boolean parameters do not need to be passed with double or single quotes. + +* When initializing a factory reset switch, the user needs to ensure DHCP is enabled and an IPv6 address is not assigned. + + +**Running the playbook**:: + + cd network + + ansible-playbook ethernet_switch_config.yml -i switch_inventory -e ethernet_switch_username=”” -e ethernet_switch_password=”” + +* The ``inventory`` file should be a list of switch IPs separated by newlines. Refer to the switch_inventory section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ for more information. + +* The ``ethernet_switch_username`` and ``ethernet_switch_password`` are the credentials used to authenticate and access the switch using the management port. + + + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-s5.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-s5.rst new file mode 100644 index 000000000..82292daa0 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/ethernet-s5.rst @@ -0,0 +1,80 @@ +Configuring ethernet switches (S5 series) +------------------------------------------------ + +.. note:: Omnia is specifically designed to support the configuration of Ethernet switches that run on the Dell SmartFabric OS10 network operating system. + +* Edit the ``network/ethernet_sseries_input.yml`` file for all S5* PowerSwitches such as S5232F-ON. + +.. caution:: Do not remove or comment any lines in the ``network/ethernet_sseries_input.yml`` file. + ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Name | Details | ++============================+=====================================================================================================================================================================================+ +| os10_config | Global configurations for the switch. | +| ``string`` | | +| Required | Choices: | +| | | +| | * ``interface vlan1`` <- Default | +| | | +| | * ``exit`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| breakout_value | By default, all ports are configured in the 10g-4x breakout mode in which a QSFP28 or QSFP+ port is split into four 10G interfaces. | +| ``string`` | | +| Required | Choices: | +| | | +| | * ``10g-4x`` <- Default | +| | | +| | * ``25g-4x`` | +| | | +| | * ``40g-1x`` | +| | | +| | * ``50g-2x`` | +| | | +| | * ``100g-1x`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_trap_destination | The trap destination IP address is the IP address of the SNMP Server where the trap will be sent. Ensure that the SNMP IP is valid. 
| +| ``string`` | | +| Optional | | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_community_string | An SNMP community string is a means of accessing statistics stored within a router or other device. | +| ``string`` | | +| Optional | **Default values**: ``public`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ethernet 1/1/(1-34) config | By default: | +| ``string`` | | +| Required | * Port description is provided. | +| | * Each interface is set to "up" state. | +| | * The fanout/breakout mode for 1/1/1 to 1/1/31 is as per the value set in the breakout_value variable. | +| | * Update the individual interfaces of the Dell PowerSwitch S5232F-ON. | +| | * The interfaces are from ethernet 1/1/1 to ethernet 1/1/34. By default, the breakout mode is set for 1/1/1 to 1/1/34. | +| | * Note: The playbooks will fail if any invalid configurations are entered. | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| save_changes_to_startup | Change it to "true" only when you are certain that the updated configurations and commands are valid. | +| ``boolean`` [1]_ | | +| Required | WARNING: When set to "true", the startup configuration file is updated. If incorrect configurations or commands are entered, the Ethernet switches may not operate as expected. | +| | | +| | Choices: | +| | | +| | * ``false`` <- Default | +| | | +| | * ``true`` | ++----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +* When initializing a factory reset switch, the user needs to ensure DHCP is enabled and an IPv6 address is not assigned. + +.. [1] Boolean parameters do not need to be passed with double or single quotes. + +.. note:: The ``breakout_value`` of a port can only be changed after un-splitting the port. + +**Running the playbook**:: + + cd network + + ansible-playbook ethernet_switch_config.yml -i switch_inventory -e ethernet_switch_username=”” -e ethernet_switch_password=”” + +* The ``inventory`` file should be a list of switch IPs separated by newlines. Refer to the switch_inventory section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ for more information. + +* The ``ethernet_switch_username`` and ``ethernet_switch_password`` are the credentials used to authenticate and access the switch using the management port. + + + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/index.rst new file mode 100644 index 000000000..88f353273 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/index.rst @@ -0,0 +1,18 @@ +Configuring switches +===================== + +.. note:: + + * Omnia is specifically designed to support the configuration of Infiniband switches that run on the NVIDIA MLNX-OS network operating system. 
+ * Omnia is specifically designed to support the configuration of Ethernet switches that run on the Dell SmartFabric OS10 network operating system. + * If you are using Ethernet switches that run on a free and open-source network operating system like SONiC OS, it is important to note that the configuration process will need to be done manually by users. + * Omnia supports the configuration of the BMC (out-of-band) and admin network switches for the switches mentioned in the `support matrix <../../../../Overview/SupportMatrix/Hardware/switches.html>`_. However, it is important to note that Omnia only configures the data network and does not handle the configuration of the management network. + * Omnia does not handle the configuration of the management port for switches. Instead, users are responsible for configuring the management port by providing the switch IP and necessary credentials. + +.. toctree:: + infiniband + ethernet-s3_s4 + ethernet-s5 + ethernet-Z + + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/infiniband.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/infiniband.rst new file mode 100644 index 000000000..552d318fe --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringSwitches/infiniband.rst @@ -0,0 +1,131 @@ +Configuring infiniband switches +-------------------------------- + +.. note:: Omnia is specifically designed to support the configuration of Infiniband switches that run on the NVIDIA MLNX-OS network operating system. + +Depending on the number of ports available on your Infiniband switch, they can be classified into: + + - EDR Switches (36 ports) + - HDR Switches (40 ports) + - NDR Switches (32 ports) + +Input the configuration variables into the ``network/infiniband_edr_input.yml``, ``network/infiniband_hdr_input.yml`` or ``network/infiniband_ndr_input.yml`` as appropriate: + +.. caution:: Do not remove or comment any lines in the ``network/infiniband_edr_input.yml``, ``network/infiniband_hdr_input.yml`` or ``network/infiniband_ndr_input.yml`` file. + ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Parameters | Details | ++=========================+========================================================================================================================================================================+ +| enable_split_port | Indicates whether ports are to be split. | +| ``boolean`` [1]_ | | +| Required | Choices: | +| | | +| | * ``false`` <- default | +| | * ``true`` | ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ib_split_ports | * Stores the split configuration of the ports. | +| ``string`` | * For EDR and HDR switches, the accepted formats are : comma-separated (EX: "1,2"), ranges (EX: "1-10"), comma-separated ranges (EX: "1,2,3-8,9,10-12") | +| Optional | * For NDR switches, the accepted format is: 2/1, 2/2, 3/1 | +| | .. note:: The port prefix IB1 can be ignored when setting this value. 
| ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| snmp_community_name | The “SNMP community string” is like a user ID or password that allows access to a router's or other device's statistics. | +| ``string`` | | +| Optional | **Default values**: ``public`` | ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| cache_directory | Cache location used by OpenSM | +| ``string`` | | +| Optional | | ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| log_directory | The directory where temporary files of opensm are stored. Can be set to the default directory or enter a directory path to store temporary files. | +| ``string`` | | +| Optional | | ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| mellanox_switch_config | List of configuration lines to apply to the switch. | +| ``string`` | # Example: | +| Optional | # mellanox_switch_config: | +| | # - Command 1 | +| | # - Command 2 | +| | By default, the list is empty. | ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| ib 1/(1-xx) config | Indicates the required state of ports 1-xx (depending on the value of 1/x). | +| ``string`` | | +| Optional | **Default values**: ``"no shutdown"`` | ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| save_changes_to_startup | Indicates whether the switch configuration is to persist across reboots. | +| ``boolean`` [1]_ | Choices: | +| Optional | | +| | * ``false`` <- default | +| | * ``true`` | ++-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +.. [1] Boolean parameters do not need to be passed with double or single quotes. + +**Before you run the playbook** + +Before running ``network/infiniband_switch_config.yml``, ensure that SSL Secure Cookies are disabled. Also, HTTP and JSON Gateway need to be enabled on your switch. This can be verified by running: :: + + show web (To check if SSL Secure Cookies is disabled and HTTP is enabled) + show json-gw (To check if JSON Gateway is enabled) + +In case any of these services are not in the state required, run: :: + + no web https ssl secure-cookie enable (To disable SSL Secure Cookies) + web http enable (To enable the HTTP gateway) + json-gw enable (To enable the JSON gateway) + + +When connecting to a new or factory reset switch, the configuration wizard requests to execute an initial configuration: + +(Recommended) If the user enters 'no', they still have to provide the admin and monitor passwords. + +If the user enters 'yes', they will also be prompted to enter the hostname for the switch, DHCP details, IPv6 details, etc. + +.. 
note:: + * Currently, Omnia only supports the splitting of switch ports. Switch ports cannot be un-split using this script. For information on manually un-splitting ports, `click here `_. + + * When initializing a factory reset switch, the user needs to ensure DHCP is enabled and an IPv6 address is not assigned. + + * All ports intended for splitting need to be connected to the network before running the playbook. + + * The ``ib_password`` remains unchanged on switches that are in split-ready mode. + +**Running the playbook** + +If ``enable_split_port`` is **true**, run:: + + cd network + ansible-playbook infiniband_switch_config.yml -i switch_inventory -e ib_username="" -e ib_password="" -e ib_admin_password="" -e ib_monitor_password="" -e ib_default_password="" -e ib_switch_type="" + + +If ``enable_split_port`` is **false**, run:: + + cd network + ansible-playbook infiniband_switch_config.yml -i switch_inventory -e ib_username="" -e ib_password="" -e ib_switch_type="" + + +* Where ``ib_username`` is the username used to authenticate into the switch. + +* Where ``ib_password`` is the password used to authenticate into the switch. + +* Where ``ib_admin_password`` is the intended password to authenticate into the switch after ``infiniband_switch_config.yml`` has run. + +* Where ``ib_monitor_password`` is the mandatory password required while running the initial configuration wizard on the Infiniband switch. + +.. note:: + + * ``ib_admin_password`` and ``ib_monitor_password`` have the following constraints: + + * Passwords should contain 8-64 characters. + + * Passwords should be different from username. + + * Passwords should be different from 5 previous passwords. + + * Passwords should contain at least one of each: Lowercase, uppercase and digits. + +* The inventory file should be a list of switch IPs separated by newlines. Refer to the switch_inventory section in `Sample Files <../../../samplefiles.html#switch-inventory>`_ for more information. + +* Where ``ib_default_password`` is the password used to authenticate into factory reset/fresh-install switches. + +* Where ``ib_switch_type`` refers to the model of the switch: HDR/EDR/NDR + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/CustomLocalRepo.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/CustomLocalRepo.rst new file mode 100644 index 000000000..72d2e3890 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/CustomLocalRepo.rst @@ -0,0 +1,131 @@ +Configuring custom repositories +------------------------------- + +Use the local repository feature to create a customized set of local repositories on the OIM for the cluster nodes to access. + +1. Ensure the ``custom`` entry is included in the ``software_config.json`` file. 
:: + + { + "cluster_os_type": "ubuntu", + "cluster_os_version": "22.04", + "repo_config": "partial", + "softwares": [ + {"name": "amdgpu", "version": "6.2.2"}, + {"name": "cuda", "version": "12.3.2"}, + {"name": "bcm_roce", "version": "230.2.54.0"}, + {"name": "ofed", "version": "24.01-0.3.3.1"}, + {"name": "openldap"}, + {"name": "secure_login_node"}, + {"name": "nfs"}, + {"name": "beegfs", "version": "7.4.2"}, + {"name": "k8s", "version":"1.29.5"}, + {"name": "roce_plugin"}, + {"name": "jupyter"}, + {"name": "kubeflow"}, + {"name": "kserve"}, + {"name": "pytorch"}, + {"name": "tensorflow"}, + {"name": "vllm"}, + {"name": "telemetry"}, + {"name": "ucx", "version": "1.15.0"}, + {"name": "openmpi", "version": "4.1.6"}, + {"name": "intelgaudi", "version": "1.18.0-524"}, + {"name": "csi_driver_powerscale", "version":"v2.11.0"} + ], + "bcm_roce": [ + {"name": "bcm_roce_libraries", "version": "230.2.54.0"} + ], + "amdgpu": [ + {"name": "rocm", "version": "6.2.2" } + ], + "intelgaudi": [ + {"name": "intel"} + ], + "vllm": [ + {"name": "vllm_amd"}, + {"name": "vllm_nvidia"} + ], + "pytorch": [ + {"name": "pytorch_cpu"}, + {"name": "pytorch_amd"}, + {"name": "pytorch_nvidia"}, + {"name": "pytorch_gaudi"} + ], + "tensorflow": [ + {"name": "tensorflow_cpu"}, + {"name": "tensorflow_amd"}, + {"name": "tensorflow_nvidia"} + ] + } + +.. note:: For Rocky Linux OS, the ``cluster_os_type`` in the above sample will be ``rocky``. + +2. Create a ``custom.json`` file in the following directory: ``input/config//`` to define the repositories. For example, For a cluster running RHEL 8.8, go to ``input/config/rhel/8.8/`` and create the file there. The file is a JSON list consisting of the package name, repository type, URL (optional), and version (optional). Below is a sample version of the file: :: + + { + "custom": { + "cluster": [ + { + "package": "ansible==5.3.2", + "type": "pip_module" + }, + { + "package": "docker-ce-24.0.4", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + + { + "package": "gcc", + "type": "rpm", + "repo_name": "appstream" + }, + { + "package": "community.general", + "type": "ansible_galaxy_collection", + "version": "4.4.0" + }, + + { + "package": "perl-Switch", + "type": "rpm", + "repo_name": "codeready-builder" + }, + { + "package": "prometheus-slurm-exporter", + "type": "git", + "url": "https://github.com/vpenso/prometheus-slurm-exporter.git", + "version": "master" + }, + { + "package": "ansible.utils", + "type": "ansible_galaxy_collection", + "version": "2.5.2" + }, + { + "package": "prometheus-2.23.0.linux-amd64", + "type": "tarball", + "url": "https://github.com/prometheus/prometheus/releases/download/v2.23.0/prometheus-2.23.0.linux-amd64.tar.gz" + }, + { + "package": "metallb-native", + "type": "manifest", + "url": "https://raw.githubusercontent.com/metallb/metallb/v0.13.4/config/manifests/metallb-native.yaml" + }, + { + "package": "registry.k8s.io/pause", + "version": "3.9", + "type": "image" + } + + ] + } + } + +2. Enter the parameters required in ``input/local_repo_config.yml`` as explained `here <../CreateLocalRepo/InputParameters.html#id2>`_. + +3. 
Run the following commands: ::
+
+    cd local_repo
+    ansible-playbook local_repo.yml
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/Habana_accelerator.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/Habana_accelerator.rst
new file mode 100644
index 000000000..db7a161df
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/Habana_accelerator.rst
@@ -0,0 +1,30 @@
+Alternate method to install the Intel Gaudi Software Stack and Driver
+=======================================================================
+
+The accelerator role allows users to set up the `Intel Gaudi Software Stack and Driver `_. These tools allow users to unlock the potential of the installed Intel Gaudi accelerators.
+
+**Prerequisites**
+
+* The Intel Gaudi local repositories must be configured using the `local_repo.yml <../CreateLocalRepo/index.html>`_ script.
+* The ``input/software_config.json`` must contain a valid ``intelgaudi`` version. See `input parameters <../CreateLocalRepo/InputParameters.html>`_ for more information.
+
+.. note:: The Intel Gaudi platform is only supported on Ubuntu 22.04 clusters containing Intel Gaudi accelerators.
+
+**Playbook configurations**
+
+The following configurations take place while running the ``accelerator.yml`` playbook:
+
+    i. Servers with Intel Gaudi accelerators are identified, and the latest drivers and software stack are downloaded and installed.
+    ii. Servers with no accelerator are skipped.
+
+**Executing the playbook**
+
+To install all the latest drivers and toolkits, run: ::
+
+    cd accelerator
+    ansible-playbook accelerator.yml -i inventory
+
+.. note::
+
+    * While executing the ``accelerator.yml`` playbook for Intel Gaudi nodes, a cron job is run which brings up the Intel Gaudi scale-out network interfaces.
+    * If a node contains an Intel Gaudi GPU with internet access during provisioning, then the user needs to install the Gaudi driver using the ``accelerator.yml`` playbook.
\ No newline at end of file
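+
+After ``accelerator.yml`` completes, one way to confirm that the Gaudi driver is loaded on an accelerator node is the ``hl-smi`` utility, which ships with the Intel Gaudi software stack (a sketch; run it directly on the Gaudi node): ::
+
+    # Lists the detected Gaudi accelerators and the installed driver version
+    hl-smi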
diff --git a/docs/source/InstallationGuides/InstallingProvisionTool/IPruleassignment.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/IPruleassignment.rst
similarity index 61%
rename from docs/source/InstallationGuides/InstallingProvisionTool/IPruleassignment.rst
rename to docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/IPruleassignment.rst
index 8ec3048dc..f4faf89ef 100644
--- a/docs/source/InstallationGuides/InstallingProvisionTool/IPruleassignment.rst
+++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/IPruleassignment.rst
@@ -3,7 +3,10 @@ IP rule assignment
 
 This playbook is used for updating IP rule of the additional configured NICs.
 
-.. note:: ``ip_rule_assignment`` is only supported for clusters running on Ubuntu OS.
+.. note::
+
+    * ``ip_rule_assignment`` is only supported for clusters running on Ubuntu OS.
+    * Assigning an IP rule is not supported for VLAN NICs using the ``ip_rule_assignment.yml`` playbook. If you want to assign an IP rule to your VLAN NIC, `click here `_.
 
 **Prerequisites**
 
@@ -49,3 +52,24 @@ This playbook is used for updating IP rule of the additional configured NICs.
 For an example inventory template, go to ``omnia/examples/ip_rule_inv_template``.
 
 .. note:: To implement IP rule changes, user must reboot the nodes.
+
+Assign an IP rule to a VLAN NIC
+---------------------------------
+
+To assign an IP rule to a VLAN NIC, do the following:
+
+1. Find your VLAN NIC name by executing the following command: ::
+
+    nmcli connection show
+
+2. Use the VLAN NIC name from the above command output to configure the IP rule using the following command. Here, ``100`` is the metric value and ``192.168.1.100`` is the IP of the configured VLAN NIC. ::
+
+    nmcli connection modify ipv4.routing-rules "priority <100> from <192.168.1.100> table <100>" ipv4.route-table <100> ipv4.route-metric <100>
+
+3. Configure the gateway using the following command: ::
+
+    nmcli connection modify ipv4.gateway <192.168.1.1>
+
+4. After you are done configuring the IP rule and the gateway, use the following command to activate the VLAN NIC: ::
+
+    nmcli connection up
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/KubernetesAccess.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/KubernetesAccess.rst
new file mode 100644
index 000000000..d04d10150
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/KubernetesAccess.rst
@@ -0,0 +1,58 @@
+Granting Kubernetes access
+---------------------------
+
+Omnia grants Kubernetes node access to users defined on the ``kube_control_plane`` using the ``k8s_access.yml`` playbook.
+
+**Prerequisite**
+
+* Ensure that the Kubernetes cluster is up and running.
+
+**Input parameters**
+
+* Update the variable ``user_name`` in the ``input/k8s_access_config.yml`` file with a comma-separated list of users.
+
+  +---------------+------------------------------------------------------------------------------------------------+
+  | Parameter     | Details                                                                                        |
+  +===============+================================================================================================+
+  | **user_name** | * A comma-separated list of users to whom access must be granted.                             |
+  | ``String``    | * Every user defined here must have a home directory configured on the ``kube_control_plane``.|
+  | Required      | * **Sample values**: ``user1`` or ``user1,user2,user3``.                                      |
+  +---------------+------------------------------------------------------------------------------------------------+
+
+* Verify that all intended users have a home directory (in the format ``/home/``) set up on the ``kube_control_plane``.
+
+* The passed inventory should contain a defined ``kube_control_plane``. ::
+
+    [auth_server]
+    #node12
+
+    #AI Scheduler: Kubernetes
+    [kube_control_plane]
+    # node1
+
+    [kube_node]
+    # node2
+    # node3
+    # node4
+    # node5
+    # node6
+
+To run the playbook, use the below command: ::
+
+    cd scheduler
+    ansible-playbook k8s_access.yml -i inventory
\ No newline at end of file
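+
+Once the playbook completes, access can be verified by switching to one of the configured users on the ``kube_control_plane`` and issuing any ``kubectl`` command. A minimal sketch (``user1`` is a hypothetical entry from ``user_name``): ::
+
+    # Switch to a user that was granted access, then query the cluster
+    su - user1
+    kubectl get nodes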
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/PowerScale_CSI.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/PowerScale_CSI.rst
new file mode 100644
index 000000000..806f41763
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/PowerScale_CSI.rst
@@ -0,0 +1,293 @@
+Deploy CSI drivers for Dell PowerScale storage solutions
+===========================================================
+
+Dell PowerScale is a flexible and secure scale-out NAS (network attached storage) solution designed to simplify storage requirements for AI and HPC workloads. To enable the PowerScale storage solution on Kubernetes clusters, Omnia installs the Dell CSI PowerScale driver (version 2.11.0) on the nodes using helm charts. Once the PowerScale CSI driver is installed, the PowerScale nodes can be connected to the Kubernetes clusters for storage requirements.
+To know more about the CSI PowerScale driver, `click here `_.
+
+.. caution:: PowerScale CSI driver installation is only supported on RHEL 8.8, Rocky Linux 8.8, and Ubuntu 22.04 clusters.
+
+.. note:: Omnia doesn't configure any PowerScale device via OneFS (the operating system for PowerScale). Omnia configures the deployed Kubernetes cluster to interact with the PowerScale storage.
+
+PowerScale SmartConnect [Optional]
+-------------------------------------
+
+* To utilize the PowerScale SmartConnect hostname, it is necessary for the user to have an upstream DNS server that includes delegation mappings of hostnames to PowerScale IP addresses. During the provisioning of cluster nodes, users can specify the IP of the upstream ``DNS`` server in the ``input/network_spec.yml`` file. This ensures that the Omnia cluster recognizes and is aware of the upstream DNS server, enabling the use of the PowerScale SmartConnect hostname functionality. For example: ::
+
+    ---
+    Networks:
+      - admin_network:
+          nic_name:
+          netmask_bits: "16"
+          static_range:
+          dynamic_range:
+          correlation_to_admin: true
+          admin_uncorrelated_node_start_ip: ""
+          network_gateway: ""
+          DNS:
+          MTU: "1500"
+
+* If the user did not specify the upstream DNS server during the provisioning process and wishes to utilize PowerScale SmartConnect afterwards, then the user must first add the upstream DNS server IP to the ``DNS`` entry in ``input/network_spec.yml`` and then re-run the ``discovery-provision.yml`` playbook.
+
+Prerequisites
+--------------
+
+1. Download the ``secret.yaml`` file template from this `link `_.
+
+2. Update the following parameters in the ``secret.yaml`` file as per your cluster details and keep the rest as default values. For example:
+
+    * clusterName:
+    * username:
+    * password:
+    * endpoint:
+
+      .. note:: If the PowerScale SmartConnect hostname is configured, the user can provide the PowerScale hostname for ``endpoint``. Otherwise, the user can provide the PowerScale IP address as well.
+
+    * endpointPort:
+    * isDefault: true
+    * isiPath: "/ifs/data/csi"
+
+    *Reference values from OneFS portal:*
+
+    .. image:: ../../../images/CSI_1.png
+
+3. Download the ``values.yaml`` file template using the following command: ::
+
+    wget https://raw.githubusercontent.com/dell/helm-charts/csi-isilon-2.11.0/charts/csi-isilon/values.yaml
+
+4. Update the following parameters in the ``values.yaml`` file and keep the rest as default values. Refer to the sample values below:
+
+    * controllerCount: 1
+    * replication:
+
+        enabled: false
+
+    * snapshot:
+
+        enabled: true
+
+    * resizer:
+
+        enabled: false
+
+    * healthMonitor:
+
+        enabled: false
+
+    * endpointPort: 8080
+    * skipCertificateValidation: true
+    * isiAccessZone: System
+    * isiPath: /ifs/data/csi
+
+.. note:: In order to integrate the PowerScale solution into the deployed Kubernetes cluster, Omnia 1.7 requires the following fixed parameter values in the ``values.yaml`` file:
+
+    * controllerCount: 1
+    * Replication: false
+    * Snapshot: true
+    * skipCertificateValidation: true
+
+.. note:: Once the PowerScale CSI driver has been deployed, the parameters in ``values.yaml`` can't be changed. If the user wants to modify the ``values.yaml`` file, they must first uninstall the PowerScale CSI driver from the cluster and then re-install it with the updated parameters.
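+
+Putting the prerequisite values together, a filled-in ``secret.yaml`` might look like the following sketch. All values here are illustrative placeholders, and the downloaded template remains the authoritative reference for the full structure: ::
+
+    isilonClusters:
+        # Name used to identify this PowerScale cluster
+      - clusterName: "powerscale-cluster"
+        # OneFS API credentials
+        username: "csiuser"
+        password: "example-password"
+        # SmartConnect hostname or PowerScale IP address, and the OneFS API port
+        endpoint: "powerscale.omnia.test"
+        endpointPort: 8080
+        isDefault: true
+        isiPath: "/ifs/data/csi"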
Once ``secret.yaml`` and ``values.yaml`` are filled in with the necessary details, copy both files to any directory on the OIM. For example, ``/tmp/secret.yaml`` and ``/tmp/values.yaml``.
+
+2. Add the ``csi_driver_powerscale`` entry along with the driver version to the ``omnia/input/software_config.json`` file: ::
+
+       {"name": "csi_driver_powerscale", "version":"v2.11.0"}
+
+   .. note:: By default, the ``csi_driver_powerscale`` entry is not present in the ``input/software_config.json``.
+
+3. Execute the ``local_repo.yml`` playbook to download the required artifacts to the OIM: ::
+
+       cd local_repo
+       ansible-playbook local_repo.yml
+
+4. Add the filepaths of the ``secret.yaml`` and ``values.yaml`` files to the ``csi_powerscale_driver_secret_file_path`` and ``csi_powerscale_driver_values_file_path`` variables respectively, present in the ``omnia/input/omnia_config.yml`` file.
+
+5. Execute the ``omnia.yml`` playbook to install the PowerScale CSI driver: ::
+
+       cd omnia
+       ansible-playbook omnia.yml -i 
+
+.. note::
+    * There isn't a separate playbook to run for PowerScale CSI driver installation. Running ``omnia.yml`` with the necessary inputs installs the driver. If Kubernetes is already deployed on the cluster, users can also run the ``scheduler.yml`` playbook to install the PowerScale CSI driver.
+    * After running the ``omnia.yml`` playbook, the ``secret.yaml`` file is encrypted. Users can use the following command to decrypt and edit it, if required: ::
+
+        cd omnia
+        ansible-vault edit --vault-password-file scheduler/roles/k8s_csi_powerscale_plugin/files/.csi_powerscale_secret_vault
+
+.. caution:: Do not delete the vault key file ``.csi_powerscale_secret_vault``, otherwise users will not be able to decrypt the ``secret.yaml`` file anymore.
+
+Expected Results
+------------------
+
+* After the successful execution of the ``omnia.yml`` playbook, the PowerScale CSI driver is deployed in the ``isilon`` namespace.
+* Along with the PowerScale driver installation, a storage class named **ps01** is also created. The details of the storage class are as follows: ::
+
+    apiVersion: storage.k8s.io/v1
+    kind: StorageClass
+    metadata:
+      name: ps01
+    provisioner: csi-isilon.dellemc.com
+    reclaimPolicy: Delete
+    allowVolumeExpansion: true
+    volumeBindingMode: Immediate
+    parameters:
+      AccessZone: < access zone mentioned in values.yaml file >
+      Isipath: < isipath mentioned in values.yaml file >
+      RootClientEnabled: "true"
+      csi.storage.k8s.io/fstype: "nfs"
+
+* If there are errors during CSI driver installation, the whole ``omnia.yml`` playbook execution does not stop or fail. It pauses for 30 seconds with a CSI driver installation failure message and then proceeds with the rest of the playbook execution.
+* If the driver installation is unsuccessful, the user must first follow the manual removal steps mentioned below on the ``kube_control_plane``, and then re-run the ``omnia.yml`` playbook for CSI driver installation.
+
+Post installation
+-------------------
+
+**[Optional] Create custom storage class**
+
+If users want to create a custom storage class, they can do so by following the sample storage class `template `_.
+
+*Sample storageclass template*: ::
+
+    apiVersion: storage.k8s.io/v1
+    kind: StorageClass
+    metadata:
+      name: 
+    provisioner: csi-isilon.dellemc.com
+    reclaimPolicy: Delete
+    allowVolumeExpansion: true
+    volumeBindingMode: Immediate
+    parameters:
+      clusterName: #optional
+      AccessZone: System
+      AzServiceIP: #optional
+      Isipath: #sample: /ifs/data/csi/
+      RootClientEnabled: "true"
+      csi.storage.k8s.io/fstype: "nfs"
+
+.. note::
+
+    * If the PowerScale SmartConnect hostname is configured and the delegated host list is set up in the external DNS server, then the user can provide the PowerScale hostname for ``AzServiceIP``. Otherwise, the user can provide the PowerScale IP address.
+    * If there are any changes to the storage class parameters in a PowerScale cluster, the user must update the existing storage class or create a new one as needed.
+
+**Apply storage class**
+
+Use the following command to apply the storage class: ::
+
+    kubectl apply -f 
+
+**Create Persistent Volume Claim (PVC)**
+
+Once the storage class is created, it can be used to create a PVC.
+
+*Sample deployment with PVC*: ::
+
+    apiVersion: v1
+    kind: PersistentVolumeClaim
+    metadata:
+      name: pvc-powerscale
+    spec:
+      accessModes:
+      - ReadWriteMany
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: ps01
+    ---
+    apiVersion: apps/v1
+    kind: Deployment
+    metadata:
+      name: deploy-busybox-01
+    spec:
+      strategy:
+        type: Recreate
+      replicas: 1
+      selector:
+        matchLabels:
+          app: deploy-busybox-01
+      template:
+        metadata:
+          labels:
+            app: deploy-busybox-01
+        spec:
+          containers:
+          - name: busybox
+            image: registry.k8s.io/busybox
+            command: ["sh", "-c"]
+            args: ["while true; do touch /data/datafile; rm -f /data/datafile; done"]
+            volumeMounts:
+            - name: data
+              mountPath: /data
+            env:
+            - name: http_proxy
+              value: "http://:3128"
+            - name: https_proxy
+              value: "http://:3128"
+          volumes:
+          - name: data
+            persistentVolumeClaim:
+              claimName: pvc-powerscale
+
+**Apply the deployment manifest along with PVC**
+
+Use the following command to apply the manifest: ::
+
+    kubectl apply -f 
+
+*Expected Result*:
+
+* Once the above manifest is applied, a PVC named ``pvc-powerscale`` is created and is in ``Bound`` status. Use the ``kubectl get pvc -A`` command to bring up the PVC information. For example: ::
+
+    root@node001:/opt/omnia/csi-driver-powerscale/csi-powerscale/dell-csi-helm-installer# kubectl get pvc -A
+    NAMESPACE   NAME             STATUS   VOLUME           CAPACITY   ACCESS MODES   STORAGECLASS   VOLUMEATTRIBUTESCLASS   AGE
+    default     pvc-powerscale   Bound    k8s-b00f77b817   1Gi        RWX            ps01                                   27h
+
+* Users can also verify the same information from the OneFS portal. In the sample image below, it is mapped with the ``VOLUME`` entry from the above example: ``k8s-b00f77b817``:
+
+.. image:: ../../../images/CSI_OneFS.png
+
+Removal
+--------
+
+To remove the PowerScale driver manually, do the following:
+
+1. Log in to the ``kube_control_plane``.
+
+2. Execute the following command to switch to the ``dell-csi-helm-installer`` directory: ::
+
+    cd /opt/omnia/csi-driver-powerscale/csi-powerscale/dell-csi-helm-installer
+
+3. Once you're inside the ``dell-csi-helm-installer`` directory, use the following command to trigger the ``csi-uninstall`` script: ::
+
+    ./csi-uninstall.sh --namespace isilon
+
+4. After running the previous command, the PowerScale driver is removed. However, the secret and the created PVC are not removed. If users want to remove them, they need to do it manually from the ``isilon`` namespace.
+
+5.
If users don't want to use PowerScale anymore, they can remove the following as well:
+
+   a. Remove the PowerScale secret by executing the following commands one after the other:
+
+      i. ``kubectl delete secret isilon-creds -n isilon``
+
+      ii. ``kubectl delete secret isilon-certs-0 -n isilon``
+
+   b. Remove any custom user deployment and PVC that were using the PowerScale storage class.
+
+   c. Remove the PowerScale storage class.
+
+.. note:: If the OneFS portal credentials change, users need to perform the following steps to manually update the ``secret.yaml`` file:
+
+    1. Update the ``secret.yaml`` file with the changed credentials.
+    2. Log in and copy the ``secret.yaml`` file to the ``kube_control_plane``.
+    3. Delete the existing secret by executing the following command: ::
+
+        kubectl delete secret isilon-creds -n isilon
+
+    4. Create the new secret from the updated ``secret.yaml`` file by executing the following command: ::
+
+        kubectl create secret generic isilon-creds -n isilon --from-file=config=
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ROCm_accelerator.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ROCm_accelerator.rst new file mode 100644 index 000000000..9ae59d633 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ROCm_accelerator.rst @@ -0,0 +1,43 @@
+Alternate method to install the AMD ROCm platform
+=====================================================
+
+The accelerator role allows users to set up the `AMD ROCm `_ platform. These tools allow users to unlock the potential of the installed AMD GPUs.
+
+**Prerequisites**
+
+* The ROCm local repositories must be configured using the `local_repo.yml <../CreateLocalRepo/index.html>`_ script.
+* The ``input/software_config.json`` must contain valid ``amdgpu`` and ``rocm`` versions. See `input parameters <../CreateLocalRepo/InputParameters.html>`_ for more information.
+
+**Playbook configurations**
+
+The following configurations take place while running the ``accelerator.yml`` playbook:
+
+    i. Servers with AMD GPUs are identified and the latest GPU drivers and ROCm platforms are downloaded and installed.
+    ii. Servers with no GPU are skipped.
+
+**Executing the playbook**
+
+To install all the latest GPU drivers and toolkits, run: ::
+
+    cd accelerator
+    ansible-playbook accelerator.yml -i inventory
+
+User permissions for ROCm platforms
+------------------------------------
+
+* To add a user to the ``render`` and ``video`` groups, use the following command: ::
+
+    sudo usermod -a -G render,video <username>
+
+.. note::
+    * ``<username>`` is the system name of the end user.
+    * This command must be run with ``root`` permissions.
+    * If the root user wants to provide access to other users and their individual GPU nodes, the previous command needs to be run on each of them separately.
+
+* To enable users to use the ROCm tools, run them using their full path, as shown in the sample file below: ::
+
+    /opt/rocm/bin/<tool_name>
+
+.. image:: ../../../images/ROCm_user_permissions.png
+
+For any configuration changes, check out ROCm's official documentation `here. `_
\ No newline at end of file
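+After the playbook completes, an optional way to confirm that a GPU node picked up the driver and toolkit is to query the devices directly. This is a suggested check rather than an Omnia step, and it assumes ROCm is installed under the default ``/opt/rocm`` prefix: ::
+
+    /opt/rocm/bin/rocminfo | grep -i gfx    # lists the detected AMD GPU agents
+    /opt/rocm/bin/rocm-smi                  # reports GPU utilization, temperature, and memory
+
+If no GPU agents are listed, re-check the ``amdgpu`` and ``rocm`` entries in ``input/software_config.json`` and re-run the playbook.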
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/index.rst new file mode 100644 index 000000000..c9a15c0fd --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/index.rst @@ -0,0 +1,17 @@
+Advanced configurations for Ubuntu clusters
+==============================================
+
+.. toctree::
+    :maxdepth: 2
+
+    CustomLocalRepo
+    install_ucx_openmpi
+    k8s_plugin_roce_nic
+    KubernetesAccess
+    ConfiguringSwitches/index
+    ConfiguringStorage/index
+    IPruleassignment
+    AdditionalNIC
+    ROCm_accelerator
+    Habana_accelerator
+    PowerScale_CSI
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/install_ucx_openmpi.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/install_ucx_openmpi.rst new file mode 100644 index 000000000..b5df50fda --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/install_ucx_openmpi.rst @@ -0,0 +1,44 @@
+Configuring UCX and OpenMPI on the cluster
+============================================
+
+**Prerequisites**
+
+* Ensure that the ``ucx`` and ``openmpi`` entries are present in the ``softwares`` list in ``software_config.json``, as mentioned below: ::
+
+    "softwares": [
+        {"name": "ucx", "version": "1.15.0"},
+        {"name": "openmpi", "version": "4.1.6"}
+    ]
+
+* Ensure to run ``local_repo.yml`` with the ``ucx`` and ``openmpi`` entries present in ``software_config.json``, to download all required UCX and OpenMPI packages.
+
+* To install any benchmarking software like UCX or OpenMPI, ensure that ``k8s_share`` is set to ``true`` in `storage_config.yml <../OmniaCluster/schedulerinputparams.html#storage-config-yml>`_, for one of the entries in ``nfs_client_params``. If both ``slurm_share`` and ``k8s_share`` are set to ``true``, ``slurm_share`` takes precedence.
+
+**Inventory details**
+
+* For UCX and OpenMPI, the applicable inventory groups are ``slurm_control_node`` and ``kube_control_plane``.
+
+* The inventory file must contain exactly one ``slurm_control_node`` and/or one ``kube_control_plane``.
+
+**To install UCX and OpenMPI**
+
+* UCX will be compiled and installed on the NFS share (based on the ``client_share_path`` provided in the ``nfs_client_params`` in ``input/storage_config.yml``).
+
+* If the cluster uses Slurm and UCX, OpenMPI is compiled with UCX and Slurm support on the NFS share (based on the ``client_share_path`` provided in the ``nfs_client_params`` in ``input/storage_config.yml``).
+
+Run either of the following commands:
+
+    1. ::
+
+        ansible-playbook omnia.yml -i inventory
+
+    2. ::
+
+        ansible-playbook scheduler.yml -i inventory
+
+.. note::
+
+    * All corresponding compiled UCX and OpenMPI files will be saved to the ``/compile`` directory on the NFS share.
+    * All corresponding UCX and OpenMPI executables will be saved to the ``/benchmarks/`` directory on the NFS share.
+    * The default OpenMPI version for Omnia is 4.1.6. If you change the version in the ``software_config.json`` file, make sure to update it in the ``openmpi.json`` file in the ``input/config`` directory as well.
+    * To add new nodes to an existing cluster, click `here <../../Maintenance/addnode.html>`_.
\ No newline at end of file
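+Once the playbook finishes, the installation can be spot-checked from any node that mounts the share. A minimal sketch — ``<client_share_path>`` stands for the share path from ``nfs_client_params`` in ``input/storage_config.yml``, and the exact subdirectory names under ``/benchmarks`` are assumptions that may differ on your cluster: ::
+
+    <client_share_path>/benchmarks/openmpi/bin/mpirun --version    # prints the OpenMPI version if the build succeeded
+    <client_share_path>/benchmarks/ucx/bin/ucx_info -v             # prints the UCX version and build configuration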
diff --git a/docs/source/InstallationGuides/BuildingClusters/k8s_plugin_roce_nic.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/k8s_plugin_roce_nic.rst similarity index 93% rename from docs/source/InstallationGuides/BuildingClusters/k8s_plugin_roce_nic.rst rename to docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/k8s_plugin_roce_nic.rst index 0ef0e048e..5f04bd7a5 100644 --- a/docs/source/InstallationGuides/BuildingClusters/k8s_plugin_roce_nic.rst +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/k8s_plugin_roce_nic.rst @@ -10,21 +10,21 @@ Few important things to keep in mind before proceeding with the installation:
 3. VLAN NICs are not supported.
 4. This playbook supports the deployment of up to 8 RoCE NIC interfaces.
 5. In a scenario where there are two nodes with two separate NICs, the admin must ensure to use aliasing to make the NIC names similar before executing ``deploy_roce_plugin.yml``.
-6. Omnia does not validate any parameter entries in the ``input/roce_plugin_config.yml``. It is the user's responsibility to provide correct inputs for the required parameters. In case of any errors encountered due to incorrect entries, delete and re-install the plugin with the correct inputs. For more information, `click here <../../Troubleshooting/FAQ.html>`_.
+6. Omnia does not validate any parameter entries in the ``input/roce_plugin_config.yml``. It is the user's responsibility to provide correct inputs for the required parameters. In case of any errors encountered due to incorrect entries, delete and re-install the plugin with the correct inputs. For more information, `click here <../../../Troubleshooting/FAQ/Ubuntu/Provision.html>`_.
 
 Install the plugin
 -------------------
 
 **Prerequisites**
 
-* Ensure Kubernetes is set up on the cluster with ``flannel`` as the input for the ``k8s_cni`` parameter. For the complete list of parameters, `click here `_.
-* Ensure that the ``bcm_roce`` drivers are installed on the nodes.
-* Ensure that additional NICs have been configured using the ``server_spec_update.yml`` playbook. For more information on how to configure additional NICs, `click here <../InstallingProvisionTool/AdditionalNIC.html>`_.
+* Ensure Kubernetes is set up on the cluster with ``flannel`` as the input for the ``k8s_cni`` parameter. For the complete list of parameters, `click here <../OmniaCluster/schedulerinputparams.html#id12>`_.
+* Ensure that the Broadcom RoCE drivers are installed on the nodes.
+* Ensure that additional NICs have been configured using the ``server_spec_update.yml`` playbook. For more information on how to configure additional NICs, `click here <../../../Utils/AdditionalNIC.html>`_.
 * Ensure that the ``{"name": "roce_plugin"}`` entry is present in the ``software_config.json`` and the same config has been used while executing the ``local_repo.yml`` playbook.
 * Ensure to update the below mentioned parameters in ``input/roce_plugin_config.yml``:
 
 .. csv-table:: Parameters for RoCE NIC
-   :file: ../../Tables/roce_config.csv
+   :file: ../../../Tables/roce_config.csv
    :header-rows: 1
    :keepspace:
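+Once the parameters above are filled in, the plugin is deployed by running ``deploy_roce_plugin.yml`` against the cluster inventory. A typical invocation is sketched below — the playbook's location in the repository is not shown here, so this assumes you run it from the directory that contains ``deploy_roce_plugin.yml``: ::
+
+    ansible-playbook deploy_roce_plugin.yml -i inventory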
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Benchmarks/hpcsoftwarestack.rst b/docs/source/OmniaInstallGuide/Ubuntu/Benchmarks/hpcsoftwarestack.rst new file mode 100644 index 000000000..bab79cfd5 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/Benchmarks/hpcsoftwarestack.rst @@ -0,0 +1,51 @@
+Containerized HPC benchmark execution
+--------------------------------------
+
+Use this playbook to download Docker images and pull them onto cluster nodes using `apptainer `_.
+
+1. Ensure that the cluster has been `provisioned by the provision tool. <../../InstallationGuides/InstallingProvisionTool/index.html>`_ and the `cluster has been set up using omnia.yml. <../../InstallationGuides/BuildingClusters/index.html>`_
+
+2. Enter the following variables in ``utils/hpc_apptainer_job_execution/hpc_apptainer_job_execution_config.yml``:
+
++-------------------------+-----------------------------------------------------------------------------------------------------------+
+| Parameter               | Details                                                                                                     |
++=========================+===========================================================================================================+
+| **hpc_apptainer_image** | * Docker image details to be downloaded onto cluster nodes using apptainer to create a sif file.           |
+| ``JSON list``           |                                                                                                             |
+| Required                | * Example (for single image): ::                                                                            |
+|                         |                                                                                                             |
+|                         |                                                                                                             |
+|                         |     hpc_apptainer_image:                                                                                    |
+|                         |                                                                                                             |
+|                         |       - { image_url: "docker.io/intel/oneapi-hpckit:latest" }                                               |
+|                         |                                                                                                             |
+|                         | * Example (for multiple images): ::                                                                         |
+|                         |                                                                                                             |
+|                         |     hpc_apptainer_image:                                                                                    |
+|                         |                                                                                                             |
+|                         |       - { image_url: "docker.io/intel/oneapi-hpckit:latest" }                                               |
+|                         |                                                                                                             |
+|                         |       - { image_url: "docker.io/tensorflow/tensorflow:latest" }                                             |
+|                         |                                                                                                             |
+|                         | * If docker credentials are provided in ``omnia_config.yml``, they will be used for downloading docker     |
+|                         |   images.                                                                                                   |
++-------------------------+-----------------------------------------------------------------------------------------------------------+
+| **hpc_apptainer_path**  | * Directory path for storing apptainer sif files on cluster nodes.                                          |
+|                         |                                                                                                             |
+| ``string``              | * It is recommended to use a directory inside a shared path that is accessible to all cluster nodes.        |
+|                         |                                                                                                             |
+| Required                | * **Default value:** ``"/home/omnia-share/softwares/apptainer"``                                            |
++-------------------------+-----------------------------------------------------------------------------------------------------------+
+
+To run the playbook: ::
+
+    cd utils/hpc_apptainer_job_execution
+
+    ansible-playbook hpc_apptainer_job_execution.yml -i inventory
+
+.. note:: Use the inventory file format specified under `Sample Files. <../../samplefiles.html>`_
+
+HPC apptainer jobs can be initiated on a Slurm cluster using the following sample command: ::
+
+    srun -N 3 --mpi=pmi2 --ntasks=4 apptainer run /home/omnia-share/softwares/apptainer/oneapi-hpckit_latest.sif hostname
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Benchmarks/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/Benchmarks/index.rst new file mode 100644 index 000000000..2cf751a84 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/Benchmarks/index.rst @@ -0,0 +1,6 @@
+Running HPC benchmarks on Omnia clusters
+=========================================
+
+..
toctree::
+    AutomatingOneAPI
+    AutomatingOpenMPI
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/InputParameters.rst b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/InputParameters.rst new file mode 100644 index 000000000..ff6e0845b --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/InputParameters.rst @@ -0,0 +1,118 @@
+Input parameters for Local Repositories
+----------------------------------------
+
+* Input all required values in ``input/software_config.json``.
+
+.. csv-table:: Parameters for Software Configuration
+   :file: ../../../Tables/software_config_ubuntu.csv
+   :header-rows: 1
+   :keepspace:
+   :class: longtable
+
+* Sample ``software_config.json`` file for Ubuntu:
+
+::
+
+    {
+        "cluster_os_type": "ubuntu",
+        "cluster_os_version": "22.04",
+        "repo_config": "partial",
+        "softwares": [
+            {"name": "amdgpu", "version": "6.2.2"},
+            {"name": "cuda", "version": "12.3.2"},
+            {"name": "bcm_roce", "version": "230.2.54.0"},
+            {"name": "ofed", "version": "24.01-0.3.3.1"},
+            {"name": "openldap"},
+            {"name": "secure_login_node"},
+            {"name": "nfs"},
+            {"name": "beegfs", "version": "7.4.2"},
+            {"name": "k8s", "version":"1.29.5"},
+            {"name": "roce_plugin"},
+            {"name": "jupyter"},
+            {"name": "kubeflow"},
+            {"name": "kserve"},
+            {"name": "pytorch"},
+            {"name": "tensorflow"},
+            {"name": "vllm"},
+            {"name": "telemetry"},
+            {"name": "ucx", "version": "1.15.0"},
+            {"name": "openmpi", "version": "4.1.6"},
+            {"name": "intelgaudi", "version": "1.18.0-524"},
+            {"name": "csi_driver_powerscale", "version":"v2.11.0"}
+        ],
+
+        "bcm_roce": [
+            {"name": "bcm_roce_libraries", "version": "230.2.54.0"}
+        ],
+        "amdgpu": [
+            {"name": "rocm", "version": "6.2.2" }
+        ],
+        "intelgaudi": [
+            {"name": "intel"}
+        ],
+        "vllm": [
+            {"name": "vllm_amd"},
+            {"name": "vllm_nvidia"}
+        ],
+        "pytorch": [
+            {"name": "pytorch_cpu"},
+            {"name": "pytorch_amd"},
+            {"name": "pytorch_nvidia"},
+            {"name": "pytorch_gaudi"}
+        ],
+        "tensorflow": [
+            {"name": "tensorflow_cpu"},
+            {"name": "tensorflow_amd"},
+            {"name": "tensorflow_nvidia"}
+        ]
+    }
+
+For a list of accepted values in ``softwares``, go to ``input/config//`` and view the list of JSON files available. The filenames present in this location (without the ``.json`` extension) are a list of accepted software names. The repositories to be downloaded for each software are listed in the corresponding JSON file. For example, for a cluster running Ubuntu 22.04, go to ``input/config/ubuntu/22.04/`` and view the file list:
+
+::
+
+    amdgpu.json
+    bcm_roce.json
+    beegfs.json
+    cuda.json
+    jupyter.json
+    k8s.json
+    kserve.json
+    kubeflow.json
+    roce_plugin.json
+    nfs.json
+    ofed.json
+    openldap.json
+    pytorch.json
+    tensorflow.json
+    vllm.json
+    intelgaudi.json
+
+For a list of repositories (and their types) configured for AMD GPUs, view the ``amdgpu.json`` file: ::
+
+    {
+        "amdgpu": {
+            "cluster": [
+                {"package": "linux-headers-$(uname -r)", "type": "deb", "repo_name": "jammy"},
+                {"package": "linux-modules-extra-$(uname -r)", "type": "deb", "repo_name": "jammy"},
+                {"package": "amdgpu-dkms", "type": "deb", "repo_name": "amdgpu"}
+            ]
+        },
+        "rocm": {
+            "cluster": [
+                {"package": "rocm-hip-sdk{{ rocm_version }}*", "type": "deb", "repo_name": "rocm"}
+            ]
+        }
+    }
+
+.. note:: To configure a locally available repository that does not have a pre-defined json file, `click here <../AdvancedConfigurationsUbuntu/CustomLocalRepo.html>`_.
+
+* Input the required values in ``input/local_repo_config.yml``.
+
+.. csv-table:: Parameters for Local Repository Configuration
+   :file: ../../../Tables/local_repo_config_ubuntu.csv
+   :header-rows: 1
+   :keepspace:
+   :class: longtable
+
+* Input ``docker_username`` and ``docker_password`` in ``input/provision_config_credentials.yml`` to avoid image pullback errors.
\ No newline at end of file
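+Since ``local_repo.yml`` parses both input files at the start of its run, a malformed edit to ``software_config.json`` is a common cause of an immediate failure. As an optional sanity check (not an Omnia step; it assumes ``jq`` is available on the OIM), validate the JSON syntax before running the playbook: ::
+
+    jq empty input/software_config.json && echo "software_config.json is valid JSON"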
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/Prerequisite.rst b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/Prerequisite.rst new file mode 100644 index 000000000..9e8ca46a6 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/Prerequisite.rst @@ -0,0 +1,59 @@
+Prerequisites
+===============
+
+1. Set the hostname of the OIM in the "hostname.domain name" format.
+
+.. include:: ../../../Appendices/hostnamereqs.rst
+
+For example, ``controlplane.omnia.test`` is acceptable. ::
+
+    hostnamectl set-hostname controlplane.omnia.test
+
+2. To set up persistent offline local repositories (if the ``repo_config`` parameter in ``input/software_config.json`` is set to ``always``), `click here `_.
+
+.. note:: The above link explains how to build a mirror on an Ubuntu 22.04 server. Adapt the steps and scripts as required for other versions of Ubuntu OS.
+
+3. Creating user registries
+
+.. note::
+
+    * The ``user_registry`` in ``input/local_repo_config.yml`` supports only nerdctl and docker registries.
+    * If you define the ``cert_path`` variable, ensure that it points to the absolute path of the user registry certificate present on the Omnia OIM.
+    * To avoid docker pull limits, provide docker credentials (``docker_username``, ``docker_password``) in ``input/provision_config_credentials.yml``.
+
+.. caution:: In order to download the software images from a user registry, the user needs to ensure that the ``user_registry`` address provided in ``input/local_repo_config.yml`` is accessible from the Omnia OIM. If the ``user_registry`` is not accessible from the OIM, Omnia will download all the software images listed in ``input/software_config.json`` to the Omnia-registry. Use the ``curl -k`` command to check.
+
+Images listed in ``user_registry`` in ``input/local_repo_config.yml`` are accessed from user defined registries. To ensure that the OIM can correctly access the registry, ensure that the following naming convention is used to save the image: ::
+
+    /:v
+
+Therefore, for the image of ``calico/cni`` version ``1.2`` available on ``quay.io`` that has been pulled to a local host: ``server1.omnia.test``, the accepted user registry name is: ::
+
+    server1.omnia.test:5001/calico/cni:v1.2
+
+Omnia will not be able to configure access to any registries that do not follow this naming convention. Do not include any other extraneous information in the registry name.
+
+Instructions to pull images from the user registries in the form of a digest:
+
+    * Images pulled from gcr.io do not have a ``tag``, but a ``digest value``.
+
+    *Image pulled from gcr.io* ::
+
+        {
+            "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook",
+            "digest": "7b138c73fcaaf0b9bb2d414b8a89a780f8c09371d24c6f57969be1694acf4aaa",
+            "type": "image"
+        },
+
+    * While pushing these images to ``user_registry``, the user needs to manually enter a ``tag``, as shown in the sample below and in the worked example at the end of this section. Tags make the image unique to the Omnia ``user_registry``. If not provided, the image will be accessed from the ``gcr.io`` registry, that is, from the internet.
+
+    *Add "tag" value as "omnia" in .json file while pushing the image to user_registry* ::
+
+        {
+            "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook",
+            "tag": "omnia",
+            "type": "image"
+        },
+
+    * For "kserve" and "kubeflow" images sourced from ``gcr.io``, Omnia updates the digest tag to ``omnia-kserve`` and ``omnia-kubeflow`` while pushing the images to ``user_registry``.
+
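+As an illustration of the manual tagging workflow above — using the ``server1.omnia.test:5001`` example registry and the webhook digest from the sample — the push sequence might look like the following. This is a hedged sketch with illustrative values, not a command sequence run by Omnia: ::
+
+    # pull the image by digest, retag it with the 'omnia' tag, and push it to the user registry
+    nerdctl pull gcr.io/knative-releases/knative.dev/serving/cmd/webhook@sha256:7b138c73fcaaf0b9bb2d414b8a89a780f8c09371d24c6f57969be1694acf4aaa
+    nerdctl tag gcr.io/knative-releases/knative.dev/serving/cmd/webhook@sha256:7b138c73fcaaf0b9bb2d414b8a89a780f8c09371d24c6f57969be1694acf4aaa server1.omnia.test:5001/knative-releases/knative.dev/serving/cmd/webhook:omnia
+    nerdctl push server1.omnia.test:5001/knative-releases/knative.dev/serving/cmd/webhook:omnia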
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/RunningLocalRepo.rst b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/RunningLocalRepo.rst new file mode 100644 index 000000000..02f9e55cc --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/RunningLocalRepo.rst @@ -0,0 +1,69 @@
+Execute local repo playbook
+=============================
+
+The local repository feature helps create offline repositories on the OIM, which all the cluster nodes can access.
+
+**Configurations made by the playbook**
+
+    * A registry is created on the OIM at port 5001.
+
+    * If ``repo_config`` in ``local_repo_config.yml`` is set to ``always`` or ``partial``, all images present in the ``input/config//`` folder will be downloaded to the OIM.
+
+
+        * If the image is defined using a tag, the image will be tagged using :5001/: and pushed to the Omnia local registry.
+
+        * If the image is defined using a digest, the image will be tagged using :5001/:omnia and pushed to the Omnia local registry.
+
+
+    * When ``repo_config`` in ``local_repo_config.yml`` is set to ``always``, the OIM is set as the default registry mirror.
+
+    * When ``repo_config`` in ``local_repo_config`` is set to ``partial``, the ``user_registry`` (if defined) and the OIM are set as default registry mirrors.
+
+**Create & Verify local repo**
+
+* To create local repositories, execute the following command: ::
+
+    cd local_repo
+    ansible-playbook local_repo.yml
+
+* Verify changes made by the playbook by executing ``cat /etc/containerd/certs.d/_default/hosts.toml`` on the compute nodes.
+
+.. note::
+    * View the status of packages for the current run of ``local_repo.yml`` in ``/opt/omnia/offline/download_package_status.csv``. Packages which are already a part of Focal or Jammy repositories show up as ``Skipped``.
+    * The ``local_repo.yml`` playbook execution fails if any software package download fails. Packages that fail are marked with a "Failed" status. In such a scenario, the user needs to re-run the ``local_repo.yml`` playbook. For more information, `click here <../../../Troubleshooting/FAQ/Common/LocalRepo.html>`_.
+    * If ``repo_config`` is set to ``partial``, packages which are part of the ``user_repo_url`` or images which are part of ``user_registry`` have a ``Skipped`` status in ``/opt/omnia/offline/download_package_status.csv``.
+    * If any software packages fail to download during the execution of this script, other scripts that rely on the package (that is, scripts that install the software) may fail.
+
+* To fetch images from the ``user_registry`` or the Omnia local registry, run the following commands (a worked example follows the note below):
+
+    * Images defined with versions: ``nerdctl pull /:``
+    * Images defined with digests: ``nerdctl pull /:omnia``
+
+.. note::
+
+    * After ``local_repo.yml`` has run, the value of ``repo_config`` in ``input/software_config.json`` cannot be updated without running the `oim_cleanup.yml <../../Maintenance/cleanup.html>`_ script first.
+
+    * To configure additional local repositories after running ``local_repo.yml``, update ``software_config.json`` and re-run ``local_repo.yml``.
+
+    * Images downloaded from ``gcr.io`` into the local registry are no longer accessible using digest values. These images are tagged with the 'omnia' tag. Choose one of the following methods when pushing these images to the cluster nodes:
+
+        * Append 'omnia' to the end of the image name while pushing images to the ``user_registry``. Update the image definition in ``input/config///.json`` to follow the same nomenclature.
+
+        * For "kserve" and "kubeflow" images sourced from ``gcr.io``, Omnia updates the digest tag to ``omnia-kserve`` and ``omnia-kubeflow`` while pushing the images to ``user_registry``.
+
+        * If a different tag is provided, update the digest value in ``input/config///.json`` as per the image digest in the ``user_registry``. To get the updated digest from the ``user_registry``, use the below steps:
+
+            * Check the image tag: ``curl -k https:///v2//tags/list``
+
+            * Check the digest of the tag: ``curl -H -k https:///v2//manifests/omnia``
+
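+For instance, with a hypothetical OIM reachable at ``oim.omnia.test`` and the images discussed above, the two pull forms would look like this (illustrative values only): ::
+
+    # image defined with a version tag
+    nerdctl pull oim.omnia.test:5001/calico/cni:v1.2
+
+    # image defined with a digest, retagged as 'omnia' by local_repo.yml
+    nerdctl pull oim.omnia.test:5001/knative-releases/knative.dev/serving/cmd/webhook:omnia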
+**Update local repositories**
+
+After local repositories have been configured on a cluster, this playbook is used to update every local repository on that cluster.
+
+To run the playbook: ::
+
+    cd utils
+    ansible-playbook update_user_repo.yml -i inventory
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/index.rst new file mode 100644 index 000000000..129c37a7d --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/index.rst @@ -0,0 +1,15 @@
+Step 2: Create Local repositories for the cluster
+==================================================
+
+The ``local_repo.yml`` playbook creates offline repositories on the OIM server, which all the cluster nodes will access. This playbook execution requires inputs from ``input/software_config.json`` and ``input/local_repo_config.yml``.
+
+.. caution:: If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+
+.. toctree::
+    Prerequisite
+    InputParameters
+    localrepos
+    RunningLocalRepo
+
+
+
diff --git a/docs/source/InstallationGuides/LocalRepo/localrepos.rst b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/localrepos.rst similarity index 70% rename from docs/source/InstallationGuides/LocalRepo/localrepos.rst rename to docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/localrepos.rst index d7ebcbc2d..e5cfcf346 100644 --- a/docs/source/InstallationGuides/LocalRepo/localrepos.rst +++ b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/localrepos.rst @@ -1,33 +1,49 @@
-Configuring specific local repositories
------------------------------------------
+Configure specific local repositories
+========================================
 
 **AMD GPU ROCm**
 
-    To install ROCm, do the following:
+    To install AMD ROCm, do the following:
 
     * Include the following line under ``softwares`` in ``input/software_config.json``: ::
 
-        {"name": "amdgpu", "version": "6.0"},
+        {"name": "amdgpu", "version": "6.2.2"},
 
     * Add the following line below the ``softwares`` section: ::
 
        "amdgpu": [
-        {"name": "rocm", "version": "6.0" }
+        {"name": "rocm", "version": "6.2.2"}
        ]
 
     * A sample format is available `here. `_
 
+.. note:: If the ``amdgpu`` group and ``rocm`` subgroup are provided, the AMD GPU drivers are installed during the cluster provisioning process and the AMD ROCm software stack is installed during ``omnia.yml`` playbook execution.
-
-    To install BeeGFS, include the following line under ``softwares`` in ``input/software_config.json``: ::
+**Intel Gaudi**
 
-        {"name": "beegfs", "version": "7.4.2"},
+    To install Intel Gaudi, do the following:
+
+    * Include the following line under ``softwares`` in ``input/software_config.json``:
 
-For information on deploying BeeGFS after setting up the cluster, `click here <../BuildingClusters/BeeGFS.html>`_.
+    ::
+
+        {"name": "intelgaudi", "version": "1.18.0-524"},
+
+    * Add the following line below the ``softwares`` section:
+
+    ::
+
+       "intelgaudi": [
+        {"name": "intel"}
+       ]
+
+    * A sample format is available `here. `_
+
+.. note:: If the ``intelgaudi`` group and ``intel`` subgroup are provided, the Intel Gaudi drivers are installed during the cluster provisioning process and the Intel software stack is installed during ``omnia.yml`` playbook execution.
 
 **CUDA**
 
@@ -35,7 +51,6 @@
 
         {"name": "cuda", "version": "12.3.2"},
 
-
     For a list of repositories (and their types) configured for CUDA, view the ``input/config///cuda.json`` file. To customize your CUDA installation, update the file. URLs for different versions can be found `here `_:
 
     For Ubuntu: ::
 
@@ -52,46 +67,51 @@
 
        }
       }
 
-    For RHEL or Rocky Linux: ::
+.. note:: If the package version is customized, ensure that the ``version`` value is updated in ``software_config.json``.
+
+**OFED**
+
+    To install OFED, include the following line under ``softwares`` in ``input/software_config.json``: ::
+
+        {"name": "ofed", "version": "24.01-0.3.3.1"},
+
+
+    For a list of repositories (and their types) configured for OFED, view the ``input/config///ofed.json`` file. To customize your OFED installation, update the file.
+
+    For Ubuntu: ::
 
       {
-        "cuda": {
-          "cluster": [
-            { "package": "cuda",
-              "type": "iso",
-              "url": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda-repo-rhel8-12-3-local-12.3.2_545.23.08-1.x86_64.rpm",
-              "path": ""
-            },
-            { "package": "dkms",
-              "type": "rpm",
-              "repo_name": "epel"
-            }
-          ]
-        }
+        "ofed": {
+          "cluster": [
+            { "package": "ofed",
+              "type": "iso",
+              "url": "https://content.mellanox.com/ofed/MLNX_OFED-24.01-0.3.3.1/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu22.04-x86_64.iso",
+              "path": ""
+            }
+          ]
+        }
       }
 
+.. note:: If the package version is customized, ensure that the ``version`` value is updated in ``software_config.json``.
 
-    .. note::
-        * If the package version is customized, ensure that the ``version`` value is updated in ``software_config.json```.
-        * If the target cluster runs on RHEL or Rocky Linux, ensure the "dkms" package is included in ``input/config//8.x/cuda.json`` as illustrated above.
 
-**BCM RoCE**
+**Broadcom RoCE**
 
     To install RoCE, do the following:
 
    * Include the following line under ``softwares`` in ``input/software_config.json``: ::
 
-        {"name": "bcm_roce", "version": "229.2.61.0"}
+        {"name": "bcm_roce", "version": "230.2.54.0"}
 
    * Add the following line below the ``softwares`` section: ::
 
        "bcm_roce": [
-        {"name": "bcm_roce_libraries", "version": "229.2.61.0"}
+        {"name": "bcm_roce_libraries", "version": "230.2.54.0"}
        ],
 
    * A sample format is available `here `_.
 
-    For a list of repositories (and their types) configured for RoCE, view the ``input/config/ubuntu//bcm_roce.json``. ::
+    For a list of repositories (and their types) configured for RoCE, view the ``input/config/ubuntu//bcm_roce.json``.
Provide the local paths or URL for the RoCE driver and libraries in the ``bcm_roce.json`` file. A sample format is given below: :: { "bcm_roce": { @@ -99,7 +119,7 @@ For information on deploying BeeGFS after setting up the cluster, `click here <. { "package": "bcm_roce_driver_{{ bcm_roce_version }}", "type": "tarball", - "url": "", + "url": "https://dl.dell.com/FOLDER12115883M/1/Bcom_LAN_230.2.54.0_NXE_Linux_Drivers_230.2.54.0.tar.gz", "path": "" } ] @@ -109,7 +129,7 @@ For information on deploying BeeGFS after setting up the cluster, `click here <. { "package": "bcm_roce_source_{{ bcm_roce_libraries_version }}", "type": "tarball", - "url": "", + "url": "https://dl.dell.com/FOLDER12115885M/1/Bcom_LAN_230.2.54.0_NXE_Linux_Source_230.2.54.0.tar.gz", "path": "" }, {"package": "libelf-dev", "type": "deb", "repo_name": "jammy"}, @@ -132,191 +152,94 @@ For information on deploying BeeGFS after setting up the cluster, `click here <. .. note:: + * If you have a single ``.tar.gz`` file (often called a tarball) for the Broadcom RoCE driver, you must add the same in both the ``bcm_roce`` section and the ``bcm_roce_libraries`` section of the ``bcm_roce.json`` file. * The RoCE driver is only supported on Ubuntu clusters. - * The only accepted URL for the RoCE driver is from the `Dell support `_ site. - -**Kubernetes plugin for the RoCE NIC** - - To install Kubernetes plugin for the RoCE NIC, do the following: - - * Include the following line under ``softwares`` in ``input/software_config.json``: :: - - {"name": "roce_plugin"}, - - * A sample format is available `here `_. - -.. note:: The RoCE plugin is only supported on Ubuntu clusters. - -**Custom repositories** - - Include the following line under ``softwares`` in ``input/software_config.json``: :: - - {"name": "custom"}, + * The only accepted URL for the RoCE driver is from the Dell support site. For more information on downloading drivers, `click here `_. - Create a ``custom.json`` file in the following directory: ``input/config//`` to define the repositories. For example, For a cluster running RHEL 8.8, go to ``input/config/rhel/8.8/`` and create the file there. The file is a JSON list consisting of the package name, repository type, URL (optional), and version (optional). 
Below is a sample version of the file: :: - - { - "custom": { - "cluster": [ - { - "package": "ansible==5.3.2", - "type": "pip_module" - }, - { - "package": "docker-ce-24.0.4", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - - { - "package": "gcc", - "type": "rpm", - "repo_name": "appstream" - }, - { - "package": "community.general", - "type": "ansible_galaxy_collection", - "version": "4.4.0" - }, - - { - "package": "perl-Switch", - "type": "rpm", - "repo_name": "codeready-builder" - }, - { - "package": "prometheus-slurm-exporter", - "type": "git", - "url": "https://github.com/vpenso/prometheus-slurm-exporter.git", - "version": "master" - }, - { - "package": "ansible.utils", - "type": "ansible_galaxy_collection", - "version": "2.5.2" - }, - { - "package": "prometheus-2.23.0.linux-amd64", - "type": "tarball", - "url": "https://github.com/prometheus/prometheus/releases/download/v2.23.0/prometheus-2.23.0.linux-amd64.tar.gz" - }, - { - "package": "metallb-native", - "type": "manifest", - "url": "https://raw.githubusercontent.com/metallb/metallb/v0.13.4/config/manifests/metallb-native.yaml" - }, - { - "package": "registry.k8s.io/pause", - "version": "3.9", - "type": "image" - } - - ] - } - } +**BeeGFS** -**FreeIPA** + To install BeeGFS, include the following line under ``softwares`` in ``input/software_config.json``: :: - To install FreeIPA, include the following line under ``softwares`` in ``input/software_config.json``: :: + {"name": "beegfs", "version": "7.4.2"}, - {"name": "freeipa"}, + For information on deploying BeeGFS after setting up the cluster, `click here <../OmniaCluster/BuildingCluster/Storage/BeeGFS.html>`_. -For more information on FreeIPA, `click here <../BuildingClusters/Authentication.html#configuring-freeipa-openldap-security>`_. +**NFS** -**Jupyterhub** + To install NFS, include the following line under ``softwares`` in ``input/software_config.json``: :: - To install Jupyterhub, include the following line under ``softwares`` in ``input/software_config.json``: :: + {"name": "nfs"}, - {"name": "jupyter"}, + For information on deploying NFS after setting up the cluster, `click here <../OmniaCluster/BuildingCluster/Storage/NFS.html>`_. -For information on deploying Jupyterhub after setting up the cluster, `click here <../Platform/InstallJupyterhub.html>`_. +**Kubernetes** -**Kserve** + To install Kubernetes, include the following line under ``softwares`` in ``input/software_config.json``: :: - To install Kserve, include the following line under ``softwares`` in ``input/software_config.json``: :: + {"name": "k8s", "version":"1.29.5"}, - {"name": "kserve"}, + For more information about installing Kubernetes, `click here <../OmniaCluster/BuildingCluster/install_kubernetes.html>`_. -For information on deploying Kserve after setting up the cluster, `click here <../Platform/kserve.html>`_. +.. note:: The version of the software provided above is the only version of the software Omnia supports. -**Kubeflow** - To install kubeflow, include the following line under ``softwares`` in ``input/software_config.json``: :: +**OpenLDAP** - {"name": "kubeflow"}, + To install OpenLDAP, include the following line under ``softwares`` in ``input/software_config.json``: :: -For information on deploying kubeflow after setting up the cluster, `click here <../Platform/kubeflow.html>`_. + {"name": "openldap"}, +For more information on OpenLDAP, `click here <../OmniaCluster/BuildingCluster/Authentication.html#configuring-openldap-security>`_. 
-**Kubernetes** - To install Kubernetes, include the following line under ``softwares`` in ``input/software_config.json``: :: +**Secure Login Node** - {"name": "k8s", "version":"1.26.12"}, + To secure the login node, include the following line under ``softwares`` in ``input/software_config.json``: :: -For more information about installing Kubernetes, `click here <../BuildingClusters/install_kubernetes.html>`_. + {"name": "secure_login_node"}, -.. note:: The version of the software provided above is the only version of the software Omnia supports. +For more information on configuring login node security, `click here <../OmniaCluster/BuildingCluster/Authentication.html#configuring-login-node-security>`_. -**OFED** - To install OFED, include the following line under ``softwares`` in ``input/software_config.json``: :: +**Telemetry** - {"name": "ofed", "version": "24.01-0.3.3.1"}, + To install Telemetry, include the following line under ``softwares`` in ``input/software_config.json``: :: + {"name": "telemetry"}, - For a list of repositories (and their types) configured for OFED, view the ``input/config///ofed.json`` file. To customize your OFED installation, update the file.: + For information on deploying Telemetry after setting up the cluster, `click here <../../../Telemetry/index.html>`_. - For Ubuntu: :: +**PowerScale CSI driver** - { - "ofed": { - "cluster": [ - { "package": "ofed", - "type": "iso", - "url": "https://content.mellanox.com/ofed/MLNX_OFED-24.01-0.3.3.1/MLNX_OFED_LINUX-24.01-0.3.3.1-ubuntu20.04-x86_64.iso", - "path": "" - } - ] - } - } + To install PowerScale CSI driver, include the following line under ``softwares`` in ``input/software_config.json``: :: + {"name": "csi_driver_powerscale", "version":"v2.11.0"}, - For RHEL or Rocky Linux: :: + For information on PowerScale CSI driver, `click here <../AdvancedConfigurationsUbuntu/PowerScale_CSI.html>`_. - { - "ofed": { - "cluster": [ - { "package": "ofed", - "type": "iso", - "url": "https://content.mellanox.com/ofed/MLNX_OFED-24.01-0.3.3.1/MLNX_OFED_LINUX-24.01-0.3.3.1-rhel8.7-x86_64.iso", - "path": "" - } - ] - } - } +**Jupyterhub** -.. note:: If the package version is customized, ensure that the ``version`` value is updated in ``software_config.json``. + To install Jupyterhub, include the following line under ``softwares`` in ``input/software_config.json``: :: -**OpenLDAP** + {"name": "jupyter"}, - To install OpenLDAP, include the following line under ``softwares`` in ``input/software_config.json``: :: +For information on deploying Jupyterhub after setting up the cluster, `click here <../InstallAITools/InstallJupyterhub.html>`_. - {"name": "openldap"}, +**Kserve** -For more information on OpenLDAP, `click here <../BuildingClusters/Authentication.html#configuring-freeipa-openldap-security>`_. + To install Kserve, include the following line under ``softwares`` in ``input/software_config.json``: :: -**OpenMPI** + {"name": "kserve"}, - To install OpenMPI, include the following line under ``softwares`` in ``input/software_config.json``: :: +For information on deploying Kserve after setting up the cluster, `click here <../InstallAITools/kserve.html>`_. - {"name": "openmpi", "version":"4.1.6"}, +**Kubeflow** -OpenMPI is deployed on the cluster when the above configurations are complete and `omnia.yml <../BuildingClusters/installscheduler.html>`_ playbook is executed. 
+ To install kubeflow, include the following line under ``softwares`` in ``input/software_config.json``: :: -For more information on OpenMPI configurations, `click here <../BuildingClusters/install_ucx_openmpi.html>`_. + {"name": "kubeflow"}, -.. note:: The default OpenMPI version for Omnia is 4.1.6. If you change the version in the ``software.json`` file, make sure to update it in the ``openmpi.json`` file in the ``input/config`` directory as well. +For information on deploying kubeflow after setting up the cluster, `click here <../InstallAITools/kubeflow.html>`_. **Pytorch** @@ -335,20 +258,13 @@ For more information on OpenMPI configurations, `click here <../BuildingClusters "pytorch": [ {"name": "pytorch_cpu"}, {"name": "pytorch_amd"}, - {"name": "pytorch_nvidia"} + {"name": "pytorch_nvidia"}, + {"name": "pytorch_gaudi"} ], * A sample format is available `here. `_ -For information on deploying Pytorch after setting up the cluster, `click here. <../Platform/Pytorch.html>`_ - -**Secure Login Node** - - To secure the login node, include the following line under ``softwares`` in ``input/software_config.json``: :: - - {"name": "secure_login_node"}, - -For more information on configuring login node security, `click here <../BuildingClusters/Authentication.html#configuring-login-node-security>`_. +For information on deploying Pytorch after setting up the cluster, `click here. <../InstallAITools/Pytorch.html>`_ **TensorFlow** @@ -372,17 +288,7 @@ For more information on configuring login node security, `click here <../Buildin * A sample format is available `here. `_ -For information on deploying TensorFlow after setting up the cluster, `click here <../Platform/TensorFlow.html>`_. - -**Unified Communication X** - - To install UCX, include the following line under ``softwares`` in ``input/software_config.json``: :: - - {"name": "ucx", "version":"1.15.0"}, - -UCX is deployed on the cluster when ``local_repo.yml`` playbook is executed, followed by the execution of `omnia.yml <../BuildingClusters/installscheduler.html>`_. - -For more information on UCX configurations, `click here <../BuildingClusters/install_ucx_openmpi.html>`_. +For information on deploying TensorFlow after setting up the cluster, `click here <../InstallAITools/TensorFlow.html>`_. **vLLM** @@ -405,21 +311,96 @@ For more information on UCX configurations, `click here <../BuildingClusters/ins * A sample format is available `here. `_ -For information on deploying vLLM after setting up the cluster, `click here <../Platform/vLLM/index.html>`_. +For information on deploying vLLM after setting up the cluster, `click here <../InstallAITools/vLLM/index.html>`_. -**Intel benchmarks** - To install Intel benchmarks, include the following line under ``softwares`` in ``input/software_config.json``: :: +**OpenMPI** - {"name": "intel_benchmarks", "version": "2024.1.0"}, + To install OpenMPI, include the following line under ``softwares`` in ``input/software_config.json``: :: -For more information on Intel benchmarks, `click here <../Benchmarks/AutomatingOneAPI.html>`_. + {"name": "openmpi", "version":"4.1.6"}, -**AMD benchmarks** +OpenMPI is deployed on the cluster when the above configurations are complete and `omnia.yml <../OmniaCluster/BuildingCluster/installscheduler.html>`_ playbook is executed. - To install AMD benchmarks, include the following line under ``softwares`` in ``input/software_config.json``: :: +For more information on OpenMPI configurations, `click here <../AdvancedConfigurationsUbuntu/install_ucx_openmpi.html>`_. + +.. 
note:: The default OpenMPI version for Omnia is 4.1.6. If you change the version in the ``software.json`` file, make sure to update it in the ``openmpi.json`` file in the ``input/config`` directory as well. + + +**Unified Communication X (UCX)** + + To install UCX, include the following line under ``softwares`` in ``input/software_config.json``: :: + + {"name": "ucx", "version":"1.15.0"}, - {"name": "amd_benchmarks"}, +UCX is deployed on the cluster when ``local_repo.yml`` playbook is executed, followed by the execution of `omnia.yml <../OmniaCluster/BuildingCluster/installscheduler.html>`_. -For more information on AMD benchmarks, `click here <../Benchmarks/AutomatingOpenMPI.html>`_. +For more information on UCX configurations, `click here <../AdvancedConfigurationsUbuntu/install_ucx_openmpi.html>`_. +**Custom repositories** + + Include the following line under ``softwares`` in ``input/software_config.json``: :: + + {"name": "custom"}, + + Create a ``custom.json`` file in the following directory: ``input/config//`` to define the repositories. For example, For a cluster running RHEL 8.8, go to ``input/config/rhel/8.8/`` and create the file there. The file is a JSON list consisting of the package name, repository type, URL (optional), and version (optional). Below is a sample version of the file: :: + + { + "custom": { + "cluster": [ + { + "package": "ansible==5.3.2", + "type": "pip_module" + }, + { + "package": "docker-ce-24.0.4", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + + { + "package": "gcc", + "type": "rpm", + "repo_name": "appstream" + }, + { + "package": "community.general", + "type": "ansible_galaxy_collection", + "version": "4.4.0" + }, + + { + "package": "perl-Switch", + "type": "rpm", + "repo_name": "codeready-builder" + }, + { + "package": "prometheus-slurm-exporter", + "type": "git", + "url": "https://github.com/vpenso/prometheus-slurm-exporter.git", + "version": "master" + }, + { + "package": "ansible.utils", + "type": "ansible_galaxy_collection", + "version": "2.5.2" + }, + { + "package": "prometheus-2.23.0.linux-amd64", + "type": "tarball", + "url": "https://github.com/prometheus/prometheus/releases/download/v2.23.0/prometheus-2.23.0.linux-amd64.tar.gz" + }, + { + "package": "metallb-native", + "type": "manifest", + "url": "https://raw.githubusercontent.com/metallb/metallb/v0.13.4/config/manifests/metallb-native.yaml" + }, + { + "package": "registry.k8s.io/pause", + "version": "3.9", + "type": "image" + } + + ] + } + } diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/DeepSpeed.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/DeepSpeed.rst new file mode 100644 index 000000000..26ad075eb --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/DeepSpeed.rst @@ -0,0 +1,110 @@ +Setup DeepSpeed for clusters containing Intel Gaudi accelerators +================================================================== + +DeepSpeed is a deep learning optimization library developed by Microsoft, designed to make training large-scale machine learning models more efficient and scalable. It provides several key features that help accelerate training and reduce the resource requirements for training state-of-the-art models. + +Prerequisites +-------------- + +Before deploying a DeepSpeed MPIJob, the following prerequisites must be fulfilled: + +1. Kubeflow must be deployed on all the cluster nodes. `Click here `_ to know more about deploying Kubeflow. + +2. Configure the *mpi-operator* package to execute the v2beta1 API. 
`Click here `_ to know more about this configuration. + +3. Verify that the cluster nodes have sufficient allocatable resources for the ``hugepages-2Mi`` and ``Intel Gaudi accelerator``. To check the allocatable resources on all nodes, run: :: + + kubectl describe node | grep -A 10 "Allocatable" + +4. [Optional] If required, you can adjust the resource parameters in the ``ds_configuration.yml`` file based on the availability of resources on the nodes. + + +Deploy DeepSpeed +----------------- + +After you have completed all the prerequisites, do the following to deploy a DeepSpeed MPIJob: + +1. Create a namespace to manage all your DeepSpeed workloads. Execute the following command: :: + + kubectl create ns workloads + +2. Verify that the namespace has been created by executing the following command: :: + + kubectl get namespace workloads + + *Expected output*: :: + + NAME STATUS AGE + workloads Active 14s + +3. To create and apply the DeepSpeed configuration file, follow these steps: + + a. Locate the ``ds_configuration.yml`` file in the ``examples/ai_examples/intel/deepSpeed/`` folder. + b. Open the ``ds_configuration.yml`` file. + c. Add the necessary details such as proxy settings, Hugging Face token, and allocated resources for the DeepSpeed MPIJob. + d. After modifying the file, you have two choices: + + - Directly copy the modified file to your ``kube_control_plane``. + - Create a new blank ``.yml`` file, paste the modified contents into it, and save it on your ``kube_control_plane``. + + e. Finally, apply the file using the following command: :: + + kubectl apply -f .yml + + *Expected output*: :: + + mpijob.kubeflow.org/gaudi-llm-ds-ft created + +4. To create and apply the Persistent Volume Claim (PVC) configuration file, required to access shared storage, follow these steps: + + a. Create a new blank ``.yml`` file, + b. Paste the following content into it, and save it on your ``kube_control_plane``. :: + + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: shared-model + namespace: workloads + spec: + storageClassName: nfs-client + accessModes: + - ReadWriteOnce + resources: + requests: + storage: + + c. Add the necessary details such as name, namespace, and storage size for the DeepSpeed MPIJobs. Use the same configurations as provided in the ``.yml`` file. + d. Finally, apply the file using the following command: :: + + kubectl apply -f .yml + + *Expected output*: :: + + persistentvolumeclaim/shared-model created + +5. After some time, check the status of the pods again to verify if they are up and running. Execute the following command to get the pod status: :: + + kubectl get pod -n workloads + + *Expected output (when pods are running)*: :: + + NAME READY STATUS RESTARTS AGE + gaudi-llm-ds-ft-launcher-zfnls 1/1 Running 0 33s + gaudi-llm-ds-ft-worker-0 1/1 Running 0 33s + +6. [Optional] To better understand the MPIJob resource, you can use the following command: :: + + kubectl explain mpijob --api-version=kubeflow.org/v2beta1 + + *Expected output*: :: + + GROUP: kubeflow.org + KIND: MPIJob + VERSION: v2beta1 + +*Final output*: + +Once DeepSpeed deployment is complete (~ after approx 30 minutes), the following output is displayed while checking the status of the pods using the ``kubectl get pod -n workloads`` command. 
Here you can see that the launcher pod (``gaudi-llm-ds-ft-launcher-zfnls``) goes to a **Completed** status, and the worker pod (``gaudi-llm-ds-ft-worker-0``) is not present as it has exited successfully, signifying a successful deployment: ::
+
+    NAME                             READY   STATUS      RESTARTS   AGE
+    gaudi-llm-ds-ft-launcher-zfnls   0/1     Completed   0          10h
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/InstallJupyterhub.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/InstallJupyterhub.rst new file mode 100644 index 000000000..050b958c5 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/InstallJupyterhub.rst @@ -0,0 +1,92 @@
+Setup Jupyterhub
+-----------------
+
+Omnia installs Jupyterhub (version 3.2.0) on Kubernetes clusters. Once Jupyterhub is deployed, log into the GUI to create your own Jupyter notebook. For more information, `click here `_.
+
+**Prerequisites**
+
+* Ensure that Kubernetes is deployed and all pods are running on the cluster.
+* Ensure that the MetalLB pod is up and running, as it provides the external IP for the Jupyterhub service.
+* Ensure the passed inventory file includes ``kube_control_plane`` and ``kube_node`` groups. `Click here <../../samplefiles.html>`_ for a sample file.
+* Review the ``tools/jupyterhub_config.yml`` file to ensure that the deployment meets your requirements. If not, modify the file.
+* Run ``local_repo.yml`` with the ``jupyter`` entry in ``software_config.json``.
+* Omnia deploys the ``quay.io/jupyterhub/k8s-singleuser-sample:3.2.0`` image irrespective of whether the intended notebooks are CPU-only, NVIDIA GPU, or AMD GPU. To use a custom image, modify the ``omnia/tools/jupyterhub_config.yml`` file.
+* Ensure that the NFS storage provisioner has been deployed on the cluster using ``storage.yml`` followed by ``scheduler.yml`` or ``omnia.yml``. Verify that the required NFS storage provisioner is deployed using the following command: ::
+
+    [root@node3 ~]# kubectl get pod -A
+    NAMESPACE     NAME                                                         READY   STATUS    RESTARTS      AGE
+    default       nfs-omnia-nfs-subdir-external-provisioner-54785fccd-9mp8z   1/1     Running   1 (12m ago)   3h24m
+
+* Verify that the default storage class is set to ``nfs-client`` for dynamic persistent volume provisioning. ::
+
+    [root@node3 ~]# kubectl get storageclass
+    NAME                   PROVISIONER                                               RECLAIMPOLICY   VOLUMEBINDINGMODE   ALLOWVOLUMEEXPANSION   AGE
+    nfs-client (default)   cluster.local/nfs-omnia-nfs-subdir-external-provisioner   Delete          Immediate           true                   17h
+
+
+**Deploying Jupyterhub**
+
+1. Change directories to the ``tools`` folder: ::
+
+    cd tools
+
+2. Run the ``jupyterhub.yml`` playbook using: ::
+
+    ansible-playbook jupyterhub.yml -i inventory
+
+.. note:: The default namespace for deployment is ``jupyterhub``.
+
+
+**Accessing the Jupyterhub GUI**
+
+1. Login to the ``kube_control_plane`` and verify that the Jupyterhub service is running.
+2. Find the IP address of the Jupyterhub service using:
+
+   ::
+
+    root@omnianode0000x:/usr/local# kubectl get svc -A
+    NAMESPACE    NAME           TYPE           CLUSTER-IP    EXTERNAL-IP   PORT(S)        AGE
+    default      kubernetes     ClusterIP      xx.xx.xx.xx                 443/TCP        2d2h
+    jupyterhub   hub            ClusterIP      xx.xx.xx.xx                 8081/TCP       2d2h
+    jupyterhub   proxy-api      ClusterIP      xx.xx.xx.xx                 8001/TCP       2d2h
+    jupyterhub   proxy-public   LoadBalancer   xx.xx.xx.xx   xx.xx.xx.xx   80:31134/TCP   2d2h
+
+   The IP address is listed against ``proxy-public`` under ``External IP``.
+
+3. The Jupyterhub GUI should be accessible from the ``kube_control_plane`` via the external IP mentioned above. Use any browser to log in. Currently, Jupyterhub authentication is not linked with OpenLDAP.
+
+..
image:: ../../../images/Jupyterhub_Login.png + +4. Choose your preferred notebook server option and click **Start**. A pod will be created for the user. + +.. image:: ../../../images/Jupyterhub_UI.png + +.. role:: raw-role(raw) + + :format: html latex + +:raw-role:`
` + +.. image:: ../../../images/Jupyterhub_UI_2.png + +**Stopping the Notebook server** + +1. Click **File > Hub Control Panel**. +2. Select **Stop Server**. + +.. note:: Stopping the notebook server only terminates the user pod. The user's data persists and can be accessed by logging in and starting the notebook server again. + +**Redeploy Jupyterhub with new configurations** + +1. Update the ``tools/jupyterhub_config.yml`` file with the new configuration. +2. Re-run the ``jupyterhub.yml`` playbook. :: + + cd tools + ansible-playbook jupyterhub.yml -i inventory + +**Clearing Jupyterhub configuration** + +Clear the existing configuration by running the below command: :: + + kubectl delete ns jupyterhub + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/Llama.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/Llama.rst new file mode 100644 index 000000000..fe2633499 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/Llama.rst @@ -0,0 +1,81 @@ +Deploying Dell Enterprise Pretrained model on the cluster +=========================================================== + +This guide provides a step-by-step approach to deploy the pretrained model from `Dell Enterprise Hub `_. The Meta-Llama-3.1-8b-Instruct model will be deployed as a sample model on a ``kube_control_plane`` node, specifically optimized for **NVIDIA** platforms. The model is containerized and validated to run seamlessly on the latest Dell hardware. By following this documentation, users can deploy the model, run inferences, or delete the service using a standalone Python script. + +The Python script is located in the ``omnia/example/ai_examples/nvidia/dell_pretrained_model`` directory and is named ``dell_pretrained_model_nvidia.py``. + +Prerequisites +-------------- + +Before deployment, the following prerequisites must be fulfilled: + + +1. This sample Meta-Llama-3.1-8b-Instruct pretrained model needs at least one NVIDIA GPU. This means that the cluster should have at least one node with an NVIDIA GPU to deploy this model. If you switch to a different model, the GPU requirement might vary. + +2. Kubernetes must be installed and configured on the cluster. + +3. The cluster must have access to the public internet in order to download the model image. If your cluster has a proxy server set up, refer to the page `here <../pullimagestonodes.html>`_ for enabling internet access via that proxy. + +4. Python 3.x must be installed on the cluster. The script relies on several Python modules, including: + + * **Standard Library Modules**: Already included with Python 3.x (subprocess, time, argparse, logging, sys, ipaddress). + * **Third-Party Modules**: The requests module must be installed manually if not available by default. You can install it using the following command: :: + + pip install requests + +.. note:: If you're executing the script within the Omnia virtual environment, the requests module is already installed and available on the cluster. In case you run the script outside of the Omnia virtual environment, you might need to install the module manually. + +5. The ``dell_pretrained_model_nvidia.py`` file present in the ``omnia/example/ai_examples/nvidia/dell_pretrained_model`` directory must be copied to the ``kube_control_plane`` from the OIM server. + + +Usage Instructions +-------------------- + +Follow the below steps to use the Python script in order to deploy, infer, or delete the model service on your Kubernetes cluster: + +1.
**Deploy the model and service** + + To deploy the model and create the associated service, run: :: + + python3 dell_pretrained_model_nvidia.py --deploy + +2. **Execute an inference job** + + * To execute an inference job from the ``kube_control_plane`` using the default query, run: :: + + python3 dell_pretrained_model_nvidia.py --infer + + * To execute an inference job from the ``kube_control_plane`` using a specific query, run: :: + + python3 dell_pretrained_model_nvidia.py --infer "<query>" + + * To execute an inference job from outside of the ``kube_control_plane`` using a specific service IP and the default query, run: :: + + python3 dell_pretrained_model_nvidia.py --infer --service-ip <service_IP> + + * To execute an inference job from outside of the ``kube_control_plane`` using a specific service IP and a specific query, run: :: + + python3 dell_pretrained_model_nvidia.py --infer "<query>" --service-ip <service_IP> + + .. note:: If you're not aware of the ``service_IP`` of the pretrained model service, use the following command: :: + + kubectl get svc pretrained-model-service + + *Where the service IP address will be listed under the EXTERNAL-IP column.* + +3. **Delete the deployed model and service** + + To delete the deployed model and service, run: :: + + python3 dell_pretrained_model_nvidia.py --delete + +Additional Instructions +------------------------- + +* **Model selection**: To select and download a model from the Dell registry, visit `Dell's Hugging Face Hub `_. Log in using your Hugging Face Hub account credentials to access the models. +* **Deploying other models**: You can deploy other models from the Dell registry for NVIDIA platforms by modifying the ``PRETRAINED_MODEL_CONFIG`` section of the ``dell_pretrained_model_nvidia.py`` file with the desired model image. Ensure the new service name does not conflict with any existing service names, and verify that all other configurations and resource requirements are correct as per the model specifications. +* **Hugging Face token**: If the model requires a Hugging Face token, replace the ``user_HF_token`` value in the ``PRETRAINED_MODEL_CONFIG`` section of the ``dell_pretrained_model_nvidia.py`` file with the correct token. +* **Resource availability**: Ensure that sufficient computational resources (GPU, memory) are available on the Kubernetes nodes to deploy and run the models effectively. This includes verifying that the nodes meet the model's requirements for optimal performance. + + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/Pytorch.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/Pytorch.rst new file mode 100644 index 000000000..b5a2c3f2a --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/Pytorch.rst @@ -0,0 +1,84 @@ +Setup PyTorch +--------------- + +PyTorch is a popular open-source deep learning framework, renowned for its dynamic computation graph that enhances flexibility and ease of use, making it a preferred choice for researchers and developers. With strong community support, PyTorch facilitates seamless experimentation and rapid prototyping in the field of machine learning. + + +**Prerequisites** + +* Ensure nerdctl is available on all cluster nodes. + +* If GPUs are present on the target nodes, install NVIDIA CUDA (with containerd) or AMD ROCm drivers during provisioning. CPUs do not require any additional drivers. + +* Use ``local_repo.yml`` to create an offline PyTorch repository. + + + +**[Optional prerequisites]** + +* Ensure the system has enough space.
+ +* Ensure the inventory file includes a ``kube_control_plane`` and a ``kube_node`` listing all cluster nodes. `Click here <../../samplefiles.html>`_ for a sample file. + +* Nerdctl does not support mounting directories as devices because it is not a feature of containerd (runtime that nerdctl uses). Individual files need to be attached while running nerdctl. + + +**Deploying PyTorch** + +1. Change directories to the ``tools`` folder: :: + + cd tools + +2. Run the ``pytorch.yml`` playbook: :: + + ansible-playbook pytorch.yml -i inventory + +.. note:: During the ``pytorch.yml`` playbook execution, nodes with AMD or NVIDIA GPUs and drivers will install and test either the ``pytorch-AMD`` or ``pytorch-Nvidia`` containers, respectively. If neither GPU type is present with its drivers, it will install and test the ``pytorch-CPU`` container. + +**Accessing PyTorch (CPU)** + +1. Verify that the PyTorch image is present in container engine images: :: + + nerdctl images + +2. Use the container image per your needs: :: + + nerdctl run -it --rm pytorch/pytorch:latest + +For more information, `click here `_. + + +**Accessing PyTorch (AMD GPU)** + +1. Verify that the PyTorch image is present in container engine images: :: + + nerdctl images + +2. Use the container image per your needs: :: + + nerdctl run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device /dev/dri/card0 --device /dev/dri/card1 --device /dev/dri/card2 --device /dev/dri/renderD128 --device /dev/dri/renderD129 --group-add video --ipc=host --shm-size 8G rocm/pytorch:latest + +For more information, `click here `_. + +**Accessing PyTorch (NVIDIA GPU)** + +1. Verify that the PyTorch image is present in container engine images: :: + + nerdctl images + +2. Use the container image per your needs: :: + + nerdctl run --gpus all -it --rm nvcr.io/nvidia/pytorch:23.12-py3 + +For more information, `click here `_. + +**Accessing PyTorch (Intel Gaudi accelerator)** + +1. Verify that the PyTorch image is present in container engine images: :: + + nerdctl images + +2. Use the container image per your needs: :: + + nerdctl run -it --privileged -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.1/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/TensorFlow.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/TensorFlow.rst new file mode 100644 index 000000000..42faec9d5 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/TensorFlow.rst @@ -0,0 +1,77 @@ +Setup TensorFlow +----------------- + +TensorFlow is a widely-used open-source deep learning framework, recognized for its static computation graph that optimizes performance and scalability, making it a favored choice for deploying machine learning models at scale in various industries. + +With an Ansible script, deploy TensorFlow on both ``kube_node`` and the ``kube_control_plane``. After the deployment of TensorFlow, you gain access to the TensorFlow container. + + +**Prerequisites** + +* Ensure nerdctl is available on all cluster nodes. + +* If GPUs are present on the target nodes, install NVIDIA CUDA (with containerd) or AMD ROCm drivers during provisioning. CPUs do not require any additional drivers. + +* Use ``local_repo.yml`` to create an offline TensorFlow repository. + +**[Optional prerequisites]** + +* Ensure the system has enough space. 
+ +* Ensure the inventory file includes a ``kube_control_plane`` and a ``kube_node`` listing all cluster nodes. `Click here <../../samplefiles.html>`_ for a sample file. + +* Nerdctl does not support mounting directories as devices because it is not a feature of containerd (the runtime that nerdctl uses). Individual files need to be attached while running nerdctl. + +* Container Network Interface should be enabled with nerdctl. + + +**Deploying TensorFlow** + +1. Change directories to the ``tools`` folder: :: + + cd tools + +2. Run the ``tensorflow.yml`` playbook: :: + + ansible-playbook tensorflow.yml -i inventory + +.. note:: During the ``tensorflow.yml`` playbook execution, nodes with AMD or NVIDIA GPUs and drivers will install and test either the ``tensorflow-AMD`` or ``tensorflow-Nvidia`` containers, respectively. If neither GPU type is present with its drivers, it will install and test the ``tensorflow-CPU`` container. + +**Accessing TensorFlow (CPU)** + +1. Verify that the TensorFlow image is present in container engine images: :: + + nerdctl images + +2. Use the container image per your needs: :: + + nerdctl run -it --rm tensorflow/tensorflow + +For more information, `click here `_. + + +**Accessing TensorFlow (AMD)** + +1. Verify that the TensorFlow image is present in container engine images: :: + + nerdctl images + +2. Use the container image per your needs: :: + + nerdctl run -it --network=host --device=/dev/kfd --device /dev/dri/card0 --device /dev/dri/card1 --device /dev/dri/card2 --device /dev/dri/renderD128 --device /dev/dri/renderD129 --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined rocm/tensorflow:latest + +For more information, `click here `_. + +**Accessing TensorFlow (NVIDIA)** + +1. Verify that the TensorFlow image is present in container engine images: :: + + nerdctl images + +2. Use the container image per your needs: :: + + nerdctl run --gpus all -it --rm nvcr.io/nvidia/tensorflow:23.12-tf2-py3 + + +For more information, `click here `_. + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/index.rst new file mode 100644 index 000000000..d812abae7 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/index.rst @@ -0,0 +1,17 @@ +Step 6: Install AI tools +=============================== + +AI (Artificial Intelligence) tools are software applications or systems that use AI technologies such as machine learning, natural language processing (NLP), computer vision, and deep learning to perform various tasks autonomously or with human interaction. These tools are designed to mimic human intelligence and can be used across different industries and domains for purposes such as automation, data analysis, decision-making, and more. + +.. caution:: Omnia targets all nodes that appear in the Kubernetes inventory when deploying the desired AI toolset; that is, the AI tool will be deployed on every Kubernetes node mentioned in the inventory. Make sure to mention all the desired nodes in the Kubernetes inventory file while deploying the AI tools via their respective playbooks. For more information on how to set up Kubernetes, `click here <../OmniaCluster/BuildingCluster/install_kubernetes.html>`_. + +..
toctree:: + + InstallJupyterhub + kubeflow + Llama + DeepSpeed + vLLM/index + Pytorch + TensorFlow + kserve \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/kserve.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/kserve.rst new file mode 100644 index 000000000..90d59266e --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/kserve.rst @@ -0,0 +1,151 @@ +Setup Kserve +-------------- + +Kserve is an open-source serving platform that simplifies the deployment, scaling, and management of machine learning models in production environments, ensuring efficient and reliable inference capabilities. For more information, `click here. `_ Omnia deploys Kserve (v0.13.0) on the Kubernetes cluster. Once Kserve is deployed, any inference service can be installed on the Kubernetes cluster. + +.. note:: Omnia 1.7 does not support deploying both Kserve and Kubeflow in the same Kubernetes cluster. If Kubeflow is already deployed on the cluster and you wish to deploy Kserve, you must first remove Kubeflow by following the steps `here `_. + +**Prerequisites** + + * Ensure that Kubernetes is deployed and all pods are running on the cluster. + + * It is advisable not to deploy Kserve immediately after deploying Kubernetes. Omnia suggests allowing a 10-minute gap after Kubernetes installation for Kubernetes pods to stabilize. + + * Ensure that the MetalLB pod is up and running to provide an external IP to ``istio-ingressgateway``. + + * The domain name on the Kubernetes cluster should be ``cluster.local``. The Kserve inference service will not work with a custom ``cluster_name`` property on the Kubernetes cluster. + + * Run ``local_repo.yml`` with a ``kserve`` entry in ``software_config.json``. + + * Ensure the passed inventory file includes ``kube_control_plane`` and ``kube_node`` groups. `Click here <../../samplefiles.html>`_ for a sample file. + + * To access NVIDIA or AMD GPU accelerators for inferencing, the Kubernetes NVIDIA or AMD GPU device plugin pods should be in a running state. Kserve deployment does not deploy GPU device plugins. + +**Deploy Kserve** + + 1. Change directories to ``tools`` :: + + cd tools + + 2. Run the ``kserve.yml`` playbook: :: + + ansible-playbook kserve.yml -i inventory + + Post deployment, the following dependencies are installed along with Kserve: + + * Istio (version: 1.20.4) + * Certificate manager (version: 1.14.5) + * Knative (version: 1.13.1) + + To verify the installation, run ``kubectl get pod -A`` and look for the namespaces: ``cert-manager``, ``istio-system``, ``knative-serving``, and ``kserve``.
:: + + root@sparknode1:/tmp# kubectl get pod -A + NAMESPACE NAME READY STATUS RESTARTS AGE + cert-manager cert-manager-5d999567d7-mfgdk 1/1 Running 0 44h + cert-manager cert-manager-cainjector-5d755dcf56-877dm 1/1 Running 0 44h + cert-manager cert-manager-webhook-7f7b47c4d4-qzjst 1/1 Running 0 44h + default model-store-pod 1/1 Running 0 43h + default sklearn-pvc-predictor-00001-deployment-667d9f764c-clkbn 2/2 Running 0 43h + istio-system istio-ingressgateway-79cc8bf885-lqgm7 1/1 Running 0 44h + istio-system istiod-777dc7ffbc-b4plt 1/1 Running 0 44h + knative-serving activator-59dff6d45c-28t2x 1/1 Running 0 44h + knative-serving autoscaler-dbf4d8d66-4wj8f 1/1 Running 0 44h + knative-serving controller-6bfd96676f-rdlxl 1/1 Running 0 44h + knative-serving net-istio-controller-6ff9b86f6b-9trb8 1/1 Running 0 44h + knative-serving net-istio-webhook-845d4d74b4-r9d8z 1/1 Running 0 44h + knative-serving webhook-678bd64859-q4ghb 1/1 Running 0 44h + kserve kserve-controller-manager-f9c5984c5-xz7lp 2/2 Running 0 44h + +**Deploy inference service** + +**Prerequisites** + + * To deploy a model joblib file with PVC as model storage, `click here `_ + * As part of Kserve deployment, Omnia deploys ``ClusterStorageContainer`` for supporting inference model download from the following endpoints: + + * prefix: gs:// + * prefix: s3:// + * prefix: hdfs:// + * prefix: webhdfs:// + * regex: https://(.+?).blob.core.windows.net/(.+) + * regex: https://(.+?).file.core.windows.net/(.+) + * regex: "https?://(.+)/(.+)" + + * Pull the intended inference model and the corresponding runtime-specific images into the nodes. + * As part of the deployment, Omnia deploys `standard model runtimes. `_ To deploy a custom model, you might need to deploy required model runtime first. + * To avoid problems with image to digest mapping when pulling inference runtime images, make the following config map changes: + + + 1. Edit ``knative-serving`` config map by executing the following command: :: + + kubectl edit configmap -n knative-serving config-deployment + + 2. Add ``docker.io`` and ``index.docker.io`` as part of ``registries-skipping-tag-resolving`` + + .. image:: ../../../images/kserve_config_map.png + + For more information, `click here. <../../../Troubleshooting/KnownIssues/Common/AITools.html>`_ + +**Access the inference service** + +1. Deploy the inference service and verify that the service is up and running using the command: ``kubectl get isvc -A``. :: + + root@sparknode1:/tmp# kubectl get isvc -A + NAMESPACE NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE + default sklearn-pvc http://sklearn-pvc.default.example.com True 100 sklearn-pvc-predictor-00001 9m18s + + +2. Use ``kubectl get svc -A`` to check the external IP of the service ``istio-ingressgateway``. :: + + root@sparknode1:/tmp# kubectl get svc -n istio-system + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + istio-ingressgateway LoadBalancer 10.233.30.227 10.20.0.101 15021:32743/TCP,80:30134/TCP,443:32241/TCP 44h + istiod ClusterIP 10.233.18.185 15010/TCP,15012/TCP,443/TCP,15014/TCP 44h + knative-local-gateway ClusterIP 10.233.37.248 80/TCP 44h + +3. 
To access inferencing from the ingressgateway with the HOST header, run the below command from the ``kube_control_plane`` or ``kube_node``: :: + + curl -v -H "Host: <inference_service_url>" -H "Content-Type: application/json" "http://<istio_ingress_external_IP>:<port>/v1/models/<model_name>:predict" -d @./iris-input.json + +For example: :: + + root@sparknode2:/tmp# curl -v -H "Host: sklearn-pvc.default.example.com" -H "Content-Type: application/json" "http://10.20.0.101:80/v1/models/sklearn-pvc:predict" -d @./iris-input.json + * Trying 10.20.0.101:80... + * Connected to 10.20.0.101 (10.20.0.101) port 80 (#0) + > POST /v1/models/sklearn-pvc:predict HTTP/1.1 + > Host: sklearn-pvc.default.example.com + > User-Agent: curl/7.81.0 + > Accept: */* + > Content-Type: application/json + > Content-Length: 76 + > + * Mark bundle as not supporting multiuse + < HTTP/1.1 200 OK + < content-length: 21 + < content-type: application/json + < date: Sat, 16 Mar 2024 09:36:31 GMT + < server: istio-envoy + < x-envoy-upstream-service-time: 7 + < + * Connection #0 to host 10.20.0.101 left intact + {"predictions":[1,1]} + +.. note:: Refer to `image pull <../pullimagestonodes.html>`_ in case of an ImagePullBackOff issue while deploying the inference service. + +**Remove Kserve** + + 1. Delete all artifacts from the namespaces, by entering the following commands: + + * ``kubectl delete all --all --namespace kserve`` + * ``kubectl delete all --all --namespace knative-serving`` + * ``kubectl delete all --all --namespace istio-system`` + * ``kubectl delete all --all --namespace cert-manager`` + + 2. Delete the namespaces, by entering the following commands: + + * ``kubectl delete ns kserve`` + * ``kubectl delete ns knative-serving`` + * ``kubectl delete ns istio-system`` + * ``kubectl delete ns cert-manager`` + +.. warning:: Please be careful about any other required deployments sharing the above namespaces. Deleting artifacts using ``--all`` will delete all artifacts in the namespace. \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/kubeflow.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/kubeflow.rst new file mode 100644 index 000000000..2dfcb3123 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/kubeflow.rst @@ -0,0 +1,98 @@ +Setup Kubeflow +--------------- + +Kubeflow is an open-source platform for machine learning and MLOps on Kubernetes, introduced by Google. + +.. note:: Omnia 1.7 does not support deploying both Kserve and Kubeflow in the same Kubernetes cluster. If Kserve is already deployed on the cluster and you wish to deploy Kubeflow, you must first remove Kserve by following the steps `here `_. + +**Prerequisite** + +Ensure that you have executed ``local_repo.yml`` with Kubeflow specified in the ``software_config.json`` file. + +**Deploy Kubernetes** + +First, ensure that you have a Kubernetes cluster deployed on your compute nodes. + +For instructions to set up Kubernetes, `click here <../OmniaCluster/BuildingCluster/install_kubernetes.html>`_. + +.. note:: The playbooks automate the process, ensuring consistency across deployments. + +**Deploy Kubeflow** + +1. Change directories to ``tools``: :: + + cd tools + +2. Execute the ``kubeflow.yml`` playbook: :: + + ansible-playbook kubeflow.yml -i inventory + +Sample inventory: :: + + [kube_control_plane] + + 10.5.1.101 + + [kube_node] + + 10.5.1.102 + + 10.5.1.103 + +.. Note:: Ensure that the inventory format aligns with the Kubernetes installation on the cluster.
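+As a quick sanity check before running the playbook (a minimal sketch, assuming your inventory file is named ``inventory``), you can confirm that Ansible resolves the expected groups and hosts: :: + + ansible-inventory -i inventory --list + ansible kube_control_plane -i inventory --list-hosts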
+ +**Verify the status of Kubeflow installation** + +To verify the status of the Kubeflow installation, use the following command: :: + + kubectl get pod -n kubeflow + +**Obtain External IP of Ingress Gateway** + +Once Kubeflow is deployed, you need to obtain the external IP address of the ingress gateway. Check the external IP address of the ingress gateway using command-line tools like ``kubectl``. This IP address will be used to access the Kubeflow dashboard. Run the following command: :: + + kubectl get svc -A + +**Accessing the Kubeflow Dashboard** + +After obtaining the external IP address of the ingress gateway, you can access the Kubeflow dashboard using a web browser. + + Instructions to access the Kubeflow dashboard: + + * Open any browser of your choice and go to ``http://external_ip:80``. + * You will be redirected to the Dex login page. You can find a sample image below. + + .. image:: ../../../images/dex_login.png + +**Login to the Kubeflow dashboard** + +To log in to the Kubeflow dashboard and start using its features, you need to provide the default username and password. For more information, `click here `_. + +The Kubeflow manifest documentation is present `here `_. + +.. note:: + + * Refer to `image pull <../pullimagestonodes.html>`_ in case of an ImagePullBackOff issue while deploying any user defined task. + * To configure Kubeflow mpi operator version v2beta1, `click here `_. + +**Remove Kubeflow** + + 1. Change directory to ``/opt/omnia/kubeflow/kubeflow``. + + 2. Run either of the following commands: + + a. :: + + while ! /opt/omnia/kustomize/kustomize build example | kubectl delete -f -; do echo "Retrying to delete resources"; sleep 10; done + + The above command tries to delete resources in a loop. You can verify that all resources are deleted and halt the command's execution if it doesn't stop automatically after some time. + + b. :: + + /opt/omnia/kustomize/kustomize build example | kubectl delete -f - + + The second command does not utilize a loop and can be used as well, but the user needs to ensure that all resources are deleted. Re-run the command until all resources are deleted. + + .. note:: If any pods are found under the namespace ``kubeflow-user-example-com``, delete the namespace with the following command: :: + + kubectl delete namespace kubeflow-user-example-com diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/mpi_operator_config.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/mpi_operator_config.rst new file mode 100644 index 000000000..cb6f082d7 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/mpi_operator_config.rst @@ -0,0 +1,48 @@ +MPI-Operator configuration for DeepSpeed deployment +======================================================= + +While deploying Kubernetes on a cluster, Omnia sets the *mpi-operator* API version to ``v2beta1``. However, if you choose to deploy Kubeflow on that same Kubernetes cluster, the *mpi-operator* API version automatically changes to ``v1``. + +In order to configure Kubeflow with *mpi-operator* API version v2beta1, execute the following commands: :: + + cd tools + ansible-playbook configure_mpi_operator.yml -i <inventory> --tags mpiv2beta1 + +*Expected result*: The mpi-operator API version v1 and the training operator of Kubeflow are uninstalled. The mpi-operator API version v2beta1 is installed.
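+To confirm which *mpi-operator* API version is currently being served (a minimal check, reusing the ``kubectl explain`` command shown in the DeepSpeed guide above), run: :: + + kubectl explain mpijob --api-version=kubeflow.org/v2beta1 + kubectl api-resources | grep -i mpijob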
+ +[Optional] Revert back to the default configuration +------------------------------------------------------ + +If you want to revert back to the default configuration, execute the following commands step-by-step: + +* Step 1: :: + + kubectl delete -f <file_name>.yml + +*where <file_name>.yml is the YAML configuration file applied to deploy the DeepSpeed MPIJob.* + +* Step 2: :: + + kubectl delete -f <pvc_file_name>.yml + +*where <pvc_file_name>.yml is the PVC configuration file applied to deploy the DeepSpeed MPIJob.* + +* Step 3: :: + + kubectl delete ns workloads + +* Step 4: :: + + cd tools + ansible-playbook configure_mpi_operator.yml -i <inventory> --tags mpiv1 + +*Expected result*: + +In the process, the following actions are performed: + +* The YAML configuration file used to deploy the DeepSpeed MPIJob is deleted. +* The PVC configuration file is deleted. +* The namespace for DeepSpeed jobs is deleted. +* The mpi-operator API version v2beta1 is uninstalled. +* The mpi-operator API version v1 is installed. +* The training operator of Kubeflow is also installed. \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/HuggingFace.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/HuggingFace.rst new file mode 100644 index 000000000..f173baa56 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/HuggingFace.rst @@ -0,0 +1,189 @@ +Hugging Face environment setup +------------------------------- + +Use the following command to set up the Hugging Face environment variables: + +:: + + nerdctl run -it --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri/card0 --device /dev/dri/card1 --device /dev/dri/renderD128 -v /opt/omnia/:/app/model --env "HUGGING_FACE_HUB_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxx" vllm-rocm:latest bash + +By default, vLLM automatically retrieves models from HuggingFace. If you prefer to utilize models from ModelScope, set the ``VLLM_USE_MODELSCOPE`` environment variable to ``True`` as shown below: + +:: + + export VLLM_USE_MODELSCOPE=True + +**Quick start** + +For a complete list of quick start examples, `click here `_. + +**Endpoint** + +1. *Using api_server* + + * Execute the following command to enable the ``api_server`` inference endpoint inside the container. + + :: + + python -m vllm.entrypoints.api_server --model facebook/opt-125m + + Expected output + + :: + + INFO 01-17 20:25:21 llm_engine.py:73] Initializing an LLM engine with config: model='meta-llama/Llama-2-13b-chat-hf', tokenizer='meta-llama/Llama-2-13b-chat-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=pt, tensor_parallel_size=1, quantization=None, seed=0) + + INFO 01-17 20:25:21 tokenizer.py:32] For some LLaMA V1 models, initializing the fast tokenizer may take a long time. To reduce the initialization time, consider using 'hf-internal-testing/llama-tokenizer' instead of the original tokenizer. + + WARNING[XFORMERS]: xFormers can't load C++/CUDA extensions. xFormers was built for: + + PyTorch 2.1.1+cu121 with CUDA 1201 (you have 2.0.1+gita61a294) + + Python 3.10.13 (you have 3.10.13) + + Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers) + + Memory-efficient attention, SwiGLU, sparse and more won't be available. + + Set XFORMERS_MORE_DETAILS=1 for more details + + MegaBlocks not found. Please install it by `pip install megablocks`.
+ + STK not found: please see https://github.com/stanford-futuredata/stk + + /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/cuda/__init__.py:546: UserWarning: Can't initialize NVML + + warnings.warn("Can't initialize NVML") + + INFO 01-17 20:25:37 llm_engine.py:222] # GPU blocks: 2642, # CPU blocks: 327 + + INFO: Started server process [10] + + INFO: Waiting for application startup. + + INFO: Application startup complete. + + INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) + + * You can also directly execute the following command on the compute node to enable the ``api_server`` endpoint. + + :: + + nerdctl run -d --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri/card0 --device /dev/dri/card1 --device /dev/dri/renderD128 -v /opt/omnia/:/app/model docker.io/embeddedllminfo/vllm-rocm:vllm-v0.2.4 /bin/bash -c 'export http_proxy=http://<proxy_server_IP>:3128 && export https_proxy=http://<proxy_server_IP>:3128 && python -m vllm.entrypoints.api_server --model facebook/opt-125m' + + * Once the above command is executed, vLLM is enabled on port 8000. Users can now utilize the endpoint to communicate with the model. + + Endpoint example: + + :: + + kmarks@canihipify2:~$ curl http://localhost:8000/generate \ + + -d '{ + + "prompt": "San Francisco is a", + + "use_beam_search": true, + + "n": 4, + + "temperature": 0 + + }' + + Expected output: + + :: + + {"text":["San Francisco is a city of neighborhoods, each with its own unique character and charm. Here are","San Francisco is a city in California that is known for its iconic landmarks, vibrant","San Francisco is a city of neighborhoods, each with its own unique character and charm. From the","San Francisco is a city in California that is known for its vibrant culture, diverse neighborhoods"]} + + .. note:: Replace ``localhost`` with ``node_ip`` while accessing an external node. + +2. *Using the OpenAI API* + + * **OpenAI-Compatible Server** + + vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using the OpenAI API. By default, it starts the server at http://localhost:8000. You can specify the address with the ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements list models, create chat completion, and create completion endpoints. We are actively adding support for more endpoints. + + * Run the following command: + + :: + + nerdctl run -it --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri/card0 --device /dev/dri/card1 --device /dev/dri/renderD128 -v /opt/omnia/:/app/model docker.io/embeddedllminfo/vllm-rocm:vllm-v0.2.4 /bin/bash -c 'export http_proxy=http://<proxy_server_IP>:3128 && export https_proxy=http://<proxy_server_IP>:3128 && python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m' + + Expected output: + + :: + + INFO: Started server process [259] + + INFO: Waiting for application startup. + + INFO: Application startup complete. + + INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) + + * To install the OpenAI client, run the following command with root privileges from the host entity. + + :: + + pip install openai + + * Run the following command to view the Python file: + + :: + + cat vivllmamd.py + + :: + + # Modify OpenAI's API key and API base to use vLLM's API server.
+ + from openai import OpenAI + + openai_api_key = "EMPTY" + + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + + api_key=openai_api_key, + + base_url=openai_api_base, + + ) + + + stream = client.chat.completions.create( + + model="meta-llama/Llama-2-13b-chat-hf", + + messages=[{"role": "user", "content": "Explain the differences between Navy Diver and EOD rate card"}], + + max_tokens=4000, + + stream=True, + + ) + + + # Print each streamed chunk as it arrives. + + for chunk in stream: + + if chunk.choices[0].delta.content is not None: + + print(chunk.choices[0].delta.content, end="") + + * Run the following command: + + :: + + python3 vivllmamd.py + + Expected output: + + :: + + Navy Divers and Explosive Ordnance Disposal (EOD) technicians are both specialized careers in the + + ................................................................................[approx 15 lines] + + have distinct differences in their training, responsibilities, and job requirements. \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/benchmarktesting.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/benchmarktesting.rst new file mode 100644 index 000000000..00901da68 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/benchmarktesting.rst @@ -0,0 +1,5 @@ +For benchmark testing +---------------------- + +1. Navigate to ``vllm/benchmarks/`` inside the container. +2. Modify the Python files (.py) to perform benchmark testing. \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/index.rst new file mode 100644 index 000000000..20e47a7b5 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/index.rst @@ -0,0 +1,100 @@ +Setup vLLM +---------- + +vLLM is a fast and easy-to-use library for LLM inference and serving. It is seamlessly integrated with popular HuggingFace models. It is also compatible with OpenAI API servers and GPUs (both NVIDIA and AMD). vLLM 0.2.4 and above supports model inferencing and serving on AMD GPUs with ROCm. At the moment, AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported. Data types currently supported in ROCm are FP16 and BF16. + +For NVIDIA, vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. + +Omnia deploys vLLM on both the ``kube_node`` and ``kube_control_plane``, using an ansible script. After the deployment of vLLM, access the vLLM container (AMD GPU) and import the vLLM Python package (NVIDIA GPU). For more information, `click here `_ + +.. note:: This playbook is only supported on the Ubuntu 22.04 OS platform. + +**Prerequisites** + +* Ensure nerdctl registry is available on all cluster nodes. + +* Only AMD MI200s (gfx90a) and newer GPUs are supported. + +* For nodes with NVIDIA GPUs, ensure that the GPU has a minimum compute capability of 7.0 (Volta architecture). A few examples of such NVIDIA GPUs are: T4, A100, L4, H100. + +* Ensure the ``kube_node`` and ``kube_control_plane`` are set up and running. If NVIDIA or AMD GPU acceleration is required for the task, install the NVIDIA (with containerd) or AMD ROCm GPU drivers during provisioning. + +* Use ``local_repo.yml`` to create an offline vLLM repository. For more information, `click here. <../../CreateLocalRepo/localrepos.html>`_ + +**[Optional prerequisites]** + +* Ensure the server has enough available space. (Approximately 100GB is required for the vLLM image.
Any additional scripting will take disk capacity outside the image.) + +* Ensure the provided inventory file has one ``kube_control_plane``, and that all cluster nodes are listed under ``kube_node``. + +* Update the ``/input/software_config.json`` file with the correct vLLM version required. The default value is ``vllm-v0.2.4`` for the AMD container and ``vllm latest`` for NVIDIA. + +* Omnia deploys the vLLM pip installation for NVIDIA GPUs, or the ``embeddedllminfo/vllm-rocm:vllm-v0.2.4`` container image for AMD GPUs. + +* **nerdctl** does not support mounting directories as devices because it is not a feature of containerd (the nerdctl runtime). Individual files need to be attached while running nerdctl. + + +**Deploying vLLM** + +1. Change directories to the ``tools`` folder: :: + + cd tools + +2. Run the ``vllm.yml`` playbook using: :: + + ansible-playbook vllm.yml -i inventory + +The default namespace for deployment is ``vLLM``. + +.. note:: During the ``vllm.yml`` playbook execution, nodes with AMD or NVIDIA GPUs and drivers will install and test either the ``vllm-AMD`` or ``vllm-Nvidia`` containers, respectively. + +**Accessing the vLLM (AMD)** + +1. Verify that the vLLM image is present in the container engine images: :: + + nerdctl images | grep vllm + +2. Run the container image using modifiers to customize the run: :: + + nerdctl run -it --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd --device /dev/dri/card0 --device /dev/dri/card1 --device /dev/dri/renderD128 -v /opt/omnia/:/app/model embeddedllminfo/vllm-rocm:vllm-v0.2.4 + +3. To enable an endpoint, `click here `_. + +**Accessing the vLLM (NVIDIA)** + +1. Verify that the vLLM package is installed: :: + + python3.11 -c "import vllm; print(vllm.__version__)" + +2. Use the package within a python script as demonstrated in the sample below: :: + + from vllm import LLM, SamplingParams + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + llm = LLM(model="mistralai/Mistral-7B-v0.1") + + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +3. To enable an endpoint, `click here `_. + +..
toctree:: + + vllmintelgaudi + vllmMI300 + vllmInternet + benchmarktesting + HuggingFace + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmInternet.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmInternet.rst new file mode 100644 index 000000000..a5198c306 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmInternet.rst @@ -0,0 +1,9 @@ +vLLM container internet enablement +----------------------------------- + +To enable internet access within the container, the user needs to export the ``http_proxy`` and ``https_proxy`` environment variables in the following format: + +:: + + export http_proxy=http://<proxy_server_IP>:3128 + export https_proxy=http://<proxy_server_IP>:3128 \ No newline at end of file diff --git a/docs/source/InstallationGuides/Platform/vLLM/vllmMI300.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmMI300.rst similarity index 83% rename from docs/source/InstallationGuides/Platform/vLLM/vllmMI300.rst rename to docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmMI300.rst index 77bd4cdef..0082d852b 100644 --- a/docs/source/InstallationGuides/Platform/vLLM/vllmMI300.rst +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmMI300.rst @@ -10,11 +10,11 @@ Follow the below steps to setup the vLLM: 1. **Build vLLM** -Run the ``vllm_build.yml`` playbook using + * Update the ``admin-nic-IP`` in the ``vllm_k8s_config.yml`` file located inside the ``omnia/utils/vllm_build`` directory. -:: + * Run the ``vllm_build.yml`` playbook using: :: - ansible-playbook vllm_build.yml + ansible-playbook vllm_build.yml 2. **Verify vLLM** diff --git a/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmintelgaudi.rst b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmintelgaudi.rst new file mode 100644 index 000000000..af2b86d7d --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/InstallAITools/vLLM/vllmintelgaudi.rst @@ -0,0 +1,118 @@ +vLLM enablement for clusters containing Intel Gaudi accelerators +=================================================================== + +Prerequisites +-------------- + +Before enabling the vLLM capabilities of the cluster running Intel Gaudi accelerators, the following prerequisites must be fulfilled: + +1. Verify that the cluster nodes have sufficient allocatable resources for the ``hugepages-2Mi`` and ``Intel Gaudi accelerator``. To check the allocatable resources on all nodes, run: :: + + kubectl describe node | grep -A 10 "Allocatable" + +2. [Optional] If required, you can adjust the resource parameters in the ``vllm_configuration.yml`` file based on the availability of resources on the nodes. + + +Deploy vLLM (Intel) +---------------------- + +After you have completed all the prerequisites, do the following to deploy vLLM on a cluster running with Intel Gaudi accelerators: + +1. Create a namespace to manage your vLLM workloads on the ``kube_control_plane``, according to the details provided in the ``vllm_configuration.yml`` file. Execute the following command: :: + + kubectl create ns workloads + +2. Verify that the namespace has been created by executing the following command: :: + + kubectl get namespace workloads + + *Expected output*: :: + + NAME STATUS AGE + workloads Active 45s + +3. To create a configuration file for vLLM deployment, follow these steps: + + a. Locate the ``vllm_configuration.yml`` file in the ``examples/ai_examples/intel/vllm`` folder. + b. Open the ``vllm_configuration.yml`` file. + c.
Add the necessary details such as the Hugging Face token and allocated resources for the vLLM deployment. + d. After modifying the file, you have two choices: + + - Directly copy the modified file to your ``kube_control_plane``. + - Create a new blank ``<file_name>.yml`` file, paste the modified contents into it, and save it on your ``kube_control_plane``. + + e. Finally, apply the file using the following command: :: + + kubectl apply -f <file_name>.yml + + *Expected output*: :: + + service/vllm-llama-svc created + deployment.apps/vllm-llama created + +4. To create and apply the Persistent Volume Claim (PVC) configuration file, required to access shared storage, follow these steps: + + a. Create a new blank ``<pvc_file_name>.yml`` file. + b. Paste the following content into it, and save it on your ``kube_control_plane``. :: + + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: shared-model + namespace: workloads + spec: + storageClassName: nfs-client + accessModes: + - ReadWriteOnce + resources: + requests: + storage: <storage_size> + + c. Add the necessary details such as name, namespace, and storage size for the vLLM deployment. Use the same configurations as provided in the ``<file_name>.yml`` file. + d. Finally, apply the file using the following command: :: + + kubectl apply -f <pvc_file_name>.yml + + *Expected output*: :: + + persistentvolumeclaim/shared-model created + +5. Verify the PVC is bound and available for the deployment using the following command: :: + + kubectl get pvc -n workloads + + *Expected output*: :: + + NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE + shared-model Bound pvc-0a066bce-9511-4f73-ac41-957a8088cfb0 400Gi RWX nfs-client 14s + +6. After some time, check the status of the pods again to verify if they are up and running. Execute the following command to get the pod status: :: + + kubectl get pod -n workloads + + *Expected output (when pods are running)*: :: + + NAME READY STATUS RESTARTS AGE + vllm-llama-669bbf5c9b-1h7jm 1/1 Running 0 58s + +7. After approximately 30 minutes, verify the service status of the vLLM deployment using the following command: :: + + kubectl get svc -n workloads + + *Expected output*: :: + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + vllm-llama-svc NodePort 10.233.13.108 8000:32195/TCP 71s + +8. Finally, verify the endpoints using the following command: :: + + kubectl get endpoints vllm-llama-svc -n workloads + + *Expected output*: :: + + NAME ENDPOINTS AGE + vllm-llama-svc 10.233.108.196:8000 82s + +*Final output*: + +Once the vLLM deployment is complete, the following output is displayed while executing the ``curl -X POST -d "param1=value1&param2=value2" <node_IP>:<port>`` command. \ No newline at end of file diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/AMD_ROCm.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/AMD_ROCm.rst new file mode 100644 index 000000000..b4a7c7892 --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/AMD_ROCm.rst @@ -0,0 +1,39 @@ +Install the ROCm platform for AMD GPUs +======================================= + +This playbook sets up the `AMD ROCm `_ platform on the clusters. This tool allows users to unlock the full potential of installed AMD GPUs. + +Ensure that the ROCm local repositories are configured using the `local_repo.yml <../../CreateLocalRepo/localrepos.html#configure-specific-local-repositories>`_ script. + +Ensure that the ``input/software_config.json`` contains valid ``amdgpu`` and ``rocm`` versions.
See `input parameters <../../CreateLocalRepo/InputParameters.html>`_ for more information. + +To install all the latest AMD GPU drivers and toolkits, run the ``omnia.yml`` playbook using the following command: :: + + cd omnia + ansible-playbook omnia.yml -i inventory + +The following configurations take place while executing ``rocm_installation.yml``: + + i. Servers with AMD GPUs are identified and the latest GPU drivers and ROCm platforms are downloaded and installed. + ii. Servers with no GPU are skipped. + +User permissions for ROCm platforms +------------------------------------ + +* To add a user to the ``render`` and ``video`` groups, use the following command: :: + + sudo usermod -a -G render,video <user_name> + +.. note:: + * ``<user_name>`` is the system name of the end user. + * This command must be run with ``root`` permissions. + * If the root user wants to provide access to other users and their individual GPU nodes, the previous command needs to be run on all of them separately. + +* To enable users to use ROCm tools, ensure that the ``/opt/rocm/bin/`` directory is included in their path, as shown in the sample file below: :: + + /opt/rocm/bin/ + +.. image:: ../../../../images/ROCm_user_permissions.png + +For any configuration changes, check out ROCm's official documentation `here. `_ + diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Authentication.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Authentication.rst new file mode 100644 index 000000000..5f34d8c1e --- /dev/null +++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Authentication.rst @@ -0,0 +1,205 @@ +Centralized authentication on the cluster +========================================== + +The security feature allows cluster admin users to set up OpenLDAP in order to allow or deny user access to the cluster. + +.. note:: FreeIPA configuration is not supported on Ubuntu (only supported on RHEL/Rocky Linux). OpenLDAP is provided as an alternative for Ubuntu. + +Configuring OpenLDAP security +_______________________________ + +**Prerequisites** + +* Ensure that the following entry is present in the ``input/software_config.json``: :: + + {"name": "openldap"} + +* Run ``local_repo.yml`` to create offline repositories of OpenLDAP. For more information, `click here <../../CreateLocalRepo/localrepos.html>`_. + +* Enter the following parameters in ``input/security_config.yml``: + +.. csv-table:: Parameters for Authentication + :file: ../../../../Tables/security_config.csv + :header-rows: 1 + :keepspace: + +.. csv-table:: Parameters for OpenLDAP configuration + :file: ../../../../Tables/security_config_ldap.csv + :header-rows: 1 + :keepspace: + +Running the security role +-------------------------- + +The wrapper playbook ``omnia.yml`` handles execution of the security or authentication role. Alternatively, execute the ``security.yml`` playbook: :: + + cd security + ansible-playbook security.yml -i inventory + +.. note:: To run the ``security.yml`` playbook independently from the ``omnia.yml`` playbook on Intel Gaudi nodes, start by executing the ``performance_profile.yml`` playbook. Once that’s done, you can run the ``security.yml`` playbook separately. + +The provided inventory should contain ``auth_server`` and ``login`` [optional] groups. The inventory file is case-sensitive. Follow the format provided in the `sample files <../../../samplefiles.html#inventory-file>`_. + + * Do not include the IP of the OIM or local host in the ``auth_server`` group of the inventory file.
+ * For `secure login node functionality `_, ensure that the ``login`` group is added in the provided inventory file. To customize the security features on the login node, update the desired parameters in ``input/login_node_security_config.yml``. + * If a subsequent run of ``security.yml`` fails, the ``security_config.yml`` file will be unencrypted. + +.. note:: + + * Installation of the OpenLDAP server on the OIM is not supported. + * Omnia sets up the internal OpenLDAP server for user authentication after the execution of the ``security.yml`` or ``omnia.yml`` playbook. If login fails for an OpenLDAP user, then check if the ``slapd-ltd.service`` is running on the authentication server. For more information, `click here <../../../../Troubleshooting/FAQ/Common/Security.html>`_. + +.. caution:: No users will be created by Omnia. + +Create a new user on OpenLDAP +----------------------------- + +1. Create an LDIF file (eg: ``create_user.ldif``) on the auth server containing the following information: + + * DN: The distinguished name that indicates where the user will be created. + * objectClass: The object class specifies the mandatory and optional attributes that can be associated with an entry of that class. Here, the values are ``inetOrgPerson``, ``posixAccount``, and ``shadowAccount``. + * UID: The username of the intended user. + * sn: The surname of the intended user. + * cn: The given name of the intended user. + +Below is a sample file: :: + + # User Creation + dn: uid=ldapuser,ou=People,dc=omnia,dc=test + objectClass: inetOrgPerson + objectClass: posixAccount + objectClass: shadowAccount + uid: ldapuser + cn: ldapuser + sn: ldapuser + loginShell: /bin/bash + uidNumber: 2000 + gidNumber: 2000 + homeDirectory: /home/ldapuser + shadowLastChange: 0 + shadowMax: 0 + shadowWarning: 0 + + # Group Creation + dn: cn=ldapuser,ou=Group,dc=omnia,dc=test + objectClass: posixGroup + cn: ldapuser + gidNumber: 2000 + memberUid: ldapuser + +.. note:: Avoid whitespaces when using an LDIF file for user creation. Extra spaces in the input data may be encrypted by OpenLDAP and cause access failures. + +2. Run the command ``ldapadd -D <bind_dn> -w <bind_password> -f create_user.ldif`` to execute the LDIF file and create the account. +3. To set up a password for this account, use the command ``ldappasswd -D <bind_dn> -w <bind_password> -S <user_dn>``. The value of ``user_dn`` is the distinguished name that indicates where the user was created. (In this example, ``uid=ldapuser,ou=People,dc=omnia,dc=test``) + +Setting up Passwordless SSH for the OpenLDAP users +----------------------------------------------------------- + +Once user accounts are created, admins can enable password-less SSH for users to run HPC jobs on the cluster nodes. + +.. note:: Once user accounts are created on the auth server, use the accounts to login to the cluster nodes to reset the password and create a corresponding home directory. + +To customize your setup of password-less SSH, input custom parameters in ``input/passwordless_ssh_config.yml``: + ++-----------------------+--------------------------------------------------------------------------------------------------------------------+ +| Parameter | Details | ++=======================+====================================================================================================================+ +| user_name | The list of users that require passwordless SSH. Separate the list of users using a comma.
| +| ``string`` | Example: ``user1,user2,user3`` | +| Required | | ++-----------------------+--------------------------------------------------------------------------------------------------------------------+ +| authentication_type | Indicates whether LDAP is in use on the cluster. | +| ``string`` | | +| Required | Choices: | +| | ``ldap`` <- Default | ++-----------------------+--------------------------------------------------------------------------------------------------------------------+ + + +Use the below command to enable password-less SSH: :: + + ansible-playbook user_passwordless_ssh.yml -i inventory + +Where inventory follows the format defined under inventory file in the provided `Sample Files <../../../sample files.html>`_. The inventory file is case-sensitive. Follow the format provided in the sample file link. + +.. caution:: Do not run SSH-keygen commands after password-less SSH is set up on the nodes. + +Configuring login node security +________________________________ + +**Prerequisites** + +* Ensure that the following entry is present in the ``input/software_config.json``: :: + + {"name": "secure_login_node"} + +* Run ``local_repo.yml`` to create an offline repository of all utilities used to secure the login node. For more information, `click here. <../../CreateLocalRepo/localrepos.html>`_ + +* For secure login node functionality, ensure to add the ``login`` group in the provided inventory file. + +Enter the following parameters in ``input/login_node_security_config.yml``. + ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Variable | Details | ++=============================+================================================================================================================================================================================+ +| **max_failures** | The number of login failures that can take place before the account is locked out. | +| ``integer`` | | +| Optional | **Default values**: ``3`` | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**failure_reset_interval** | Period (in seconds) after which the number of failed login attempts is reset. Min value: 30; Max value: 60. | +| ``integer`` | | +| Optional | **Default values**: ``60`` | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **lockout_duration** | Period (in seconds) for which users are locked out. Min value: 5; Max value: 10. | +| ``integer`` | | +| Optional | **Default values**: ``10`` | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**session_timeout** | User sessions that have been idle for a specific period can be ended automatically. Min value: 90; Max value: 180. 
| +| ``integer`` | | +| Optional | **Default values**: ``180`` | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**alert_email_address** | Email address used for sending alerts in case of authentication failure. When blank, authentication failure alerts are disabled. | +| ``string`` | User can mention multiple comma-separated alert email addresses. | +| Optional | **Example**: :: | +| | alert_email_address: "user1@domain.com,user2@domain.com" | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**smtp_server** | This parameter will be applicable only when ``alert_email_address`` is provided. | +| ``string`` | This variable contains the SMTP server details configured on the cluster, from where the email alerts would be sent in case of authentication failures. | +| Optional | Currently, Omnia only supports configuration of a single SMTP server on the cluster. The SMTP server should be reachable from the ``login_node`` to receive the email alerts. | +| | **Example**: :: | +| | smtp_server: | +| | - { host: "smtp-server.domain.com", port: "25", sender_address: "alert@domain.com" }" | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**user** | Access control list of users. Accepted formats are username@ip (root@1.2.3.4) or username (root). Multiple users can be separated using whitespaces. | +| ``string`` | | +| Optional | | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**allow_deny** | This variable decides whether users are to be allowed or denied access. Ensure that AllowUsers or DenyUsers entries on sshd configuration file are not commented. | +| ``string`` | | +| Optional | Choices: | +| | | +| | * ``allow`` <- Default | +| | * ``deny`` | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**restrict_program_support** | This variable is used to disable services. Root access is mandatory. | +| ``boolean`` | | +| Optional | Choices: | +| | | +| | * ``false`` <- Default | +| | * ``true`` | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|**restrict_softwares** | List of services to be disabled (Comma-separated). Example: 'telnet,lpd,bluetooth' | +| ``string`` | | +| Optional | Choices: | +| | | +| | * ``telnet`` | +| | * ``lpd`` | +| | * ``bluetooth`` | +| | * ``rlogin`` | +| | * ``rexec`` | ++-----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +Advanced Settings +------------------ + +* To replicate the OpenLDAP server `click here <../ReplicatingLDAP.html>`_. 
+
+* To set up the internal OpenLDAP server as a proxy, `click here <../OpenLDAP_proxy.html>`_.
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/BeeGFS.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/BeeGFS.rst
new file mode 100644
index 000000000..441f1175c
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/BeeGFS.rst
@@ -0,0 +1,75 @@
+BeeGFS bolt on
+--------------
+
+BeeGFS is a hardware-independent POSIX parallel file system (a.k.a. Software-defined Parallel Storage) developed with a strong focus on performance and designed for ease of use, simple installation, and management.
+
+.. image:: ../../../../../images/BeeGFS_Structure.jpg
+
+
+**Prerequisites before installing the BeeGFS client**
+
+* Ensure that the BeeGFS server is set up using the `linked steps <../../../../../Appendices/BeeGFSServer.html>`_.
+* Ensure that a ``connAuthFile`` is configured on the server as explained `here <../../../../../Appendices/BeeGFSServer.html>`_.
+
+.. caution:: Configuring a ``connAuthFile`` is now mandatory. Services will no longer start if a ``connAuthFile`` is not configured.
+
+* Ensure that the following ports are open for TCP and UDP connectivity:
+
+  +------+-----------------------------------+
+  | Port | Service                           |
+  +======+===================================+
+  | 8008 | Management service (beegfs-mgmtd) |
+  +------+-----------------------------------+
+  | 8003 | Storage service (beegfs-storage)  |
+  +------+-----------------------------------+
+  | 8004 | Client service (beegfs-client)    |
+  +------+-----------------------------------+
+  | 8005 | Metadata service (beegfs-meta)    |
+  +------+-----------------------------------+
+  | 8006 | Helper service (beegfs-helperd)   |
+  +------+-----------------------------------+
+
+
+To open the required ports, use the following steps:
+
+  1. ``firewall-cmd --permanent --zone=public --add-port=<port>/tcp``
+
+  2. ``firewall-cmd --permanent --zone=public --add-port=<port>/udp``
+
+  3. ``firewall-cmd --reload``
+
+  4. ``systemctl status firewalld``
+
+
+.. note:: BeeGFS services over RDMA are only supported on RHEL 8.3 and above due to limitations of BeeGFS. When setting up your cluster with RDMA support, check the BeeGFS documentation to provide appropriate values in ``input/storage_config.yml``.
+
+
+**Installing the BeeGFS client via Omnia**
+
+After the required parameters are filled in ``input/storage_config.yml``, Omnia installs BeeGFS on all nodes while executing the ``storage.yml`` playbook.
+
+.. caution:: Do not remove or comment any lines in the ``input/storage_config.yml`` file.
+
+.. csv-table:: Parameters for storage
+   :file: ../../../Tables/storage_config.csv
+   :header-rows: 1
+   :keepspace:
+
+.. note::
+    * BeeGFS client-server communication can take place over TCP or RDMA. If RDMA support is required, ``beegfs_rdma_support`` should be set to ``true``. Additionally, OFED should be installed on all cluster nodes.
+    * When BeeGFS communication happens over RDMA, the ``beegfs_mgmt_server`` should be provided with the Infiniband IP of the management server.
+    * The parameter inventory refers to the `inventory file <../../../../samplefiles.html>`_ listing all relevant nodes.
+
+If ``input/storage_config.yml`` is populated before running ``omnia.yml``, the BeeGFS client will be set up during the execution of ``omnia.yml``.
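+
+As a minimal illustration of the RDMA-related entries described in the note above, a hypothetical ``input/storage_config.yml`` snippet could look as follows (the parameter names are taken from the note above; the IP value is a placeholder): ::
+
+    # Enable RDMA for BeeGFS client-server communication (assumes OFED is installed on all cluster nodes)
+    beegfs_rdma_support: true
+    # Infiniband IP of the BeeGFS management server, required when communication happens over RDMA (placeholder value)
+    beegfs_mgmt_server: 10.10.0.101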
+
+If ``omnia.yml`` is not leveraged to set up BeeGFS, execute the ``storage.yml`` playbook: ::
+
+    cd storage
+    ansible-playbook storage.yml -i inventory
+
+.. note:: To run the ``storage.yml`` playbook independently from the ``omnia.yml`` playbook on Intel Gaudi nodes, start by executing the ``performance_profile.yml`` playbook. Once that’s done, you can run the ``storage.yml`` playbook separately.
+
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/NFS.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/NFS.rst
new file mode 100644
index 000000000..9a949e3ff
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/NFS.rst
@@ -0,0 +1,96 @@
+NFS
+====
+
+Network File System (NFS) is a networking protocol for distributed file sharing. A file system defines the way data in the form of files is stored and retrieved from storage devices, such as hard disk drives, solid-state drives, and tape drives. NFS extends this across networks, defining the way files are stored and retrieved from storage devices over a network.
+
+.. note:: NFS is a mandatory feature for all clusters set up by Omnia. Omnia sets up the NFS server and mounts the NFS client when the ``nfs_server`` value is ``true``.
+
+**Prerequisites**
+
+* NFS is set up on Omnia clusters based on the inputs provided in ``input/storage_config.yml``.
+
+  +-----------------------+-------------------------------------------------------------------------------+
+  | Parameter             | Details                                                                       |
+  +=======================+===============================================================================+
+  | **nfs_client_params** | * This JSON list contains all parameters required to set up NFS.             |
+  |                       | * For a bolt-on setup where there is a pre-existing NFS server,              |
+  | ``JSON List``         |   set ``nfs_server`` to ``false``.                                           |
+  |                       | * When ``nfs_server`` is set to ``true``, an NFS share is created on a       |
+  | Required              |   server IP in the cluster for access by all other cluster nodes.            |
+  |                       | * Ensure that the value of ``share_path`` in ``input/omnia_config.yml``      |
+  |                       |   matches at least one of the ``client_share_path`` values in the JSON       |
+  |                       |   list provided.                                                             |
+  +-----------------------+-------------------------------------------------------------------------------+
+
+  .. image:: ../../../../../images/nfs_flowchart.png
+
+  * The fields listed in ``nfs_client_params`` are:
+
+    - **server_ip**: IP of the intended NFS server. To set up an NFS server on the OIM, use the value ``localhost``. Use an IP address to configure access anywhere else.
+
+    - **server_share_path**: Folder exported by the NFS server.
+
+    - **client_share_path**: Target directory for the NFS mount on the client. If left empty, the respective ``server_share_path`` value will be used for ``client_share_path``.
+
+    - **nfs_server**: Indicates whether an external NFS server is available (``false``) or an NFS server will need to be created (``true``).
+
+    - **client_mount_options**: Indicates the NFS share mount options.
+
+    - **k8s_share**: Indicates whether the target cluster uses Kubernetes.
+
+  .. note:: To install any benchmarking software like UCX or OpenMPI, ``k8s_share`` must be set to ``true``.
+
+  To configure all cluster nodes to access a single external NFS server export, use the below sample: ::
+
+      - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false, slurm_share: true, k8s_share: true }
+
+  To configure the cluster nodes to access a new NFS server on the OIM as well as an external NFS server, use the below example: ::
+
+      - { server_ip: localhost, server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: true, k8s_share: true }
+      - { server_ip: 198.168.0.1, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false, slurm_share: true, k8s_share: true }
+
+  To configure the cluster nodes to access new NFS server exports on the cluster nodes, use the below sample: ::
+
+      - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: true, k8s_share: true }
+      - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: true, k8s_share: true }
+
+
+* Ensure that an NFS local repository is created by including ``{"name": "nfs"},`` in ``input/software_config.json``. For more information, `click here <../../../CreateLocalRepo/InputParameters.html>`_.
+* If an external NFS share is used, make sure that ``/etc/exports`` on the NFS server is populated with the same paths listed as ``server_share_path`` in the ``nfs_client_params`` in ``input/storage_config.yml``.
+* Omnia supports all NFS mount options. Without user input, the default mount options are ``nosuid,rw,sync,hard,intr``.
+
+
+**Executing the playbook**
+
+Execute the ``storage.yml`` playbook: ::
+
+    cd storage
+    ansible-playbook storage.yml -i inventory
+
+Use the linked `inventory file <../../../../samplefiles.html#inventory-file>`_ for the above playbook.
+
+.. note:: To run the ``storage.yml`` playbook independently from the ``omnia.yml`` playbook on Intel Gaudi nodes, start by executing the ``performance_profile.yml`` playbook. Once that’s done, you can run the ``storage.yml`` playbook separately.
+
+Post configuration, enable the following services (using this command: ``firewall-cmd --permanent --add-service=<service name>``) and then reload the firewall (using this command: ``firewall-cmd --reload``).
+
+    - nfs
+
+    - rpc-bind
+
+    - mountd
+
+.. caution::
+    * After an NFS client is configured, if the NFS server is rebooted, the client may not be able to reach the server. In those cases, restart the NFS services on the server using the below command:
+
+      ::
+
+        systemctl disable nfs-server
+        systemctl enable nfs-server
+        systemctl restart nfs-server
+
+    * When ``nfs_server`` is false, enable the following services after configuration (using this command: ``firewall-cmd --permanent --add-service=<service name>``) and then reload the firewall (using this command: ``firewall-cmd --reload``).
+
+        - nfs
+
+        - rpc-bind
+
+        - mountd
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/index.rst
new file mode 100644
index 000000000..9020a0418
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/Storage/index.rst
@@ -0,0 +1,7 @@
+Storage configurations
+========================
+
+.. toctree::
+
+   NFS
+   BeeGFS
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/index.rst
new file mode 100644
index 000000000..4f72ac5b2
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/index.rst
@@ -0,0 +1,13 @@
+Building an Omnia Cluster
+===========================
+
+.. toctree::
+   :maxdepth: 2
+
+   AMD_ROCm
+   Authentication
+   Storage/index
+   install_kubernetes
+   installscheduler
+
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/install_kubernetes.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/install_kubernetes.rst
new file mode 100644
index 000000000..a59f180fb
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/install_kubernetes.rst
@@ -0,0 +1,122 @@
+Set up Kubernetes
+==================
+
+**Prerequisites**
+
+* Ensure that the ``k8s`` entry is present in the ``softwares`` list in ``software_config.json``, as mentioned below:
+  ::
+
+    "softwares": [
+        {"name": "k8s", "version":"1.29.5"},
+    ]
+
+* Ensure to run ``local_repo.yml`` with the ``k8s`` entry present in ``software_config.json``, to download all required Kubernetes packages and images.
+
+* Once all the required parameters in `omnia_config.yml <../schedulerinputparams.html#id12>`_ are filled in, ``omnia.yml`` can be used to set up Kubernetes.
+
+* Ensure that ``k8s_share`` is set to ``true`` in `storage_config.yml <../schedulerinputparams.html#storage-config-yml>`_ for one of the entries in ``nfs_client_params``.
+
+**Inventory details**
+
+* For Kubernetes, the applicable inventory groups are ``kube_control_plane``, ``kube_node``, and ``etcd``.
+
+* The inventory file must contain:
+
+    1. Exactly 1 ``kube_control_plane``.
+    2. At least 1 ``kube_node``.
+    3. An odd number of ``etcd`` nodes.
+
+.. note:: Ensure that the inventory includes an ``[etcd]`` node. etcd is a consistent and highly-available key value store used as Kubernetes' backing store for all cluster data. For more information, `click here. `_
+
+**Sample inventory**
+::
+
+    [kube_control_plane]
+
+    10.5.1.101
+
+    [kube_node]
+
+    10.5.1.102
+
+    [etcd]
+
+    10.5.1.101
+
+
+**To install Kubernetes**
+
+Run either of the following commands:
+
+    1. ::
+
+        ansible-playbook omnia.yml -i inventory
+
+    2. ::
+
+        ansible-playbook scheduler.yml -i inventory
+
+.. note::
+
+    * To run the ``scheduler.yml`` playbook independently from the ``omnia.yml`` playbook on Intel Gaudi nodes, start by executing the ``performance_profile.yml`` playbook. Once that’s done, you can run the ``scheduler.yml`` playbook separately.
+    * To add new nodes to an existing cluster, click `here. <../../../Maintenance/addnode.html>`_
+
+**Additional installations**
+
+Omnia installs the following packages on top of the Kubernetes stack:
+
+1. *amdgpu-device-plugin (ROCm device plugin)*
+
+   This is a Kubernetes device plugin implementation that enables the registration of AMD GPUs in a container cluster for compute workloads.
+   Click `here `_ for more information.
+
+2. *mpi-operator*
+
+   The MPI Operator makes it easy to run allreduce-style distributed training on Kubernetes.
+   Click `here `_ for more information.
+
+3. *xilinx device plugin*
+
+   The Xilinx FPGA device plugin for Kubernetes is a Daemonset deployed on the Kubernetes (k8s) cluster which allows you to:
+
+   i. Discover the FPGAs inserted in each node of the cluster and expose information about the FPGAs, such as the number of FPGAs and the Shell (Target Platform) type.
+
+   ii. Run FPGA-accessible containers in the k8s cluster
+
+   Click `here `_ for more information.
+
+4. *nfs-client-provisioner*
+
+   * NFS subdir external provisioner is an automatic provisioner that uses your existing and already configured NFS server to support dynamic provisioning of Kubernetes Persistent Volumes via Persistent Volume Claims.
+   * The NFS server utilised here is the one mentioned in ``storage_config.yml``.
+   * The server IP is the ``server_ip`` and the path is the ``server_share_path`` of the entry where ``k8s_share`` is set to ``true``.
+
+   Click `here `_ for more information.
+
+5. *nvidia-device-plugin*
+
+   For the NVIDIA device plugin to function seamlessly, Omnia installs the "nvidia-container-toolkit" as part of the ``omnia.yml`` or ``scheduler.yml`` playbook execution. The NVIDIA device plugin for Kubernetes is a "DaemonSet" that allows you to automatically:
+
+   i. Expose the number of GPUs on each node of your cluster
+   ii. Keep track of the health of your GPUs
+   iii. Run GPU enabled containers in your Kubernetes cluster
+
+   Click `here `_ for more information.
+
+6. *gaudi-device-plugin*
+
+   The Gaudi device plugin is a Kubernetes device plugin implementation that enables the registration of Intel Gaudi AI accelerators in a container cluster. This plugin enables the efficient utilization of Gaudi accelerators for compute workloads within the cluster.
+   For the gaudi-device-plugin to function seamlessly, Omnia installs the “habanalabs-container-runtime” as part of the ``omnia.yml`` or ``scheduler.yml`` playbook execution.
+
+   The Gaudi device plugin for Kubernetes is a “DaemonSet” that allows you to automatically:
+
+   i. Enable the registration of Intel Gaudi accelerators in your Kubernetes cluster.
+   ii. Keep track of device health.
+   iii. Run jobs on the Intel Gaudi accelerators.
+
+   Click `here `_ for more information.
+
+**Optional installation**
+
+* `Kubernetes device plugin for the RoCE NIC <../../AdvancedConfigurationsUbuntu/k8s_plugin_roce_nic.html>`_
+* `PowerScale CSI drivers <../../AdvancedConfigurationsUbuntu/PowerScale_CSI.html>`_
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/installscheduler.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/installscheduler.rst
new file mode 100644
index 000000000..08f866012
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/BuildingCluster/installscheduler.rst
@@ -0,0 +1,37 @@
+Cluster formation
+====================
+
+1. In the ``input/omnia_config.yml``, ``input/security_config.yml``, and ``input/storage_config.yml`` files, provide the `required details <../schedulerinputparams.html>`_. For ``input/telemetry_config.yml``, the details can be found `here <../../../../Telemetry/index.html#id13>`_.
+
+2. Create an inventory file in the *omnia* folder. Check out the `sample inventory <../../../samplefiles.html>`_ for more information. If a hostname is used to refer to the target nodes, ensure that the domain name is included in the entry. IP addresses are also accepted in the inventory file.
+
+.. include:: ../../Appendices/hostnamereqs.rst
+
+.. note:: Omnia creates a log file which is available at: ``/var/log/omnia.log``.
+
+3. ``omnia.yml`` is a wrapper playbook and achieves the following tasks:
+
+    i. ``security.yml``: This playbook sets up centralized authentication (OpenLDAP) on the cluster. For more information, `click here. `_
+    ii. ``storage.yml``: This playbook sets up storage tools such as `NFS `_.
+    iii. ``scheduler.yml``: This playbook sets up the `Kubernetes `_ job scheduler on the cluster.
+    iv. ``telemetry.yml``: This playbook sets up `Omnia telemetry and/or iDRAC telemetry <../../../../Telemetry/index.html>`_. It also installs `Grafana `_ and `Loki `_ as Kubernetes pods.
+    v. ``rocm_installation.yml``: This playbook sets up the `ROCm platform for AMD GPU accelerators `_.
+    vi. ``performance_profile.yml``: This playbook is located in the ``utils/performance_profile`` directory and enables you to optimize system performance for specific workloads. For more information, see `Performance profile configuration <../../../../Utils/tuneD.html>`_.
+
+.. note:: To run the ``scheduler.yml``, ``security.yml``, ``telemetry.yml``, or ``storage.yml`` playbook independently from the ``omnia.yml`` playbook on Intel Gaudi nodes, start by executing the ``performance_profile.yml`` playbook. Once that’s done, you can run the respective playbooks separately.
+
+To run ``omnia.yml``: ::
+
+    ansible-playbook omnia.yml -i inventory
+
+.. note::
+    * If you want to view or edit the ``omnia_config.yml`` file, run the following command:
+
+        - ``ansible-vault view omnia_config.yml --vault-password-file .omnia_vault_key`` -- To view the file.
+
+        - ``ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key`` -- To edit the file.
+
+    * Use the ansible-vault view or edit commands and not the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permissions to the parameter files.
+
+4. Once the ``omnia.yml`` playbook is successfully executed, the cluster is up and running with the required application stack. Now, you can install `AI tools <../../InstallAITools/index.html>`_ or utilize the cluster for job execution.
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/OpenLDAP_proxy.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/OpenLDAP_proxy.rst
new file mode 100644
index 000000000..c55d8bafe
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/OpenLDAP_proxy.rst
@@ -0,0 +1,170 @@
+Setting up OpenLDAP as a proxy server
+=======================================
+
+Omnia allows the internal OpenLDAP server to be configured as a proxy, where it utilizes the external LDAP servers as a backend database to store user data and acts as an authentication entity to allow or deny users access to the cluster. The OpenLDAP client is configured through the proxy server, which means that there is no direct communication between the OpenLDAP client and the external LDAP server.
+
+.. note:: If the OpenLDAP server is set up as a proxy, the user database is not replicated onto the server.
+
+Perform the following steps to configure OpenLDAP as a proxy server:
+
+1. Before proceeding with the new configuration, remove the existing LDAP configurations by deleting the ``/usr/local/openldap/etc/openldap/slapd.d/`` folder, and then create another directory with the same folder hierarchy using the ``mkdir`` command. Execute the following commands to perform these operations: ::
+
+    rm -rf /usr/local/openldap/etc/openldap/slapd.d/
+    mkdir /usr/local/openldap/etc/openldap/slapd.d/
+
+2. Now, locate the ``slapd.conf`` config file present in ``/usr/local/openldap/etc/openldap/`` and modify the file to add the new LDAP configurations. Add the following lines to the config file based on the operating system running on the cluster:
+
+    For RHEL/Rocky Linux: ::
+
+        include /usr/local/openldap/etc/openldap/schema/core.schema
+        include /usr/local/openldap/etc/openldap/schema/cosine.schema
+        include /usr/local/openldap/etc/openldap/schema/nis.schema
+        include /usr/local/openldap/etc/openldap/schema/inetorgperson.schema
+
+
+        pidfile /usr/local/openldap/var/run/slapd.pid
+        argsfile /usr/local/openldap/var/run/slapd.args
+
+        # Load dynamic backend modules:
+        modulepath /usr/local/openldap/libexec/openldap
+        moduleload back_ldap.la
+        moduleload back_meta.la
+
+        #######################################################################
+        # Meta database definitions
+        #######################################################################
+        database meta
+        suffix "dc=phantom,dc=test"
+        rootdn cn=admin,dc=phantom,dc=test
+        rootpw Dell1234
+
+        uri "ldap://10.5.0.104:389/dc=phantom,dc=test"
+        suffixmassage "dc=phantom,dc=test" "dc=perf,dc=test"
+        idassert-bind
+        bindmethod=simple
+        binddn="cn=admin,dc=perf,dc=test"
+        credentials="Dell1234"
+        flags=override
+        mode=none
+        TLSCACertificateFile /etc/openldap/certs/ldapserver.crt
+        TLSCertificateFile /etc/openldap/certs/ldapserver.crt
+        TLSCertificateKeyFile /etc/pki/tls/certs/ldapserver.key
+
+    For Ubuntu: ::
+
+        include /usr/local/openldap/etc/openldap/schema/core.schema
+        include /usr/local/openldap/etc/openldap/schema/cosine.schema
+        include /usr/local/openldap/etc/openldap/schema/nis.schema
+        include /usr/local/openldap/etc/openldap/schema/inetorgperson.schema
+
+
+        pidfile /usr/local/openldap/var/run/slapd.pid
+        argsfile /usr/local/openldap/var/run/slapd.args
+
+        # Load dynamic backend modules:
+        modulepath /usr/local/openldap/libexec/openldap
+        moduleload back_ldap.la
+        moduleload back_meta.la
+
+        #######################################################################
+        # Meta database definitions
+        #######################################################################
+        database meta
+        suffix "dc=phantom,dc=test"
+        rootdn cn=admin,dc=phantom,dc=test
+        rootpw Dell1234
+
+        uri "ldap://10.5.0.104:389/dc=phantom,dc=test"
+        suffixmassage "dc=phantom,dc=test" "dc=perf,dc=test"
+        idassert-bind
+        bindmethod=simple
+        binddn="cn=admin,dc=perf,dc=test"
+        credentials="Dell1234"
+        flags=override
+        mode=none
+        TLSCACertificateFile /etc/ssl/certs/ca-certificates.crt
+        TLSCertificateFile /etc/ssl/certs/ssl-cert-snakeoil.pem
+        TLSCertificateKeyFile /etc/ssl/private/ssl-cert-snakeoil.key
+
+Change the placeholder values in the config file, as described below:
+
+* **database**: Database used in the ``slapd.conf`` file, that captures the details of the external LDAP server to be used. For example, ``meta``.
+* **suffix**: Captures the domain name of the internal OpenLDAP user, to refine the user search while attempting to authenticate the user. For example, ``"dc=omnia,dc=test"``.
+* **rootdn**: Admin or root username of the internal OpenLDAP server set up by Omnia. For example, ``cn=admin,dc=omnia,dc=test``.
+* **rootpw**: Admin password for the internal OpenLDAP server. For example, ``Dell1234``.
+
+* **uri**: Captures the IP of the external LDAP server along with the port and the domain of the user, in the ``"ldap://<IP>:<port>/<domain>"`` format. For example, ``"ldap://10.5.0.104:389/dc=omnia,dc=test"``.
+* **suffixmassage**: ``suffixmassage`` allows you to dynamically move the LDAP client information from the existing internal OpenLDAP server to the external LDAP server that you want to configure as a proxy. This is provided in the ``suffixmassage <internal_suffix> <external_suffix>`` format.
+
+    * ``<internal_suffix>`` is the internal OpenLDAP server suffix (base DN).
+    * ``<external_suffix>`` is the external LDAP server suffix (base DN).
+
+* **binddn**: Admin username and domain of the external LDAP server.
+* **credentials**: Admin password for the external LDAP server.
+
+* **TLSCACertificateFile**: Omnia, by default, creates the TLS CA certificate in ``/etc/openldap/certs/ldapserver.crt`` for RHEL/Rocky Linux or in ``/etc/ssl/certs/ca-certificates.crt`` for Ubuntu.
+* **TLSCertificateFile**: Omnia, by default, creates the TLS certificate in ``/etc/openldap/certs/ldapserver.crt`` for RHEL/Rocky Linux or in ``/etc/ssl/certs/ssl-cert-snakeoil.pem`` for Ubuntu.
+* **TLSCertificateKeyFile**: Omnia, by default, creates the certificate key file in ``/etc/pki/tls/certs/ldapserver.key`` for RHEL/Rocky Linux or in ``/etc/ssl/private/ssl-cert-snakeoil.key`` for Ubuntu.
+
+.. note::
+    * The values of the ``suffix`` and ``rootdn`` parameters in the ``slapd.conf`` file must be the same as those provided in the ``input/security_config.yml`` file.
+
+    * If you have your own set of TLS certificates and keys that you want to utilize instead of the default ones created by Omnia, then you can provide the path to them in the ``input/security_config.yml`` file. During ``omnia.yml`` execution, the user-provided certificates and key files are copied from the OIM to the ``auth_server`` (OpenLDAP). An example of the certificate and key entries in the ``input/security_config.yml`` file for the proxy OpenLDAP server is provided below: ::
+
+        # Certificate Authority(CA) issued certificate file path
+        tls_ca_certificate: "/root/certificates/omnia_ca_cert.crt"
+        # OpenLDAP Certificate file path
+        tls_certificate: "/root/certificates/omnia_cert.pem"
+        # OpenLDAP Certificate key file path
+        tls_certificate_key: "/root/certificates/omnia_cert_key.key"
+
+      Use the same certificates and keys in the ``slapd.conf`` file, as shown below:
+
+      Ubuntu: ::
+
+        TLSCACertificateFile /etc/ssl/certs/omnia_ca_cert.crt
+        TLSCertificateFile /etc/ssl/certs/omnia_cert.pem
+        TLSCertificateKeyFile /etc/ssl/private/omnia_cert_key.key
+
+      RHEL/Rocky Linux: ::
+
+        TLSCACertificateFile /etc/pki/tls/certs/omnia_ca_cert.crt
+        TLSCertificateFile /etc/pki/tls/certs/omnia_cert.pem
+        TLSCertificateKeyFile /etc/pki/tls/certs/omnia_cert_key.key
+
+    * Multiple external LDAP servers can also be configured on the proxy server. The OpenLDAP proxy server allows users from multiple external LDAP servers to authenticate onto the cluster.
+      You can provide two sets of external LDAP server details as shown below: ::
+
+        uri "ldap://10.5.0.104:389/dc=omnia1,dc=test"
+        idassert-bind
+        bindmethod=simple
+        binddn="cn=admin,dc=omnia,dc=test"
+        credentials="Dell1234"
+        flags=override
+        mode=none
+
+        uri "ldap://10.5.0.105:389/dc=omnia2,dc=test"
+        idassert-bind
+        bindmethod=simple
+        binddn="cn=admin,dc=omnia,dc=test"
+        credentials="Dell12345"
+        flags=override
+        mode=none
+
+3. Once the new configurations are present in the ``slapd.conf`` file, execute the following OpenLDAP server "slaptest" command to apply the configurations: ::
+
+    slaptest -f /usr/local/openldap/etc/openldap/slapd.conf -F /usr/local/openldap/etc/openldap/slapd.d
+
+
+4. Change the ownership of the schema directory to the ``ldap`` user and set the necessary file permissions. Execute the following commands to do so: ::
+
+    chown -R ldap:ldap /usr/local/openldap/etc/openldap/slapd.d/
+    chown root:ldap /usr/local/openldap/etc/openldap/slapd.d/
+    chmod -R 754 /usr/local/openldap/etc/openldap/slapd.d/
+    chmod 770 /usr/local/openldap/etc/openldap/slapd.d/
+
+5. Restart the internal OpenLDAP server to apply the configurations. Execute the following command to restart the server: ::
+
+    systemctl restart slapd-ltb.service
+
+
+Once these configurations are applied on the internal OpenLDAP server, the external LDAP server acts as the authentication server for the cluster. The internal OpenLDAP server doesn't store any user data, and no users can be created or modified from it.
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/ReplicatingLDAP.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/ReplicatingLDAP.rst
new file mode 100644
index 000000000..36a2e33e1
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/ReplicatingLDAP.rst
@@ -0,0 +1,57 @@
+How to replicate OpenLDAP server
+---------------------------------
+.. note:: This is a manual and optional configuration that the user can perform.
+
+Once Omnia has set up an OpenLDAP server for the cluster, external LDAP servers can be replicated onto the cluster OpenLDAP server using the following steps.
+
+**[Optional] Create a replication user**
+
+1. Create an LDIF file (e.g., ``replication_user.ldif``) on the external LDAP server (source) containing the following information:
+
+    * DN: The distinguished name that indicates where the user will be created.
+    * objectClass: The object class specifies the mandatory and optional attributes that can be associated with an entry of that class. Here, the values are ``simpleSecurityObject``, ``account``, and ``shadowAccount``.
+    * UID: The username of the replication user.
+    * Description: A user-defined string describing the account.
+    * UserPassword: The SHA-encrypted value of the intended user password. This can be obtained using ``slappasswd``.
+
+.. note:: In case of external LDAP server replication, ensure that the ``homeDirectory`` is always set to ``/home``.
+
+Below is a sample file: ::
+
+    dn: uid=replicauser,dc=orchid,dc=cluster
+    objectClass: simpleSecurityObject
+    objectclass: account
+    objectClass: shadowAccount
+    uid: replicauser
+    description: Replication User
+    userPassword: {SSHA}BL5xdrUvHQ8GPvdvHhO/4OmKHYoXQlIK
+
+2. Run the command ``ldapadd -D <bind_dn> -w <bind_password> -f replication_user.ldif`` to execute the LDIF file and create the account.
+
+**Initiate the replication**
+
+1. Create an LDIF file (e.g., ``Replication.ldif``) on the auth server on the cluster (destination) containing the following information:
+
+    * Provider: The IP address of the source LDAP server. The connection is routed over the LDAP protocol via port 389.
+    * binddn: The distinguished name of the dedicated replication user or admin user being used to authenticate the replication.
+    * credentials: The corresponding password of the user indicated in ``binddn``.
+    * searchbase: The groups of users to be replicated.
+
+Below is a sample file: ::
+
+    dn: olcDatabase={1}mdb,cn=config
+    changetype: modify
+    add: olcSyncRepl
+    olcSyncRepl: rid=001
+      provider=ldap://xx.xx.xx.xx:389/
+      bindmethod=simple
+      binddn="uid=replicauser,dc=orchid,dc=cluster"
+      credentials=sync1234
+      searchbase="dc=orchid,dc=cluster"
+      scope=sub
+      schemachecking=on
+      type=refreshAndPersist
+      retry="30 5 300 3"
+      interval=00:00:05:00
+
+2. Run the command ``ldapadd -D cn=<admin user>,cn=config -w <config_password> -f Replication.ldif`` to execute the LDIF file and initiate the replication.
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/index.rst
new file mode 100644
index 000000000..04e4f756d
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/index.rst
@@ -0,0 +1,22 @@
+Step 5: Configure the cluster
+================================
+
+**Features enabled by omnia.yml**:
+
+    * **Centralized authentication**: Once all the required parameters in `security_config.yml `_ are filled in, ``omnia.yml`` can be used to set up OpenLDAP.
+
+    * **Kubernetes**: Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up Kubernetes.
+
+    * **Login Node (Additionally secure login node)**
+
+.. caution:: If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+
+.. toctree::
+   :maxdepth: 2
+
+   schedulerprereqs
+   schedulerinputparams
+   BuildingCluster/index
+
+
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/schedulerinputparams.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/schedulerinputparams.rst
new file mode 100644
index 000000000..83d66a37a
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/schedulerinputparams.rst
@@ -0,0 +1,53 @@
+Input parameters for the cluster
+===================================
+
+These parameters are located in ``input/omnia_config.yml``, ``input/security_config.yml``, and ``input/storage_config.yml``. To initiate telemetry support, fill out `these parameters <../../../Telemetry/index.html#id13>`_ in ``input/telemetry_config.yml``.
+
+.. caution:: Do not remove or comment any lines in the ``input/omnia_config.yml``, ``input/security_config.yml``, ``input/telemetry_config.yml``, and ``input/storage_config.yml`` files.
+
+omnia_config.yml
+-------------------
+
+.. csv-table:: Parameters for Kubernetes setup
+   :file: ../../../Tables/scheduler_k8s_ubuntu.csv
+   :header-rows: 1
+   :keepspace:
+
+
+security_config.yml
+---------------------
+
+.. csv-table:: Parameters for Authentication
+   :file: ../../../Tables/security_config.csv
+   :header-rows: 1
+   :keepspace:
+
+.. csv-table:: Parameters for OpenLDAP configuration
+   :file: ../../../Tables/security_config_ldap.csv
+   :header-rows: 1
+   :keepspace:
+
+
+storage_config.yml
+--------------------
+
+.. csv-table:: Parameters for Storage
+   :file: ../../../Tables/storage_config.csv
+   :header-rows: 1
+   :keepspace:
+
+
+Click here for more information on `OpenLDAP `_, `BeeGFS `_, or `NFS `_.
+
+.. note::
+
+    * The ``input/omnia_config.yml`` and ``input/security_config.yml`` files are encrypted during the execution of the ``omnia.yml`` playbook. Use the below commands to edit the encrypted input files:
+
+        * ``omnia_config.yml``: ::
+
+            ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key
+
+        * ``security_config.yml``: ::
+
+            ansible-vault edit security_config.yml --vault-password-file .security_vault.key
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/schedulerprereqs.rst b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/schedulerprereqs.rst
new file mode 100644
index 000000000..d77c2b871
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/OmniaCluster/schedulerprereqs.rst
@@ -0,0 +1,30 @@
+Before you build clusters
+--------------------------
+
+* `Ensure that all cluster nodes are up and running <../Provision/ViewingDB.html>`_.
+
+* Verify that the inventory file is updated as mentioned in the `inventory sample file <../../samplefiles.html>`_.
+
+    * For Kubernetes, all the applicable groups are ``kube_control_plane``, ``kube_node``, and ``etcd``.
+    * The centralized authentication server inventory group, that is ``auth_server``, is common for both Slurm and Kubernetes.
+    * For secure login node functionality, ensure to add the ``login`` group in the provided inventory file.
+
+
+* Verify that all nodes are assigned a group. The inventory file is case-sensitive. Follow the format provided in the `sample file link <../../samplefiles.html>`_.
+
+.. note::
+    * The inventory file accepts both IPs and FQDNs as long as they can be resolved by DNS.
+    * In a multi-node setup, an IP cannot be listed as a control node and a compute node simultaneously. That is, don't include the ``kube_control_plane`` IP address in the compute node group. In a single node setup, the compute node and the ``kube_control_plane`` must be the same.
+
+* Users should also ensure that all repositories are available on the cluster nodes.
+
+* If the cluster requires more than 10 Kubernetes nodes, use a Docker enterprise account to avoid Docker pull limits.
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Prereq.sh/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/Prereq.sh/index.rst
new file mode 100644
index 000000000..da60eebac
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Prereq.sh/index.rst
@@ -0,0 +1,39 @@
+Step 1: Execute prereq.sh
+===========================
+
+Starting from version 1.7, Omnia is executed within a Python virtual environment. To set up this environment, the ``prereq.sh`` script is utilized. This script installs Python 3.11, creates the Python virtual environment, and installs Ansible 9.5.1 along with other software packages required by Omnia on the OIM. The predefined path for this virtual environment is ``/opt/omnia/omnia17_venv``. This approach ensures that Omnia has the correct dependencies and runs smoothly within a controlled and isolated environment.
+
+.. caution::
+
+    * To run Omnia, it is crucial to use the Python virtual environment created by the ``prereq.sh`` script. Do not delete the virtual environment directory (``/opt/omnia/omnia17_venv/``) as it is necessary for the proper functioning of Omnia.
+    * If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+    * Ensure to execute the Omnia playbooks from inside the git cloned Omnia repository folder. Executing the playbooks from outside leads to playbook execution failures.
+
+
+* Use the following command to execute the ``prereq.sh`` script on the OIM: ::
+
+    cd omnia
+    ./prereq.sh
+
+* To activate the virtual environment, use the following command: ::
+
+    source /opt/omnia/omnia17_venv/bin/activate
+
+* To verify that the virtual environment is active, check if the following prompt is displayed: ::
+
+    (omnia) [root@<hostname> omnia]#
+
+.. note::
+    * Omnia recommends disabling SELinux before proceeding with the installation. If SELinux is not disabled, it will be disabled by the script, and you will be prompted to reboot the OIM.
+    * The file ``input/software_config.json`` is overwritten with the default values (based on the operating system) when ``prereq.sh`` is executed.
+
+
+.. note::
+
+    If you want to deactivate the virtual environment set up by the ``prereq.sh`` script, use the following command from within the activated virtual environment: ::
+
+        deactivate
+
+.. caution:: If you want to delete and recreate the Omnia-created virtual environment, ensure to back up the pip packages before doing so. To back up the packages, run the ``pip freeze >> omnia_venv_pip_reqs.txt`` command from within the activated virtual environment. This command creates a backup file called ``omnia_venv_pip_reqs.txt`` in the current directory. After you have recreated the virtual environment using the ``prereq.sh`` script, restore the pip packages from the activated virtual environment using the ``pip install -r omnia_venv_pip_reqs.txt`` command.
+
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/bmc.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/bmc.rst
new file mode 100644
index 000000000..37dce5aee
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/bmc.rst
@@ -0,0 +1,43 @@
+BMC
+---
+
+For automatic discovery and provisioning of servers, the BMC method can be used.
+
+**Prerequisites**
+
+* Set the IP address of the OIM. The OIM NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid setup. In the case of a dedicated network topology, a single IP (admin IP) is required.
+
+.. image:: ../../../../images/ControlPlaneNic.png
+
+* To assign IPs on the BMC network while discovering servers using BMC details, target servers should be in DHCP mode or switch details should be provided.
+
+* BMC credentials should be the same across all servers and provided as input to Omnia in the parameters explained below.
+
+* Target servers should be configured to boot in PXE mode with the appropriate NIC as the first boot device.
+
+* If the ``discovery_ranges`` provided are outside the ``bmc_subnet``, ensure the target nodes can reach the OIM.
+
+* IPMI over LAN needs to be enabled for the BMC. ::
+
+    racadm set iDRAC.IPMILan.Enable 1
+    racadm get iDRAC.IPMILan
+
+
+.. caution:: If you are re-provisioning your cluster (that is, re-running the ``discovery_provision.yml`` playbook) after a `clean-up <../../../Maintenance/cleanup.html>`_, ensure to use a different ``static_range`` against ``bmc_network`` in ``input/network_spec.yml`` to avoid a conflict with newly assigned servers. Alternatively, disable any OS available in the ``Boot Option Enable/Disable`` section of your BIOS settings (**BIOS Settings > Boot Settings > UEFI Boot Settings**) on all target nodes.
+
+* All target servers should be reachable from the ``admin_network`` specified in ``input/network_spec.yml``.
+
+* BMC network details should be provided in the ``input/network_spec.yml`` file.
+
+A few things to keep in mind while entering details in ``input/network_spec.yml``:
+
+    * Ensure that the netmask bits for the BMC network and the admin network are the same.
+
+    * The static and dynamic ranges for the BMC network accept multiple comma-separated ranges.
+
+    * The network gateways on both admin and BMC networks are optional.
+
+.. note:: If the value of ``enable_switch_based`` is set to true, nodes will not be discovered via BMC irrespective of the contents in ``input/network_spec.yml``.
+
+Next step:
+
+* `Provisioning the cluster <../installprovisiontool.html>`_
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/index.rst
new file mode 100644
index 000000000..63456bf61
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/index.rst
@@ -0,0 +1,80 @@
+Discovery Mechanisms
+=====================
+
+Depending on the values provided in ``input/provision_config.yml``, target nodes can be discovered in one of three ways:
+
+.. toctree::
+   :hidden:
+
+   switch-based
+   mappingfile
+   bmc
+
+
+switch_based
+------------
+
+Omnia can query known switches (by SNMPv3 username/password) for information on target node MAC IDs.
+
++----------------------------------------------------------+-------------------------------------------------------+
+| Pros                                                     | Cons                                                  |
++==========================================================+=======================================================+
+| The entire discovery process is fully automatic.         | Users need to enable IPMI on target servers.         |
++----------------------------------------------------------+-------------------------------------------------------+
+| Admin IP, BMC IP and Infiniband IP address configuration | Servers require a manual PXE boot after the first run |
+| is automatic on the target nodes.                        | of the provision tool.                                |
++----------------------------------------------------------+-------------------------------------------------------+
+| Re-provisioning of servers will be automatic.            |                                                       |
++----------------------------------------------------------+-------------------------------------------------------+
+| PXE booting servers is supported via split ports on the  |                                                       |
+| switch.                                                  |                                                       |
++----------------------------------------------------------+-------------------------------------------------------+
+
+For more information regarding switch-based discovery, `click here `_
+
+mapping
+--------
+
+Manually collect PXE NIC information for target servers and manually define them to Omnia using a mapping file in the below format:
+
+**pxe_mapping_file.csv**
+::
+
+    SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_IP
+    XXXXXXXX,n1,xx:yy:zz:aa:bb:cc,10.5.0.101,10.3.0.101
+    XXXXXXXX,n2,aa:bb:cc:dd:ee:ff,10.5.0.102,10.3.0.102
+
++----------------------------------------------------------+-------------------------------------------------------+
+| Pros                                                     | Cons                                                  |
++==========================================================+=======================================================+
+| Easily customizable if the user maintains a list of      | The user needs to be aware of the MAC/IP mapping      |
+| MAC addresses.                                           | required in the network.                              |
++----------------------------------------------------------+-------------------------------------------------------+
+|                                                          | Servers require a manual PXE boot if iDRAC IPs are    |
+|                                                          | not configured.                                       |
++----------------------------------------------------------+-------------------------------------------------------+
+
+For more information regarding mapping files, `click here `_
+
+bmc
+----
+
+Omnia can also discover nodes via their iDRAC using IPMI.
+
++----------------------------------------------------------+-------------------------------------------------------+
+| Pros                                                     | Cons                                                  |
++==========================================================+=======================================================+
+| Discovery and provisioning of servers is automatic.      | For iDRACs that are not DHCP enabled (i.e., Static),  |
+|                                                          | users need to enable IPMI manually.                   |
++----------------------------------------------------------+-------------------------------------------------------+
+| Admin, BMC and Infiniband IP address configuration is    | Servers require a manual PXE boot after the first run |
+| automatic on the OIM.                                    | of the provision tool.                                |
++----------------------------------------------------------+-------------------------------------------------------+
+| LOM architecture is supported                            |                                                       |
+| (including cloud enclosures: C6420, C6520, C6620).       |                                                       |
++----------------------------------------------------------+-------------------------------------------------------+
+
+For more information regarding BMC, `click here `_
+
+
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/mappingfile.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/mappingfile.rst
new file mode 100644
index 000000000..ac702912d
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/mappingfile.rst
@@ -0,0 +1,28 @@
+mapping
+--------------
+Manually collect PXE NIC information for target servers and define them to Omnia (using the ``pxe_mapping_file`` variable in ``input/provision_config.yml``) using a mapping file in the below format:
+
+**pxe_mapping_file.csv**
+
+
+::
+
+    SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_IP
+    XXXXXXXX,n1,xx:yy:zz:aa:bb:cc,10.5.0.101,10.3.0.101
+    XXXXXXXX,n2,aa:bb:cc:dd:ee:ff,10.5.0.102,10.3.0.102
+
+.. note::
+    * The header fields mentioned above are case sensitive.
+    * The service tags provided are not validated. Ensure the correct service tags are provided.
+    * The hostnames provided should not contain the domain name of the nodes.
+    * All fields mentioned in the mapping file are mandatory except ``bmc_ip``.
+    * The MAC address provided in ``pxe_mapping_file.csv`` should refer to the PXE NIC on the target nodes.
+    * If the field ``bmc_ip`` is not populated, manually set the nodes to PXE mode and start provisioning. If the fields are populated and IPMI is enabled, Omnia will take care of provisioning automatically.
+    * Target servers should be configured to boot in PXE mode with the appropriate NIC as the first boot device.
+    * To assign IPs on the BMC network while discovering servers using a mapping file, target servers should be in DHCP mode or switch details should be provided.
+
+.. caution:: If incorrect details are provided in the mapping file and the same is passed on to the Omnia DB (this takes place when ``discovery.yml`` or ``discovery_provision.yml`` is run), delete the nodes with incorrect information using the `linked script <../../../Maintenance/deletenode.html>`_. After deletion, provide correct details in the mapping file and re-run ``discovery_provision.yml`` or ``discovery/discovery.yml``. If the ``bmc_ip`` alone is incorrect, manually PXE boot the target server to update the database.
+
+Next step:
+
+* `Provisioning the cluster <../installprovisiontool.html>`_
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/switch-based.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/switch-based.rst
new file mode 100644
index 000000000..ca6928e70
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/switch-based.rst
@@ -0,0 +1,57 @@
+switch_based
+-------------
+
+**Prerequisites**
+
+* Set the value of ``enable_switch_based`` to true in ``input/provision_config.yml``. Additionally, ensure that the variable ``switch_based_details`` in ``input/provision_config.yml`` is populated with the IP address and port details of the ToR switch.
+
+* The switch port range to which all BMC NICs are connected should be provided.
+
+* BMC credentials should be the same across all servers and provided as input to Omnia. All BMC network details should be provided in ``input/network_spec.yml``.
+
+* SNMP v3 should be enabled on the switch and the credentials should be provided in ``input/provision_config_credentials.yml``.
+
+* Non-admin user credentials for the switch need to be provided.
+
+.. note::
+    * To create an SNMPv3 user on S series switches (running OS10), use the following commands:
+
+        - To create an SNMP view: ``snmp-server view test_view internet included``
+        - To create an SNMP group: ``snmp-server group testgroup 3 auth read test_view``
+        - To create SNMP users: ``snmp-server user authuser1 testgroup 3 auth sha authpasswd1``
+    * To verify the changes made, use the following commands:
+
+        - To view the SNMP views: ``show snmp view``
+        - To view the SNMP groups: ``show snmp group``
+        - To view the SNMP users: ``show snmp user``
+    * To save this configuration for later use, run: ``copy running-configuration startup-configuration``
+    * For more information on SNMP on S series switches, `click here `_
+    * For more information on SNMP on N series switches, `click here `_
+
+
+
+* IPMI over LAN needs to be enabled for the OIM. ::
+
+    racadm set iDRAC.IPMILan.Enable 1
+    racadm get iDRAC.IPMILan
+
+* Target servers should be configured to boot in PXE mode with the appropriate NIC as the first boot device.
+
+* Set the IP address of the OIM. The OIM NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a shared LOM or hybrid setup. In the case of a dedicated network topology, a single IP (admin IP) is required.
+
+.. image:: ../../../../images/ControlPlaneNic.png
+
+.. caution::
+    * Do not use daisy chain ports or the port used to connect to the OIM in ``switch_based_details`` in ``input/provision_config.yml``. This can cause IP conflicts on servers attached to potential target ports.
+    * Omnia does not validate SNMP switch credentials; if the provision tool is run with incorrect credentials, use the clean-up script and re-run the provision tool with the correct credentials.
+    * If you are re-provisioning your cluster (that is, re-running the ``discovery_provision.yml`` playbook) after a `clean-up <../../../Maintenance/cleanup.html>`_, ensure to use a different ``static_range`` against ``bmc_network`` in ``input/network_spec.yml`` to avoid a conflict with newly assigned servers. Alternatively, disable any OS available in the ``Boot Option Enable/Disable`` section of your BIOS settings (**BIOS Settings > Boot Settings > UEFI Boot Settings**) on all target nodes.
+
+
+.. note::
+    * If any of the target nodes have a pre-provisioned BMC IP, ensure that these IPs are not part of the ``static_range`` specified in ``input/network_spec.yml`` under the ``bmc_network`` to avoid any BMC IP conflicts.
+    * In case of a duplicate node object, duplicate BMC nodes will be deleted automatically by the **duplicate_node_cleanup** service that runs every 30 minutes. When nodes are discovered via mapping and switch details, the nodes discovered via switch details will not be deleted. Delete the node manually using the `delete node <../../../Maintenance/deletenode.html>`_ playbook.
+
+* [Optional] To clear the configuration on Omnia provisioned switches and ports, `click here <../../../../Roles/Utils/portcleanup.html>`_.
+
+Next step:
+
+* `Provisioning the cluster <../installprovisiontool.html>`_
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/ViewingDB.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/ViewingDB.rst
new file mode 100644
index 000000000..7c6fd6a3e
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/ViewingDB.rst
@@ -0,0 +1,49 @@
+Checking node status
+=====================
+
+Via CLI
+--------
+
+Run ``nodels all nodelist.status`` for a list of nodes and their statuses. Here's an example of this command's output: ::
+
+    omnia-node00001: installing
+    omnia-node00002: booted
+    omnia-node00003: powering-on
+    omnia-node00004: booted
+
+Possible values of node status are ``powering-off``, ``powering-on``, ``bmcready``, ``installing``, ``booting``, ``post-booting``, ``booted``, and ``failed``.
+
+.. caution:: Once xCAT is installed, restart your SSH session to the OIM to ensure that the newly set up environment variables come into effect. This will also allow the above command to work correctly. If the new environment variables still do not come into effect, enable them manually using:
+    ::
+
+        source /etc/profile.d/xcat.sh
+
+Via Omnia database [omniadb]
+-----------------------------
+
+1. To access the omniadb, execute: ::
+
+    psql -U postgres
+
+    \c omniadb
+
+
+2. To view the schema being used in the cluster: ``\dn``
+
+3. To view the tables in the database: ``\dt``
+
+4. To view the contents of the ``nodeinfo`` table: ``select * from cluster.nodeinfo;`` ::
+
+    id | service_tag | node | hostname | admin_mac | admin_ip | bmc_ip | status | discovery_mechanism | bmc_mode | switch_ip | switch_name | switch_port | cpu | gpu | cpu_count | gpu_count
+    ----+-------------+---------------+----------------+-------------------+--------------+------------+--------+---------------------+----------+-----------+-------------+-------------+-----+-----+-----------+----------
+    1 | | oim | newoim.new.dev | 00:0a:f7:dc:11:42 | 10.5.255.254 | 0.0.0.0 | | | | | | | | | |
+    2 | xxxxxxx | node2 | node2.new.dev | c4:cb:e1:b5:70:44 | 10.5.0.12 | 10.30.0.12 | booted | mapping | | | | | amd | | 1 | 0
+    3 | xxxxxxx | node3 | node3.new.dev | f4:02:70:b8:bc:2a | 10.5.0.10 | 10.30.0.10 | booted | mapping | | | | | amd | amd | 2 | 1
+    (3 rows)
+
+Possible values of node status are ``powering-off``, ``powering-on``, ``bmcready``, ``installing``, ``booting``, ``post-booting``, ``booted``, ``failed``, ``ping``, ``noping``, and ``standingby``.
+
+.. note::
+    * The ``gpu_count`` in the database is only updated every time a cluster node is PXE booted.
+    * Nodes listed as "failed" can be diagnosed using the ``/var/log/xcat/xcat.log`` file on the target node. Correct any underlying issues and `re-provision the node <../../Maintenance/reprovision.html>`_.
+    * Information on debugging nodes stuck at ``powering-on``, ``bmcready``, or ``installing`` for longer than expected is available `here <../../../Troubleshooting/FAQ/Common/Provision.html>`_. Correct any underlying issue on the node and `re-provision the node <../../Maintenance/reprovision.html>`_.
+    * A blank node status indicates that no attempt to provision has taken place. Attempt a manual PXE boot on the node to initiate provisioning.
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/index.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/index.rst
new file mode 100644
index 000000000..1f2100271
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/index.rst
@@ -0,0 +1,20 @@
+Step 3: Discover and provision the cluster
+===========================================
+
+The ``discovery_provision.yml`` playbook achieves the following tasks:
+
+1. Installation and configuration of the provision tool.
+2. Discovery of potential cluster nodes.
+3. Provisioning the "server install image" of the Ubuntu OS on the discovered cluster nodes.
+
+.. caution:: If you have a proxy server set up for your OIM, you must configure the proxy environment variables on the OIM before running any Omnia playbooks. For more information, `click here <../Setup_CP_proxy.html>`_.
+
+.. toctree::
+   :maxdepth: 2
+
+   provisionprereqs
+   DiscoveryMechanisms/index
+   provisionparams
+   installprovisiontool
+   ViewingDB
+
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/installprovisiontool.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/installprovisiontool.rst
new file mode 100644
index 000000000..f732cbf32
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/installprovisiontool.rst
@@ -0,0 +1,148 @@
+Provisioning the cluster
+============================
+
+Edit the ``input/provision_config.yml``, ``input/provision_config_credentials.yml``, and ``input/network_spec.yml`` files to update the required variables. A list of the variables required is available by `discovery mechanism `_.
+
+.. note:: The first PXE device on target nodes should be the designated active NIC for PXE booting.
+
+    .. image:: ../../../images/BMC_PXE_Settings.png
+
+[Optional] Additional configurations handled by the provision tool
+-----------------------------------------------------------------------
+
+**Using multiple versions of a given OS**
+
+Omnia now supports deploying different versions of the same OS. With each run of ``discovery_provision.yml``, a new deployable OS image is created with a distinct type, depending on the values provided in ``input/software_config.json``. The supported Ubuntu versions are:
+
+    * Ubuntu 20.04
+    * Ubuntu 22.04
+
+**Disk partitioning**
+
+    Omnia now allows for customization of the disk partitions applied to remote servers. The disk partition ``desired_capacity`` has to be provided in MB. Valid ``mount_point`` values accepted for disk partition are ``/var``, ``/tmp``, ``/usr``, and ``swap``. The default partition layout for RHEL/Rocky Linux is /boot: 1024MB, /boot/efi: 256MB, with the remaining space allotted to the / partition. The default partition layout for Ubuntu is /boot: 2148MB, /boot/efi: 1124MB, with the remaining space allotted to the / partition. Values are accepted in the form of a JSON list, such as:
+
+    ::
+
+        disk_partition:
+            - { mount_point: "/var", desired_capacity: "102400" }
+            - { mount_point: "swap", desired_capacity: "10240" }
+
+Running the provision tool
+----------------------------
+
+To deploy the Omnia provision tool, ensure that ``input/provision_config.yml``, ``input/network_spec.yml``, and ``input/provision_config_credentials.yml`` are updated and then execute: ::
+
+    ansible-playbook discovery_provision.yml
+
+.. note::
+
+    * If AMD ROCm, Intel Gaudi, and NVIDIA CUDA drivers are mentioned in ``input/software_config.json``, the AMD, Intel, and NVIDIA accelerator drivers are installed on the nodes post provisioning.
+    * Omnia recommends installing the Intel Gaudi driver post provisioning using the ``accelerator.yml`` playbook if the node has internet connectivity during provisioning. For more information, `click here <../AdvancedConfigurationsUbuntu/Habana_accelerator.html>`_.
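+
+For reference, here is a minimal sketch of the ``input/software_config.json`` entries that trigger accelerator driver installation (version strings taken from the sample files section of this guide; adjust them to your cluster): ::
+
+    "softwares": [
+        {"name": "amdgpu", "version": "6.2.2"},
+        {"name": "cuda", "version": "12.3.2"},
+        {"name": "intelgaudi", "version": "1.18.0-524"}
+    ]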
+Stages of the provision tool
+-----------------------------
+
+.. caution:: Always execute ``discovery_provision.yml`` within the ``omnia`` directory. That is, always change directories (using ``cd omnia``) to the path where the playbook resides before running the playbook.
+
+The provision tool, invoked by the ``discovery_provision.yml`` playbook, runs in three stages that can be called individually:
+
+**Stage 1: Preparing the OIM**
+
+    * Installs required tool packages.
+    * Verifies and updates firewall settings.
+    * Installs xCAT.
+    * Configures Omnia databases based on ``input/network_spec.yml``.
+    * Creates empty inventory files on the OIM at ``/opt/omnia/omnia_inventory/``. These inventory files are populated with compute node service tags post provisioning, based on the type of CPUs and GPUs the nodes have. The inventory files are:
+
+        * ``compute_cpu_amd``
+        * ``compute_cpu_intel``
+        * ``compute_gpu_amd``
+        * ``compute_gpu_nvidia``
+        * ``compute_gpu_intel``
+        * ``compute_hostname_ip``
+
+    .. note::
+
+        * Service tags will only be written into the inventory files after the nodes are successfully PXE booted post provisioning.
+        * For a node's service tag to list in an inventory file, two conditions must be met:
+
+            * Node status must be "booted" in DB.
+            * Node's service tag information is present in DB.
+        * Nodes are not removed from the inventory files even if they are physically disconnected. Ensure to run the `delete node playbook <../../Maintenance/deletenode.html>`_ to remove the node.
+        * To regenerate an inventory file, use the playbook ``omnia/utils/inventory_tagging.yml`` (see the example below).
+
+    To call this playbook individually, run: ::
+
+        cd prepare_oim
+        ansible-playbook prepare_oim.yml
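+
+For example, to regenerate the inventory files after provisioning, the invocation looks like this (a sketch; run from the cloned ``omnia`` directory): ::
+
+    cd utils
+    ansible-playbook inventory_tagging.yml
+    ls /opt/omnia/omnia_inventory/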
+**Stage 2: Discovering the nodes**
+
+    * Discovers all target servers.
+
+    * A PostgreSQL database is set up with all relevant cluster information, such as MAC IDs, hostnames, admin IPs, and BMC IPs.
+
+    * Configures the OIM with NTP services for cluster node synchronization.
+
+    To call this playbook individually, run: ::
+
+        cd discovery
+        ansible-playbook discovery.yml
+
+**Stage 3: Provisioning the nodes**
+
+    * The intended operating system and version is provisioned on the primary disk partition on the nodes. If a BOSS controller card is available on the target node, the operating system is provisioned on the BOSS controller disks.
+
+    To call this playbook individually, run: ::
+
+        cd provision
+        ansible-playbook provision.yml
+
+.. note::
+
+    * If you are using the ``switch_based`` discovery mechanism, you do not need to run the ``provision.yml`` playbook. Run ``prepare_oim.yml`` and ``discovery.yml``, and then manually boot the nodes in PXE mode.
+
+    * After executing the ``discovery_provision.yml`` playbook, users can check the log file available at ``/var/log/omnia.log`` for more information (see the example below).
+
+    * racadm and ipmitool are installed on all target nodes except those running Ubuntu 20.04.
+
+    * Ansible playbooks by default run concurrently on 5 nodes. To change this, update the ``forks`` value in the ``ansible.cfg`` file present in the respective playbook directory.
+
+    * While the ``admin_nic`` on cluster nodes is configured by Omnia to be static, the public NIC IP address should be configured by the user.
+
+    * If the target nodes were discovered using switch-based or mapping mechanisms, manually PXE boot the target servers after the ``discovery_provision.yml`` playbook is executed and the target node lists as **booted** in the `nodeinfo table `_.
+
+    * All ports required for xCAT to run will be opened (for a complete list, check out the `Security Configuration Document <../../../SecurityConfigGuide/ProductSubsystemSecurity.html#firewall-settings>`_).
+
+    * After running ``discovery_provision.yml``, the file ``input/provision_config_credentials.yml`` will be encrypted. To edit the file, use the command: ``ansible-vault edit provision_config_credentials.yml --vault-password-file .provision_credential_vault_key``
+
+    * Post execution of ``discovery_provision.yml``, IPs/hostnames cannot be re-assigned by changing the mapping file. However, the addition of new nodes is supported as explained `here <../../Maintenance/addnode.html>`_.
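+
+For example, to follow provisioning progress from another shell on the OIM while the playbook runs (a convenience, not a requirement): ::
+
+    tail -f /var/log/omnia.log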
+.. caution::
+
+    * Once xCAT is installed, restart your SSH session to the OIM to ensure that the newly set up environment variables come into effect. If the new environment variables still do not come into effect, enable them manually using: ::
+
+        source /etc/profile.d/xcat.sh
+
+    * To avoid breaking the password-less SSH channel on the OIM, do not run ``ssh-keygen`` commands post execution of ``discovery_provision.yml`` to create a new key.
+    * Do not delete the following directories:
+        - ``/root/xcat``
+        - ``/root/xcat-dbback``
+        - ``/docker-registry``
+        - ``/opt/omnia``
+        - ``/var/log/omnia``
+        - ``/opt/omnia17_venv/``
+    * On subsequent runs of ``discovery_provision.yml``, if users are unable to log into the server, refresh the SSH key manually and retry. ::
+
+        ssh-keygen -R <node IP>
+
+    * If a subsequent run of ``discovery_provision.yml`` fails, the ``input/provision_config_credentials.yml`` file will be left unencrypted.
+
+**Next steps**:
+
+* Create a node inventory in ``/opt/omnia``. To know more, `click here <../ViewInventory.html>`_.
+
+* After creating an inventory, go to `Configure the cluster <../OmniaCluster/index.html>`_ to build a cluster and set up Kubernetes, NFS, BeeGFS, and Authentication.
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/provisionparams.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/provisionparams.rst
new file mode 100644
index 000000000..c819f6d83
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/provisionparams.rst
@@ -0,0 +1,107 @@
+Input parameters for the provision tool
+-----------------------------------------
+
+Fill in all required parameters in ``input/provision_config.yml``, ``input/provision_config_credentials.yml``, ``input/software_config.json``, and ``input/network_spec.yml``.
+
+.. caution:: Do not remove or comment out any lines in the above-mentioned ``.yml`` files.
+
+.. csv-table:: provision_config.yml
+   :file: ../../../Tables/Provision_config.csv
+   :header-rows: 1
+   :keepspace:
+
+.. [1] Boolean parameters do not need to be passed with double or single quotes.
+
+.. csv-table:: provision_config_credentials.yml
+   :file: ../../../Tables/Provision_creds.csv
+   :header-rows: 1
+   :keepspace:
+
+.. note::
+
+    * The ``input/provision_config_credentials.yml`` file is encrypted on the first execution of the ``discovery_provision.yml`` or ``local_repo.yml`` playbooks.
+
+    * To view the encrypted parameters: ::
+
+        ansible-vault view provision_config_credentials.yml --vault-password-file .provision_credential_vault_key
+
+    * To edit the encrypted parameters: ::
+
+        ansible-vault edit provision_config_credentials.yml --vault-password-file .provision_credential_vault_key
+
+.. csv-table:: software_config.json
+   :file: ../../../Tables/software_config_ubuntu.csv
+   :header-rows: 1
+   :keepspace:
+
+.. csv-table:: network_spec.yml
+   :file: ../../../Tables/network_spec.csv
+   :header-rows: 1
+   :keepspace:
+
+.. note:: While provisioning the cluster, ensure to add an upstream DNS server IP against the ``DNS`` entry in ``input/network_spec.yml`` if you intend to use the PowerScale SmartConnect hostname later. For more information, `click here <../AdvancedConfigurationsUbuntu/PowerScale_CSI.html#powerscale-smartconnect-optional>`_.
+
+.. note::
+
+    * If the ``nic_name`` is identical on both the ``admin_network`` and the ``bmc_network``, it indicates a LOM setup. Otherwise, it's a dedicated setup.
+    * BMC network details are not required when target nodes are discovered using a mapping file.
+    * If ``bmc_network`` properties are provided, target nodes will be discovered using the BMC method in addition to the methods whose details are explicitly provided in ``provision_config.yml``.
+    * The strings ``admin_network`` and ``bmc_network`` in the ``input/network_spec.yml`` file should not be edited. Also, the properties ``nic_name``, ``static_range``, and ``dynamic_range`` cannot be edited on subsequent runs of the provision tool.
+    * ``netmask_bits`` is mandatory and should be the same for both the ``admin_network`` and the ``bmc_network``. Any value between 1 and 32 (1 and 32 included) is accepted.
+
+.. caution::
+    * Do not assign the subnet 10.4.0.0/24 to any interfaces in the network, as nerdctl uses it by default.
+    * All provided network ranges and NIC IP addresses should be distinct, with no overlap in the ``input/network_spec.yml``.
+    * Ensure that all the iDRACs are reachable from the OIM.
+
+A sample of the ``input/network_spec.yml`` where nodes are discovered using a mapping file is provided below: ::
+
+    ---
+    Networks:
+    - admin_network:
+        nic_name: "eno1"
+        netmask_bits: "16"
+        static_range: "10.5.0.1-10.5.0.200"
+        dynamic_range: "10.5.1.1-10.5.1.200"
+        correlation_to_admin: true
+        admin_uncorrelated_node_start_ip: "10.5.0.50"
+        network_gateway: ""
+        DNS: ""
+        MTU: "1500"
+
+    - bmc_network:
+        nic_name: ""
+        netmask_bits: ""
+        static_range: ""
+        dynamic_range: ""
+        reassignment_to_static: true
+        discover_ranges: ""
+        network_gateway: ""
+        MTU: "1500"
+
+A sample of the ``input/network_spec.yml`` where nodes are discovered using the BMC discovery mechanism is provided below: ::
+
+    ---
+    Networks:
+    - admin_network:
+        nic_name: ""
+        netmask_bits: ""
+        static_range: ""
+        dynamic_range: ""
+        correlation_to_admin: true
+        admin_uncorrelated_node_start_ip: ""
+        network_gateway: ""
+        DNS: ""
+        MTU: ""
+
+    - bmc_network:
+        nic_name: "eno1"
+        netmask_bits: "16"
+        static_range: "10.3.0.1-10.3.0.200"
+        dynamic_range: "10.3.1.1-10.3.1.200"
+        reassignment_to_static: true
+        discover_ranges: ""
+        network_gateway: ""
+        MTU: "1500"
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Provision/provisionprereqs.rst b/docs/source/OmniaInstallGuide/Ubuntu/Provision/provisionprereqs.rst
new file mode 100644
index 000000000..cbcee354c
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Provision/provisionprereqs.rst
@@ -0,0 +1,76 @@
+Before you run the provision tool
+---------------------------------
+
+* (Recommended) Run ``prereq.sh`` to get the system ready to deploy Omnia.
+
+* All target bare-metal servers (cluster nodes) should be reachable from the chosen OIM.
+
+* The UEFI boot setting should be configured in the BIOS settings before initiating PXE boot on the nodes.
+
+* Admin and BMC network switches should be configured before running the provision tool. For more information on configuring the switches, `click here <../AdvancedConfigurationsUbuntu/ConfiguringSwitches/index.html>`_.
+
+* Set the IP address of the OIM. The OIM NIC connected to remote servers (through the switch) should be configured with two IPs (BMC IP and admin IP) in a `shared LOM <../../../Overview/NetworkTopologies/lom.html>`_ or `hybrid <../../../Overview/NetworkTopologies/hybrid.html>`_ setup. In the case of a `dedicated <../../../Overview/NetworkTopologies/dedicated.html>`_ setup, a single IP (admin IP) is required.
+
+.. figure:: ../../../images/ControlPlaneNic.png
+
+    *OIM NIC IP configuration in a LOM or Hybrid setup*
+
+.. figure:: ../../../images/ControlPlane_DedicatedNIC.png
+
+    *OIM NIC IP configuration in a dedicated setup*
+
+* Set the hostname of the OIM in the ``hostname.domain_name`` format.
+
+    .. include:: ../../../Appendices/hostnamereqs.rst
+
+    For example, ``controlplane.omnia.test`` is acceptable. ::
+
+        hostnamectl set-hostname controlplane.omnia.test
+
+.. note:: The domain name specified for the OIM should be the same as the one specified under ``domain_name`` in ``input/provision_config.yml``.
+
+* To provision the bare metal servers, download the following ISO to the OIM:
+
+    `Ubuntu 22.04 `_
+
+.. note:: Ensure the ISO provided has downloaded seamlessly (not corrupted). Verify the SHA checksum/download size of the ISO file before provisioning to avoid future failures.
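+
+For example, the checksum can be verified as shown below (a sketch; the filename shown is an assumption, and the output should be compared against the checksum published on the Ubuntu release page): ::
+
+    sha256sum ubuntu-22.04.5-live-server-amd64.iso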
+
+Note the compatibility between cluster OS and OIM OS below:
+
+    +---------------------+--------------------+------------------+
+    | OIM OS              | Cluster Node OS    | Compatibility    |
+    +=====================+====================+==================+
+    | Ubuntu              | Ubuntu             | Yes              |
+    +---------------------+--------------------+------------------+
+
+* Ensure that all connection names under the network manager match their corresponding device names.
+
+    To verify network connection names: ::
+
+        nmcli connection
+
+    To verify the device name: ::
+
+        ip link show
+
+    In the event of a mismatch, edit the ``/etc/netplan/00-installer-config.yaml`` file using the vi editor on Ubuntu clusters.
+
+* When discovering nodes via a mapping file, all target nodes should be set up in PXE mode before running the playbook (see the example at the end of this section).
+
+.. note::
+
+    * After configuration and provisioning of the cluster, changing the OIM server is not supported. If you need to change the OIM, you must redeploy the entire cluster.
+
+    * For servers with an existing OS being discovered via BMC, ensure that the first PXE device on target nodes is the designated active NIC for PXE booting.
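+
+Where a manual PXE boot is needed, one way to trigger it out-of-band is with ipmitool. This is a sketch rather than an Omnia-mandated procedure; the BMC IP and credentials are placeholders, and ipmitool must be available on the machine issuing the commands: ::
+
+    ipmitool -I lanplus -H <bmc_ip> -U <username> -P <password> chassis bootdev pxe
+    ipmitool -I lanplus -H <bmc_ip> -U <username> -P <password> power cycle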
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Setup_CP_proxy.rst b/docs/source/OmniaInstallGuide/Ubuntu/Setup_CP_proxy.rst
new file mode 100644
index 000000000..ee9e98386
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Setup_CP_proxy.rst
@@ -0,0 +1,48 @@
+Configure a proxy server for the OIM
+========================================
+
+.. note:: You can skip the proxy setup in the ``site_config.yml`` input file if you have direct internet access on the OIM.
+
+OIM proxy configuration is now available for Omnia users. In this setup, the OIM does not access the internet directly, but through a proxy server. To set up the OIM with a proxy server, do the following:
+
+1. Go to the ``omnia/input`` folder.
+
+2. Open the ``site_config.yml`` file and add the proxy server details to the ``proxy`` variable, as explained below:
+
++-----------------------------+--------------------------------------------------------------------------------------------------------------+
+| Parameter                   | Description                                                                                                  |
++=============================+==============================================================================================================+
+| **http_proxy**              | * This variable points to the HTTP proxy server and the port associated with the proxy server.              |
+| (Mandatory)                 | * **Example:** ``"http://corporate-proxy:3128"``                                                             |
++-----------------------------+--------------------------------------------------------------------------------------------------------------+
+| **https_proxy**             | * This variable points to the HTTPS proxy server and the port associated with the proxy server.             |
+| (Mandatory)                 | * **Example:** ``"https://corporate-proxy:3128"``                                                            |
++-----------------------------+--------------------------------------------------------------------------------------------------------------+
+| **no_proxy**                | * This variable is configured with the OIM hostname, the admin network IP, or any internal cluster network. |
+| (Optional)                  | * This value is required to exclude the internal cluster network from the proxy server.                     |
+|                             | * **Example:** ``controlplane.omnia.test,10.5.0.1``                                                          |
++-----------------------------+--------------------------------------------------------------------------------------------------------------+
+
+    Sample input: ::
+
+        proxy:
+           - { http_proxy: "http://corporate-proxy:3128", https_proxy: "http://corporate-proxy:3128", no_proxy: "controlplane.omnia.test,10.5.0.1" }
+
+3. Configure the ``http_proxy``, ``https_proxy``, and ``no_proxy`` environment variables on the OIM server.
+
+    * Execute the following commands to temporarily update the proxy environment variables: ::
+
+        export http_proxy=http://<proxy_server>:<port>
+        export https_proxy=http://<proxy_server>:<port>
+        export no_proxy="<OIM_hostname>","<admin_network_IP>"
+
+    * For a persistent proxy, update ``/etc/environment`` or ``/root/.bashrc`` with the proxy environment details. ::
+
+        http_proxy=http://<proxy_server>:<port>
+        https_proxy=http://<proxy_server>:<port>
+        no_proxy="<OIM_hostname>","<admin_network_IP>"
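+
+    Continuing the sample values used above, the persistent entries in ``/etc/environment`` might look like this (a sketch; substitute your own proxy host and cluster details): ::
+
+        http_proxy=http://corporate-proxy:3128
+        https_proxy=http://corporate-proxy:3128
+        no_proxy="controlplane.omnia.test","10.5.0.1"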
+
+.. caution:: You must configure the proxy environment variables on the OIM before running any Omnia playbooks.
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/UbuntuSpace.rst b/docs/source/OmniaInstallGuide/Ubuntu/UbuntuSpace.rst
new file mode 100644
index 000000000..0223f0135
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/UbuntuSpace.rst
@@ -0,0 +1,16 @@
+Space requirements for the OIM running on Ubuntu OS
+==============================================================
+
+* For all available software packages that Omnia supports: 50GB
+* For the complete set of software images (in the ``/`` or ``/var`` partition): 500GB
+* For nodes with limited storage space in the ``/`` or ``/var`` partition, Omnia suggests executing the ``local_repo.yml`` playbook with ``repo_config`` set to ``never`` in ``input/local_repo_config.yml``. In this scenario, all software packages are downloaded and stored in a pre-defined user registry.
+* For storing offline repositories (the file path should be specified in ``repo_store_path`` in ``input/local_repo_config.yml``): 50GB
+
+.. note:: Docker and nerdctl services operate from the ``/var/lib/`` directory. If the OIM has storage constraints, users can mount this directory to another drive of their choice that has more storage capacity. Alternatively, the user can mount any external NFS server on the OIM and use that to store the software packages.
+
+.. csv-table:: Space requirements for images and packages on OIM
+   :file: ../../Tables/Ubuntu_space_req.csv
+   :header-rows: 1
+   :keepspace:
+
+.. [1] Space allocated as part of the OS repository (.iso). No extra space required.
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/Ubuntu_prereq.rst b/docs/source/OmniaInstallGuide/Ubuntu/Ubuntu_prereq.rst
new file mode 100644
index 000000000..78de8e8fd
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/Ubuntu_prereq.rst
@@ -0,0 +1,16 @@
+Prerequisites
+=================
+
+1. Choose a server outside of your intended cluster with the mentioned `storage requirements `_ to function as your Omnia Infrastructure Manager (OIM).
+
+2. Ensure that the OIM has the "server install image" of the Ubuntu operating system (OS) installed. For a complete list of supported OS versions, check out the `Support Matrix <../../Overview/SupportMatrix/OperatingSystems/index.html>`_.
+
+3. Ensure that the OIM is internet-capable and has Git installed. If Git is not installed, use the below command to install it. ::
+
+    apt install git -y
+
+4. Clone the Omnia repository from GitHub onto the OIM server using the following command: ::
+
+    git clone https://github.com/dell/omnia.git
+
+5. [Optional] `Set up a proxy server for the OIM `_.
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/ViewInventory.rst b/docs/source/OmniaInstallGuide/Ubuntu/ViewInventory.rst
new file mode 100644
index 000000000..df0984fee
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/ViewInventory.rst
@@ -0,0 +1,55 @@
+Step 4: View node inventory
+=================================
+
+When ``discovery_provision.yml``, ``prepare_oim.yml``, or ``utils/inventory_tagging.yml`` is run, a set of inventory files is created in ``/opt/omnia/omnia_inventory/`` based on `the Omnia database. `_ The inventories are grouped by the type of CPUs and GPUs the nodes have. The inventory files are:
+
+    * ``compute_cpu_amd`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_cpu_amd]
+        node001.omnia.test
+
+    * ``compute_cpu_intel`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_cpu_intel]
+        node001.omnia.test
+
+    * ``compute_gpu_amd`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_gpu_amd]
+        node002.omnia.test
+        node003.omnia.test
+
+    * ``compute_gpu_nvidia`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_gpu_nvidia]
+        node001.omnia.test
+
+    * ``compute_gpu_intel`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_gpu_intel]
+        node001.omnia.test
+
+    * ``compute_hostname_ip`` ::
+
+        # This file is generated by omnia, and should not be edited
+        [compute_hostname_ip]
+        node001.omnia.test ansible_host=10.5.0.2
+        node002.omnia.test ansible_host=10.5.0.3
+        node003.omnia.test ansible_host=10.5.0.4
+
+.. note::
+
+    * Hostnames will only be written into the inventory files after the nodes are successfully PXE booted post provisioning.
+    * For a node's hostname to list in an inventory file, two conditions must be met:
+
+        * Node status must be "booted" in DB.
+        * Node's hostname information is present in DB.
+    * To regenerate all the inventory files, use the playbook ``utils/inventory_tagging.yml``.
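+
+As a quick sanity check, the generated inventories can be used directly with ad-hoc Ansible commands. A sketch, assuming Ansible is available on the OIM: ::
+
+    cd /opt/omnia/omnia_inventory
+    ansible all -i compute_hostname_ip -m ansible.builtin.ping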
diff --git a/docs/source/OmniaInstallGuide/Ubuntu/pullimagestonodes.rst b/docs/source/OmniaInstallGuide/Ubuntu/pullimagestonodes.rst
new file mode 100644
index 000000000..610daefe9
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/Ubuntu/pullimagestonodes.rst
@@ -0,0 +1,105 @@
+Download custom packages/images to the cluster
+===============================================
+
+**Download packages/images to the OIM registry**
+
+To download packages/images to the OIM registry/repository, execute ``local_repo.yml``.
+
+Follow the steps below to download packages/images:
+
+    1. Create a ``.json`` file with all the required packages/images. For example, ``custom_image.json``.
+
+    2. Update ``custom_image.json`` with the package/image information. Follow the sample template added below:
+
+        ::
+
+            {
+                "custom_image": {
+                    "cluster": [
+                        {
+                            "package": "quay.io/jetstack/cert-manager-controller",
+                            "tag": "v1.13.0",
+                            "type": "image"
+                        },
+                        {
+                            "package": "quay.io/jetstack/cert-manager-webhook",
+                            "tag": "v1.13.0",
+                            "type": "image"
+                        },
+                        {
+                            "package": "nfs-common",
+                            "type": "deb",
+                            "repo_name": "jammy"
+                        }
+                    ]
+                }
+            }
+
+    3. Add a ``custom_image`` entry in ``input/software_config.json``:
+
+        ::
+
+            {"name": "custom_image"}
+
+    4. Enter the required softwares in ``software_config.json`` for which the ``.json`` file is created, based on the OS type and version running on the cluster. For example:
+
+        ::
+
+            {
+                "cluster_os_type": "ubuntu",
+                "cluster_os_version": "22.04",
+                "repo_config": "partial",
+                "softwares": [
+                    {"name": "custom_image"}
+                ]
+            }
+
+    5. Execute the following command to download the required images from the internet to the OIM:
+
+        ::
+
+            cd local_repo
+            ansible-playbook local_repo.yml
+
+    .. note:: If a user registry is to be used, ensure to update the registry details in ``input/local_repo_config.yml`` before executing ``local_repo.yml``. For example, ``user_registry: - { host: 192.168.0.1:5001, cert_path: "/home/ca.crt" }``.
+
+**Pull images/packages to the cluster**
+
+    1. Create an inventory file (for example, ``imagepull_inventory.ini``) with the required groups. Assign the required nodes to each group. Images will be pulled to the nodes within these groups. For example, if you have a Kubernetes cluster, the inventory file should contain the ``kube_control_plane`` and ``kube_node`` groups. An inventory example is provided below:
+
+        ::
+
+            # imagepull_inventory.ini
+            [kube_control_plane]
+            10.8.0.1
+
+            [kube_node]
+            10.8.0.2
+            10.8.0.3
+
+    2. Execute the following command to pull images from the OIM to the desired nodes:
+
+        ::
+
+            cd utils
+            ansible-playbook pull_images_to_nodes.yml -i imagepull_inventory.ini
+
+.. note:: Since the nodes are behind the proxy, they don't have direct internet access. Only the OIM has direct access to the public internet.
+    Nodes can connect to the internet via the OIM by setting the ``http_proxy`` and ``https_proxy`` environment variables, in the following format: ::
+
+        export http_proxy=http://<OIM_IP>:3128
+        export https_proxy=http://<OIM_IP>:3128
+
+    Example: ::
+
+        export http_proxy=http://10.5.255.254:3128
+        export https_proxy=http://10.5.255.254:3128
+
+    To pull any specific image to a particular node, do the following:
+
+    1. Connect to the node via the SSH protocol and configure the ``http_proxy`` and ``https_proxy`` environment variables by following the above commands.
+    2. Use the following command to pull any desired image: ::
+
+        nerdctl pull <image_name>:<tag>
\ No newline at end of file
diff --git a/docs/source/OmniaInstallGuide/index.rst b/docs/source/OmniaInstallGuide/index.rst
new file mode 100644
index 000000000..c328402fa
--- /dev/null
+++ b/docs/source/OmniaInstallGuide/index.rst
@@ -0,0 +1,23 @@
+Omnia Installation Guide
+=========================
+
+* `Quick installation guide for RHEL/Rocky Linux clusters `_
+
+* `Quick installation guide for Ubuntu clusters `_
+
+* `Maintenance `_
+
+* `Sample Files `_
+
+
+.. image:: ../images/installation_flow.png
+   :width: 800pt
+
+
+.. 
toctree:: + :hidden: + + RHEL/index + Ubuntu/index + Maintenance/index + samplefiles diff --git a/docs/source/samplefiles.rst b/docs/source/OmniaInstallGuide/samplefiles.rst similarity index 68% rename from docs/source/samplefiles.rst rename to docs/source/OmniaInstallGuide/samplefiles.rst index c6b6c7a6b..92dc987ae 100644 --- a/docs/source/samplefiles.rst +++ b/docs/source/OmniaInstallGuide/samplefiles.rst @@ -59,60 +59,66 @@ inventory file * For Slurm, all the applicable inventory groups are ``slurm_control_node``, ``slurm_node``, and ``login``. * For Kubernetes, all the applicable groups are ``kube_control_plane``, ``kube_node``, and ``etcd``. * The centralized authentication server inventory group, that is ``auth_server``, is common for both Slurm and Kubernetes. + * For secure login node functionality, ensure to add the ``login`` group in the provided inventory file. software_config.json for Ubuntu --------------------------------- :: - { - "cluster_os_type": "ubuntu", - "cluster_os_version": "22.04", - "repo_config": "partial", - "softwares": [ - {"name": "amdgpu", "version": "6.0"}, - {"name": "cuda", "version": "12.3.2"}, - {"name": "bcm_roce", "version": "229.2.61.0"}, - {"name": "roce_plugin"}, - {"name": "ofed", "version": "24.01-0.3.3.1"}, - {"name": "openldap"}, - {"name": "secure_login_node"}, - {"name": "nfs"}, - {"name": "beegfs", "version": "7.4.2"}, - {"name": "k8s", "version":"1.26.12"}, - {"name": "roce_plugin"}, - {"name": "jupyter"}, - {"name": "kubeflow"}, - {"name": "kserve"}, - {"name": "pytorch"}, - {"name": "tensorflow"}, - {"name": "vllm"}, - {"name": "telemetry"}, - {"name": "ucx", "version": "1.15.0"}, - {"name": "openmpi", "version": "4.1.6"} - ], - - "bcm_roce": [ - {"name": "bcm_roce_libraries", "version": "229.2.61.0"} - ], - "amdgpu": [ - {"name": "rocm", "version": "6.0" } - ], - "vllm": [ - {"name": "vllm_amd"}, - {"name": "vllm_nvidia"} - ], - "pytorch": [ - {"name": "pytorch_cpu"}, - {"name": "pytorch_amd"}, - {"name": "pytorch_nvidia"} - ], - "tensorflow": [ - {"name": "tensorflow_cpu"}, - {"name": "tensorflow_amd"}, - {"name": "tensorflow_nvidia"} - ] - } + { + "cluster_os_type": "ubuntu", + "cluster_os_version": "22.04", + "repo_config": "partial", + "softwares": [ + {"name": "amdgpu", "version": "6.2.2"}, + {"name": "cuda", "version": "12.3.2"}, + {"name": "bcm_roce", "version": "230.2.54.0"}, + {"name": "ofed", "version": "24.01-0.3.3.1"}, + {"name": "openldap"}, + {"name": "secure_login_node"}, + {"name": "nfs"}, + {"name": "beegfs", "version": "7.4.2"}, + {"name": "k8s", "version":"1.29.5"}, + {"name": "roce_plugin"}, + {"name": "jupyter"}, + {"name": "kubeflow"}, + {"name": "kserve"}, + {"name": "pytorch"}, + {"name": "tensorflow"}, + {"name": "vllm"}, + {"name": "telemetry"}, + {"name": "ucx", "version": "1.15.0"}, + {"name": "openmpi", "version": "4.1.6"}, + {"name": "intelgaudi", "version": "1.18.0-524"}, + {"name": "csi_driver_powerscale", "version":"v2.11.0"} + ], + + "bcm_roce": [ + {"name": "bcm_roce_libraries", "version": "230.2.54.0"} + ], + "amdgpu": [ + {"name": "rocm", "version": "6.2.2" } + ], + "intelgaudi": [ + {"name": "intel"} + ], + "vllm": [ + {"name": "vllm_amd"}, + {"name": "vllm_nvidia"} + ], + "pytorch": [ + {"name": "pytorch_cpu"}, + {"name": "pytorch_amd"}, + {"name": "pytorch_nvidia"}, + {"name": "pytorch_gaudi"} + ], + "tensorflow": [ + {"name": "tensorflow_cpu"}, + {"name": "tensorflow_amd"}, + {"name": "tensorflow_nvidia"} + ] + } software_config.json for RHEL/Rocky Linux ------------------------------------------- 
@@ -126,7 +132,7 @@
      "cluster_os_version": "8.8",
      "repo_config": "partial",
      "softwares": [
-        {"name": "amdgpu", "version": "6.0"},
+        {"name": "amdgpu", "version": "6.2.2"},
         {"name": "cuda", "version": "12.3.2"},
         {"name": "ofed", "version": "24.01-0.3.3.1"},
         {"name": "freeipa"},
@@ -135,7 +141,7 @@ software_config.json for RHEL/Rocky Linux
         {"name": "nfs"},
         {"name": "beegfs", "version": "7.4.2"},
         {"name": "slurm"},
-        {"name": "k8s", "version":"1.26.12"},
+        {"name": "k8s", "version":"1.29.5"},
         {"name": "jupyter"},
         {"name": "kubeflow"},
         {"name": "kserve"},
@@ -147,11 +153,12 @@ software_config.json for RHEL/Rocky Linux
         {"name": "amd_benchmarks"},
         {"name": "utils"},
         {"name": "ucx", "version": "1.15.0"},
-        {"name": "openmpi", "version": "4.1.6"}
+        {"name": "openmpi", "version": "4.1.6"},
+        {"name": "csi_driver_powerscale", "version":"v2.11.0"}
     ],
 
     "amdgpu": [
-        {"name": "rocm", "version": "6.0" }
+        {"name": "rocm", "version": "6.2.2" }
     ],
     "vllm": [
         {"name": "vllm_amd"},
diff --git a/docs/source/Overview/NetworkTopologies/Hybrid.rst b/docs/source/Overview/NetworkTopologies/Hybrid.rst
index 1f8c42bf1..2d3831844 100644
--- a/docs/source/Overview/NetworkTopologies/Hybrid.rst
+++ b/docs/source/Overview/NetworkTopologies/Hybrid.rst
@@ -1,19 +1,21 @@
-Network Topology: Hybrid setup
-=============================
+Network Topology: Hybrid Setup
+=================================
 
-For an environment containing both LOM and BMC ports, the provision tool needs to be run twice to correctly manage all servers in the network.
+.. note:: The following diagram is for representational purposes only.
 
-.. image:: ../../images/omnia_network_Hybrid.jpg
+.. image:: ../../images/Hybrid_NT.png
 
-In a **Hybrid Setup**, the control plane and special nodes such as the head and login node are connected to the public network, while the iDRAC and the compute nodes use a shared LOM network.
+In a **Hybrid Setup**, the OIM and special nodes such as the head and login node are connected to the public network, while the iDRAC and the compute nodes use a shared LOM network.
 
-* **Public Network (Blue line)**: This indicates that the control plane, head node, and login node is connected to the external public network.
+* **Public Network (Blue line)**: This indicates the external public network which is connected to the internet. NIC2 of the OIM, Head node, and Login node [optional] is connected to the public network. Along with this, the BMC NIC of the Head node is also connected.
 
-* **Cluster Network (Green line)**: This indicates the admin network utilized by Omnia to provision the cluster nodes.
+* **Admin Network and BMC network (Green line)**: This indicates the admin network and the BMC network utilized by Omnia to provision the cluster nodes and to control the cluster nodes using out-of-band management. NIC1 of all the nodes is connected to the private switch.
 
-* **IB Network (Yellow line)**: The network used by the applications on the cluster nodes to communicate among each other.
+* **IB / Additional Ethernet Network (Yellow line)**: This indicates the InfiniBand (IB) or the additional ethernet network used by applications on the cluster nodes to communicate among each other, using a Mellanox or high-speed ethernet switch. OIM connectivity is optional for this switch.
+
+.. note:: Omnia supports classless IP addressing, which allows the Admin network, BMC network, Public network, and the Additional network to be assigned different subnets. However, the Admin and BMC networks must use the same subnet mask (represented by the ``netmask_bits`` parameter in the ``input/network_spec.yml`` file).
 
 **Recommended discovery mechanism**
 
-* `mapping <../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/mappingfile.html>`_
-* `bmc <../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/bmc.html>`_
\ No newline at end of file
+* `mapping <../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/mappingfile.html>`_
+* `bmc <../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/bmc.html>`_
\ No newline at end of file
diff --git a/docs/source/Overview/NetworkTopologies/dedicated.rst b/docs/source/Overview/NetworkTopologies/dedicated.rst
index 405a68228..b6b31b482 100644
--- a/docs/source/Overview/NetworkTopologies/dedicated.rst
+++ b/docs/source/Overview/NetworkTopologies/dedicated.rst
@@ -1,20 +1,24 @@
 Network Topology: Dedicated Setup
-=================================
+====================================
 
-.. image:: ../../images/omnia_network_Dedicated.jpg
+.. note:: The following diagram is for representational purposes only.
 
-In a **Dedicated Setup**, all the cluster nodes (head, login, and compute) have dedicated iDRAC connection.
+.. image:: ../../images/Dedicated_NT.png
 
-* **Public Network (Blue line)**: This indicates the iDRAC network which is connected to the external public network.
+In a **Dedicated Setup**, all the cluster nodes (Head, Compute, and Login [optional]) have a dedicated iDRAC connection.
 
-* **iDRAC Network (Red line)**: This indicates the private iDRAC network used by the control plane to control the cluster nodes using out-of-band management.
+* **Public Network (Blue line)**: This indicates the external public network which is connected to the internet. NIC2 of the OIM, Head node, and Login node [optional] is connected to the public network.
 
-* **Cluster Network (Green line)**: This indicates the admin network utilized by Omnia to provision the cluster nodes.
+* **BMC Network (Red line)**: This indicates the private BMC (iDRAC) network used by the OIM to control the cluster nodes using out-of-band management.
 
-* **IB Network (Yellow line)**: The network used by the applications on the cluster nodes to communicate among each other.
+* **Admin Network (Green line)**: This indicates the admin network utilized by Omnia to provision the cluster nodes. NIC1 of all the nodes is connected to the private switch.
+
+* **IB / Additional Ethernet Network (Yellow line)**: This indicates the InfiniBand (IB) or the additional ethernet network used by applications on the cluster nodes to communicate among each other, using a Mellanox or high-speed ethernet switch. OIM connectivity is optional for this switch.
+
+.. note:: Omnia supports classless IP addressing, which allows the Admin network, BMC network, Public network, and the Additional network to be assigned different subnets. However, the Admin and BMC networks must use the same subnet mask (represented by the ``netmask_bits`` parameter in the ``input/network_spec.yml`` file).
 
 **Recommended discovery mechanism**
 
-* `mapping <../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/mappingfile.html>`_
-* `bmc <../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/bmc.html>`_
+* `mapping <../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/mappingfile.html>`_
+* `bmc <../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/bmc.html>`_
diff --git a/docs/source/Overview/NetworkTopologies/index.rst b/docs/source/Overview/NetworkTopologies/index.rst
index f20402cd5..39a33ee9b 100644
--- a/docs/source/Overview/NetworkTopologies/index.rst
+++ b/docs/source/Overview/NetworkTopologies/index.rst
@@ -1,7 +1,8 @@
 Network Topologies
 ===================
 
- .. toctree::
+.. toctree::
+
     dedicated
     lom
-    Hybrid
\ No newline at end of file
+    hybrid
diff --git a/docs/source/Overview/NetworkTopologies/lom.rst b/docs/source/Overview/NetworkTopologies/lom.rst
index 9743507f7..1d580239f 100644
--- a/docs/source/Overview/NetworkTopologies/lom.rst
+++ b/docs/source/Overview/NetworkTopologies/lom.rst
@@ -1,22 +1,23 @@
-Network Topology: LAN on motherboard (LOM) Setup
-==================================================
+Network Topology: Shared LAN on motherboard (LOM) Setup
+=========================================================
+
+.. note:: The following diagram is for representational purposes only.
 
-A LOM port could be shared with the host operating system production traffic. Also, LOM ports can be dedicated for server management. For example, with a four-port LOM adapter, LOM ports one and two could be used for production data while three and four could be used for iDRAC, VNC, RDP, or other operating system-based management data.
+.. image:: ../../images/LOM_NT.png
 
-.. image:: ../../images/omnia_network_LOM.jpg
+In a **Shared LOM setup**, the Administration and BMC logical networks share the same ethernet segment and physical connection.
 
-In a shared **LOM Setup**, the entire cluster network is shared between the iDRAC and the cluster nodes.
+* **Public Network (Blue line)**: This indicates the external public network which is connected to the internet. NIC2 of the OIM and the Login node [optional] is connected to the public network.
 
-* **Public Network (Blue line)**: This indicates that only the control plane and login node is connected to the external public network.
+* **Admin Network and BMC network (Green line)**: This indicates the admin network and the BMC network utilized by Omnia to provision the cluster nodes and to control the cluster nodes using out-of-band management. NIC1 of all the nodes is connected to the private switch.
 
-* **Cluster Network (Green line)**: This indicates the admin network utilized by Omnia to provision all the cluster nodes (login, head, and compute).
+* **IB / Additional Ethernet Network (Yellow line)**: This indicates the InfiniBand (IB) or the additional ethernet network used by applications on the cluster nodes to communicate among each other, using a Mellanox or high-speed ethernet switch. OIM connectivity is optional for this switch.
 
-* **IB Network (Yellow line)**: The network used by the applications on the head and compute nodes to communicate among each other.
+.. note:: Omnia supports classless IP addressing, which allows the Admin network, BMC network, Public network, and the Additional network to be assigned different subnets. However, the Admin and BMC networks must use the same subnet mask (represented by the ``netmask_bits`` parameter in the ``input/network_spec.yml`` file).
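+
+For instance, the sample ``input/network_spec.yml`` values used elsewhere in this guide are consistent with this rule: both networks share the same ``netmask_bits`` value even though their address ranges differ (a sketch; the ranges are illustrative): ::
+
+    admin_network:
+        netmask_bits: "16"                      # 10.5.0.0/16
+        static_range: "10.5.0.1-10.5.0.200"
+    bmc_network:
+        netmask_bits: "16"                      # 10.3.0.0/16, same prefix length
+        static_range: "10.3.0.1-10.3.0.200"
+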
**Recommended discovery mechanism** -* `mapping <../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/mappingfile.html>`_ -* `bmc <../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/bmc.html>`_ -* `switch-based <../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/switch-based.html>`_ +* `mapping <../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/mappingfile.html>`_ +* `bmc <../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/bmc.html>`_ +* `switch-based <../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/switch-based.html>`_ diff --git a/docs/source/Overview/SupportMatrix/Hardware/gpu.rst b/docs/source/Overview/SupportMatrix/Hardware/gpu.rst index 476d4dfc7..a7de9c40c 100644 --- a/docs/source/Overview/SupportMatrix/Hardware/gpu.rst +++ b/docs/source/Overview/SupportMatrix/Hardware/gpu.rst @@ -1,13 +1,15 @@ GPUs a.k.a. Accelerators ========================== - +--------+-----------------------------------+ - | GPU | Models | - +========+===================================+ - | NVIDIA | T4, A10, A30, A100, H100, L40 | - +--------+-----------------------------------+ - | AMD | MI100, MI200, MI210, MI300X | - +--------+-----------------------------------+ + +----------------+-----------------------------------+------------------------------------------------+ + | GPU Make | Models supported by Omnia | Models validated with current version of Omnia | + +================+===================================+================================================+ + | NVIDIA | T4, A10, A30, A100, H100, L40 | A100, L40 | + +----------------+-----------------------------------+------------------------------------------------+ + | AMD | MI100, MI200, MI210, MI300X | MI200, MI210, MI300X | + +----------------+-----------------------------------+------------------------------------------------+ + | Intel | Gaudi 3 (Pre-enablement) | Gaudi 3 | + +----------------+-----------------------------------+------------------------------------------------+ -.. versionadded:: 1.6 - AMD: MI300X \ No newline at end of file +.. versionadded:: 1.7 + Intel Gaudi 3 (Pre-enablement) \ No newline at end of file diff --git a/docs/source/Overview/SupportMatrix/Hardware/nics.rst b/docs/source/Overview/SupportMatrix/Hardware/nics.rst index 57ffc1144..12564ab0f 100644 --- a/docs/source/Overview/SupportMatrix/Hardware/nics.rst +++ b/docs/source/Overview/SupportMatrix/Hardware/nics.rst @@ -1,7 +1,10 @@ NICs ===== - .. csv-table:: Supported NICs + .. csv-table:: :file: ../../../Tables/supported-nics.csv :header-rows: 1 - :keepspace: \ No newline at end of file + :keepspace: + +.. versionadded:: 1.7 + 100GbE Ethernet Network Adapter E810 \ No newline at end of file diff --git a/docs/source/Overview/SupportMatrix/Hardware/servers.rst b/docs/source/Overview/SupportMatrix/Hardware/servers.rst index 9eb9c40f7..8340d3a42 100644 --- a/docs/source/Overview/SupportMatrix/Hardware/servers.rst +++ b/docs/source/Overview/SupportMatrix/Hardware/servers.rst @@ -1,37 +1,21 @@ Servers ======== -PowerEdge servers ------------------- - .. csv-table:: Supported PowerEdge servers - :file: ../../../Tables/supported-poweredge-servers.csv +PowerEdge Intel servers +--------------------------- + + .. csv-table:: + :file: ../../../Tables/supported-poweredge-intel-servers.csv :header-rows: 1 :keepspace: -.. [1] The R760xa supports both H100 and A100 GPUs. - -.. 
note:: Since Cloud Enclosures only support shared LOM connectivity, it is recommended that `BMC <../../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/bmc.html>`_ or `switch-based <../../../InstallationGuides/InstallingProvisionTool/DiscoveryMechanisms/switch-based.html>`_ methods of discovery are used. - -AMD servers ------------ - +-------------+-----------------------------------+ - | Server Type | Server Model | - +=============+===================================+ - | 14G | R6415, R7415, R7425 | - +-------------+-----------------------------------+ - | 15G | R6515, R6525, R7515, R7525, C6525 | - +-------------+-----------------------------------+ - | 16G | R6625, R7625, R7615, R6615 | - +-------------+-----------------------------------+ +.. note:: Since C-Series servers only support shared LOM connectivity, it is recommended to use BMC or switch-based server `discovery mechanisms <../../../OmniaInstallGuide/Ubuntu/Provision/DiscoveryMechanisms/index.html>`_. -.. versionadded:: 1.2 - 15G servers +PowerEdge AMD servers +------------------------- -.. versionadded:: 1.3 - AMD servers - -.. versionadded:: 1.4.1 - Intel 16G servers + .. csv-table:: + :file: ../../../Tables/supported-poweredge-amd-servers.csv + :header-rows: 1 + :keepspace: -.. versionadded:: 1.4.3 - Intel: R760, XE8640, R760xa, R760xd2, XE9680; AMD 16G servers diff --git a/docs/source/Overview/SupportMatrix/Hardware/storage.rst b/docs/source/Overview/SupportMatrix/Hardware/storage.rst index d37c03a35..effde893a 100644 --- a/docs/source/Overview/SupportMatrix/Hardware/storage.rst +++ b/docs/source/Overview/SupportMatrix/Hardware/storage.rst @@ -1,37 +1,46 @@ Storage ======== -Powervault Storage +PowerVault Storage ------------------ -+--------------+------------------------+ -| Storage Type | Storage Model | -+==============+========================+ -| ME4 | ME4084, ME4024, ME4012 | -+--------------+------------------------+ -| ME5 | ME5012, ME5024, ME5084 | -+--------------+------------------------+ ++--------------+---------------------------+------------------------------------------------+ +| Storage Type | Models supported by Omnia | Models validated with current version of Omnia | ++==============+===========================+================================================+ +| ME4 | ME4084, ME4024, ME4012 | ME4024 | ++--------------+---------------------------+------------------------------------------------+ +| ME5 | ME5012, ME5024, ME5084 | | ++--------------+---------------------------+------------------------------------------------+ -.. versionadded:: 1.3 - PowerVault ME5 storage support +.. note:: Omnia supports configuration of RAID levels, volumes, pool, and SNMP on PowerVault devices. For more information on PowerVault configuration using Omnia, `click here <../../../OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/ConfiguringStorage/index.html#configuring-storage>`_. 
BOSS Controller Cards ---------------------- -+-----------------------------------------------------+ -| Supported BOSS controllers | -+=====================================================+ -| Dell Boot Optimized Storage Solution-N1 (BOSS-N1) | -+-----------------------------------------------------+ -| Dell Boot Optimized Storage Solution-S1 (BOSS-S1) | -+-----------------------------------------------------+ -| Dell Boot Optimized Storage Solution-S2 (BOSS-S2) | -+-----------------------------------------------------+ ++-----------------------------------------------------+-----------------------------------------------------+ +| Models supported by Omnia | Models validated with current version of Omnia | ++=====================================================+=====================================================+ +| Dell Boot Optimized Storage Solution-N1 (BOSS-N1) | Dell Boot Optimized Storage Solution-N1 (BOSS-N1) | ++-----------------------------------------------------+-----------------------------------------------------+ +| Dell Boot Optimized Storage Solution-S1 (BOSS-S1) | Dell Boot Optimized Storage Solution-S1 (BOSS-S1) | ++-----------------------------------------------------+-----------------------------------------------------+ +| Dell Boot Optimized Storage Solution-S2 (BOSS-S2) | Dell Boot Optimized Storage Solution-S2 (BOSS-S2) | ++-----------------------------------------------------+-----------------------------------------------------+ +.. note:: Omnia does not support virtual drive configuration for BOSS cards. A virtual drive is present by default on the BOSS card, but if it is missing, the user must manually create one before running the ``discovery_provision.yml`` playbook. -.. versionadded:: 1.2.1 - BOSS controller cards +PowerScale Storage +---------------------- + ++-------------------------------+------------------------------------------------+ +| Models supported by Omnia | Models validated with current version of Omnia | ++===============================+================================================+ +| PowerScale H5600, H7000, H500 | PowerScale H500 | ++-------------------------------+------------------------------------------------+ +| PowerScale F600, F900, F710 | PowerScale F710 | ++-------------------------------+------------------------------------------------+ -.. versionadded:: 1.6 - BOSS N1 +.. versionadded:: 1.7 + PowerScale H500, PowerScale F710 +.. note:: Omnia does not support configuring PowerScale; it only allows users to add an existing PowerScale node to a Kubernetes cluster. \ No newline at end of file diff --git a/docs/source/Overview/SupportMatrix/Hardware/switches.rst b/docs/source/Overview/SupportMatrix/Hardware/switches.rst index ed46a9f13..06e9da56a 100644 --- a/docs/source/Overview/SupportMatrix/Hardware/switches.rst +++ b/docs/source/Overview/SupportMatrix/Hardware/switches.rst @@ -1,31 +1,18 @@ Switches ======== -+------------------------------+----------------------------------------------------------------------------------------------------------------+ -| Switch Type | Switch Model | -+==============================+================================================================================================================+ -| NVIDIA InfiniBand Switches | 1. NVIDIA MQM8700-HS2F Quantum HDR InfiniBand Switch 40 QSFP56 | -| | 2. 
NVIDIA QUANTUM-2 QM9700                                                                        |
-+------------------------------+----------------------------------------------------------------------------------------------------------------+
-
-
-+------------------------------+----------------------------------------------------------------------------------------------------------------+
-| Switch Type                  | Switch Model                                                                                                   |
-+==============================+================================================================================================================+
-| Dell Networking Switches     | 1. PowerSwitch S3048-ON                                                                                        |
-|                              | 2. PowerSwitch S5232F-ON                                                                                       |
-|                              | 3. PowerSwitch Z9264F-ON                                                                                       |
-|                              | 4. PowerSwitch N3248TE-ON                                                                                      |
-|                              | 5. PowerSwitch S4148                                                                                           |
-|                              | 6. PowerSwitch Z9664F                                                                                          |
-+------------------------------+----------------------------------------------------------------------------------------------------------------+
-
-.. versionadded:: 1.6
-    PowerSwitch Z9664F
+.. csv-table::
+   :file: ../../../Tables/supported-switches.csv
+   :header-rows: 1
+   :keepspace:
+
+.. versionadded:: 1.7
+    PowerSwitch Z9432-ON,
+    PowerSwitch Z9864F-ON
 
 .. note::
 
-    * The switches that have reached EOL might not function properly. It is recommended by Omnia to use switch models mentioned in support matrix.
+    * The switches that have reached EOL might not function properly. We recommend using the switch models mentioned in the support matrix.
 
     * Omnia requires OS10 to be installed on ethernet switches.
 
diff --git a/docs/source/Overview/SupportMatrix/OperatingSystems/RedHat.rst b/docs/source/Overview/SupportMatrix/OperatingSystems/RedHat.rst
index 86951acfd..13ed68d9f 100644
--- a/docs/source/Overview/SupportMatrix/OperatingSystems/RedHat.rst
+++ b/docs/source/Overview/SupportMatrix/OperatingSystems/RedHat.rst
@@ -1,14 +1,10 @@
 Red Hat Enterprise Linux (RHEL)
 ===============================
 
-========== ============= =============
-OS Version Control Plane Cluster Nodes
-========== ============= =============
-8.6        Yes           Yes
-8.7 [1]_   Yes           Yes
+========== ============= ===============
+OS Version OIM           Cluster Nodes
+========== ============= ===============
 8.8        Yes           Yes
-========== ============= =============
+========== ============= ===============
 
-.. [1] This version of RHEL does not support vLLM installation via Omnia.
-.. note:: Always deploy the "Server with GUI" edition of the RHEL OS on the control plane.
\ No newline at end of file
diff --git a/docs/source/Overview/SupportMatrix/OperatingSystems/Rocky.rst b/docs/source/Overview/SupportMatrix/OperatingSystems/Rocky.rst
index 13ecc0553..99c81f021 100644
--- a/docs/source/Overview/SupportMatrix/OperatingSystems/Rocky.rst
+++ b/docs/source/Overview/SupportMatrix/OperatingSystems/Rocky.rst
@@ -2,19 +2,12 @@ Rocky Linux
 =============
 
 +------------+---------------+---------------+
-| OS Version | Control Plane | Cluster Nodes |
+| OS Version | OIM           | Cluster Nodes |
 +============+===============+===============+
-| 8.6        | Yes           | No            |
-+------------+---------------+---------------+
-| 8.7 [1]_   | Yes           | No            |
-+------------+---------------+---------------+
 | 8.8        | Yes           | Yes           |
 +------------+---------------+---------------+
 
-.. [1] This version of Rocky Linux does not support vLLM installation via Omnia.
-
 .. note::
-    Always deploy the "Server with GUI" edition of the Rocky Linux OS on the control plane.
diff --git a/docs/source/Overview/SupportMatrix/OperatingSystems/Ubuntu.rst b/docs/source/Overview/SupportMatrix/OperatingSystems/Ubuntu.rst index 66b41ba25..8f3f7a4ba 100644 --- a/docs/source/Overview/SupportMatrix/OperatingSystems/Ubuntu.rst +++ b/docs/source/Overview/SupportMatrix/OperatingSystems/Ubuntu.rst @@ -2,16 +2,13 @@ Ubuntu ====== ========== ============= ============= -OS Version Control Plane Cluster Nodes +OS Version OIM Cluster Nodes ========== ============= ============= -20.04 [1]_ Yes Yes -22.04 Yes Yes +22.04.5 Yes Yes ========== ============= ============= -.. [1] This version of Ubuntu does not support vLLM and racadm installation via Omnia. - .. note:: - * Omnia supports only the "server install image" version of Ubuntu on the control plane and the cluster nodes. + * Omnia supports only the "server install image" version of Ubuntu on the OIM and the cluster nodes. * Omnia does not currently support Slurm on Ubuntu. * FreeIPA server is not provided in the default Ubuntu repositories. OpenLDAP is provided as an alternative. * PowerVault storage devices are not compatible with the Ubuntu operating system. As a result, Omnia running on Ubuntu clusters does not support the configuration of PowerVault storage devices. \ No newline at end of file diff --git a/docs/source/Overview/SupportMatrix/OperatingSystems/index.rst b/docs/source/Overview/SupportMatrix/OperatingSystems/index.rst index c2ceea222..02d34d8f5 100644 --- a/docs/source/Overview/SupportMatrix/OperatingSystems/index.rst +++ b/docs/source/Overview/SupportMatrix/OperatingSystems/index.rst @@ -1,7 +1,7 @@ -Operating Systems -================= +Operating Systems (OS) Matrix +================================ -.. note:: Omnia v1.6 does not support minimal OS version of RHEL/Rocky Linux on the control plane, whereas on the nodes (head, compute, and login) minimal OS version is supported. +.. note:: Omnia v1.7 supports both full and minimal OS versions of RHEL/Rocky Linux on the Omnia Infrastructure Manager (OIM), and only the minimal version on the cluster nodes (head, compute, and login). For Ubuntu, only the "server install image" is supported on both the OIM and cluster nodes. .. toctree:: RedHat diff --git a/docs/source/Overview/SupportMatrix/ValidationMatrix.rst b/docs/source/Overview/SupportMatrix/ValidationMatrix.rst deleted file mode 100644 index 2c27042bb..000000000 --- a/docs/source/Overview/SupportMatrix/ValidationMatrix.rst +++ /dev/null @@ -1,83 +0,0 @@ -Validation Matrix ------------------- - -Omnia v1.6 deployment has been validated on the following devices and their combinations. - -Servers -+++++++++ - .. csv-table:: Testing matrix for PowerEdge servers - :file: ../../Tables/testing-matrix-servers.csv - :header-rows: 1 - :keepspace: - - -NICs -+++++ - - +--------------------------------------------------+ - | NIC | - +==================================================+ - | NVIDIA ConnectX-6 | - +--------------------------------------------------+ - | NVIDIA ConnectX-7 | - +--------------------------------------------------+ - | QLogic 4X10GE | - +--------------------------------------------------+ - | Broadcom BCM5720 | - +--------------------------------------------------+ - | Broadcom 10GBASE-T Ethernet | - +--------------------------------------------------+ - | Broadcom BCM5760x | - +--------------------------------------------------+ - -GPUs a.k.a. 
Accelerators -++++++++++++++++++++++++++ - - +---------+-------------------------+ - | GPU | Models | - +=========+=========================+ - | NVIDIA | A100, H100, L40 | - +---------+-------------------------+ - | AMD | MI200, MI210, MI300X | - +---------+-------------------------+ - -Switches -+++++++++ - - +------------------------------+----------------------------------------------------------------------------------------------------------------+ - | Switch Type | Switch Model | - +==============================+================================================================================================================+ - | Mellanox InfiniBand Switches | 1. NVIDIA MQM8700-HS2F Quantum HDR InfiniBand Switch 40 QSFP56 | - | | 2. NVIDIA QUANTUM-2 QM9700 | - +------------------------------+----------------------------------------------------------------------------------------------------------------+ - | Dell Networking Switches | 1. PowerSwitch S3048-ON | - | | 2. PowerSwitch S5232F-ON | - | | 3. PowerSwitch Z9264F-ON | - | | 4. PowerSwitch N3248TE-ON | - | | 5. PowerSwitch S4148 | - +------------------------------+----------------------------------------------------------------------------------------------------------------+ - -Storage -++++++++ - -**Powervault Storage** - - +--------------+------------------------+ - | Storage Type | Storage Model | - +==============+========================+ - | ME4 | ME4084, ME4024, ME4012 | - +--------------+------------------------+ - | ME5 | ME5012, ME5024, ME5084 | - +--------------+------------------------+ - -**BOSS Controller Cards** - - +-----------------------------------------------------+ - | Supported BOSS controllers | - +=====================================================+ - | Dell Boot Optimized Storage Solution-N1 (BOSS-N1) | - +-----------------------------------------------------+ - | Dell Boot Optimized Storage Solution-S1 (BOSS-S1) | - +-----------------------------------------------------+ - | Dell Boot Optimized Storage Solution-S2 (BOSS-S2) | - +-----------------------------------------------------+ \ No newline at end of file diff --git a/docs/source/Overview/SupportMatrix/index.rst b/docs/source/Overview/SupportMatrix/index.rst index e0de785d8..d4903aeaa 100644 --- a/docs/source/Overview/SupportMatrix/index.rst +++ b/docs/source/Overview/SupportMatrix/index.rst @@ -2,7 +2,8 @@ Support Matrix =============== .. toctree:: + :maxdepth: 2 + Hardware/index OperatingSystems/index - ValidationMatrix omniainstalledsoftware \ No newline at end of file diff --git a/docs/source/Overview/architecture.rst b/docs/source/Overview/architecture.rst index b0a55ac78..7c853be69 100644 --- a/docs/source/Overview/architecture.rst +++ b/docs/source/Overview/architecture.rst @@ -1,11 +1,4 @@ Architecture =============== -.. image:: ../images/Omnia_Architecture.png - -Omnia stack ------------ - -.. image:: ../images/omnia-k8s.png - -.. image:: ../images/omnia-slurm.png +.. image:: ../images/Omnia_Architecture.png \ No newline at end of file diff --git a/docs/source/Overview/index.rst b/docs/source/Overview/index.rst index da9d2a2e1..570352e99 100644 --- a/docs/source/Overview/index.rst +++ b/docs/source/Overview/index.rst @@ -1,16 +1,16 @@ Omnia: Overview ================ -**Omnia** -, deriving its name from the Latin term denoting "all" or "everything", is a deployment tool tailored to configure Dell PowerEdge servers operating on standard RPM-based Linux OS images into clusters capable of handling HPC, AI, and data analytics workloads. 
Leveraging Slurm, Kubernetes, and complementary packages, it orchestrates job management and enables execution of varied workloads on the same converged solution. Omnia is a collection of open-source Ansible playbooks, continually evolving to accommodate a wide array of workloads effectively. - - - +**Omnia**, deriving its name from the Latin term denoting "all" or "everything", is a deployment tool tailored to configure Dell PowerEdge servers operating on standard RPM-based Linux OS images into clusters capable of handling HPC, AI, and data analytics workloads. Leveraging Slurm, Kubernetes, and complementary packages, it orchestrates job management and enables execution of varied workloads on the same converged solution. Omnia is a collection of open-source Ansible playbooks, continually evolving to accommodate a wide array of workloads effectively. .. toctree:: + :maxdepth: 2 + architecture + omnia_stack newfeatures releasenotes SupportMatrix/index + omnia_explained NetworkTopologies/index - more_info \ No newline at end of file + more_info diff --git a/docs/source/Overview/more_info.rst b/docs/source/Overview/more_info.rst index 99ce5b571..2397a0d9f 100644 --- a/docs/source/Overview/more_info.rst +++ b/docs/source/Overview/more_info.rst @@ -1,8 +1,5 @@ -Find out more about Omnia -========================= - Blogs about Omnia ------------------ +=================== * `Introduction to Omnia `_ @@ -14,11 +11,3 @@ Blogs about Omnia * `Solution Brief: Omnia Software `_ -What Omnia does ----------------- - -Omnia can deploy and configure devices, and build clusters that use Slurm or Kubernetes (or both) for workload management. Omnia will install software from a variety of sources, including: - - * Helm repositories - - * Source code repositories \ No newline at end of file diff --git a/docs/source/Overview/newfeatures.rst b/docs/source/Overview/newfeatures.rst index 743b4a371..658edfdf3 100644 --- a/docs/source/Overview/newfeatures.rst +++ b/docs/source/Overview/newfeatures.rst @@ -1,4 +1,58 @@ New Features ============ -Omnia v1.6.1 addresses an issue caused due to the unavailability of the dependent package 'libssl1.1_1.1.1f-1ubuntu2.22_amd64' required by Omnia v1.6 for the Ubuntu 22.04 operating system. The focus of this release is to resolve this issue and ensure the proper functionality of Omnia on Ubuntu 22.04 OS. 
\ No newline at end of file +* Omnia now executes exclusively within a virtual environment created by the ``prereq.sh`` script + +* Python version upgraded to 3.11 (Previously 3.9) + +* Ansible version upgraded to 9.5.1 (Previously 7.7.0) + +* Kubernetes version upgraded to 1.29.5 (Previously 1.26.12) + +* Pre-enablement for Intel Gaudi 3 accelerators: + + * Software stack installation (See the `support matrix `_ for the supported Intel firmware version) + + * Accelerator status verification using `HCCL `_ and `hl_qual `_ + + * Inventory tagging for the Gaudi accelerators (``compute_gpu_intel``) + + * Monitoring for the Gaudi accelerators via: + + * Omnia telemetry + * iDRAC telemetry + * Kubernetes telemetry via Prometheus exporter + + * Visualization of the Kubernetes telemetry and Intel Gaudi accelerator metrics using Grafana + + * AI tools enablement: + + * DeepSpeed + * Kubeflow + * vLLM + +* Sample playbook for a pre-trained Generative AI model - Llama 3.1 + +* CSI drivers for Kubernetes to access PowerScale storage with an option to enable the SmartConnect feature (without SSL certificates) + +* Added support for NVIDIA container toolkit for NVIDIA accelerators in a Kubernetes cluster + +* Added support for corporate proxy on RHEL, Rocky Linux, and Ubuntu clusters + +* Set OS Kernel command-line parameters and/or configure additional NICs on the nodes using a single playbook + +* The internal OpenLDAP server can now be configured as a proxy server diff --git a/docs/source/Overview/omnia_explained.rst b/docs/source/Overview/omnia_explained.rst new file mode 100644 index 000000000..e2fc8c303 --- /dev/null +++ b/docs/source/Overview/omnia_explained.rst @@ -0,0 +1,26 @@ +An Omnia cluster +================== + +Omnia can deploy and configure PowerEdge servers (a.k.a. nodes), and build clusters that use Slurm or Kubernetes (or both) for workload management. Apart from the general compute nodes of a cluster, an Omnia cluster has two additional nodes: + +1. **Omnia Infrastructure Manager (OIM)**: The OIM is a dedicated node in the cluster, separate from the compute nodes. It acts as the main hub of the cluster, hosting the Omnia provisioning and monitoring tool. When setting up the cluster, the Omnia repository is downloaded to the OIM. +2. **Head Node**: The head node in an Omnia cluster is a server that is responsible for hosting the scheduling manager (``kube_control_plane`` or ``slurm_control_node``). Similar to the OIM, the head node is separate from the compute nodes in the cluster. It plays a crucial role in managing the scheduling of tasks within the cluster. + +Omnia "AI" cluster +------------------- + +Components of an AI-driven Omnia cluster are: + +* **Head node**: In an AI workload-driven Omnia cluster, the head node is the ``kube_control_plane``, used to manage Kubernetes jobs on the cluster. +* **Compute nodes**: In an AI cluster, a compute node is a ``kube_node``. + +Omnia "HPC" cluster +-------------------- + +Components of an HPC Omnia cluster are: + +* **Head node**: In an HPC cluster, the head node is the ``slurm_control_node``, used to manage Slurm jobs on the cluster. +* **Compute nodes**: In an HPC cluster, a compute node is a ``slurm_node``. +* **[Optional] Login node**: In Omnia, a login node serves as an extra layer of authentication. Users are required to authenticate themselves through this additional login node, which is configured by Omnia.
This setup allows the cluster administrator to limit direct access to the head node (also referred to as ``slurm_control_node``) by users. The login node acts as a gateway for users to securely access the cluster. + +.. note:: If a login node is not present in a Slurm cluster, then only users with access to the head node can submit Slurm jobs. \ No newline at end of file diff --git a/docs/source/Overview/omnia_stack.rst b/docs/source/Overview/omnia_stack.rst new file mode 100644 index 000000000..cb16c5112 --- /dev/null +++ b/docs/source/Overview/omnia_stack.rst @@ -0,0 +1,6 @@ +Omnia Stack +============ + +.. image:: ../images/omnia-k8s.png + +.. image:: ../images/omnia-slurm.png diff --git a/docs/source/Overview/releasenotes.rst b/docs/source/Overview/releasenotes.rst index 71138f10c..47431521a 100644 --- a/docs/source/Overview/releasenotes.rst +++ b/docs/source/Overview/releasenotes.rst @@ -1,10 +1,54 @@ Releases ======== +1.7 +----- + +* Omnia now executes exclusively within a virtual environment created by the ``prereq.sh`` script + +* Python version upgraded to 3.11 (Previously 3.9) + +* Ansible version upgraded to 9.5.1 (Previously 7.7.0) + +* Kubernetes version upgraded to 1.29.5 (Previously 1.26.12) + +* Pre-enablement for Intel Gaudi 3 accelerators: + + * Software stack installation (See the `support matrix `_ for the supported Intel firmware version) + + * Accelerator status verification using `HCCL `_ and `hl_qual `_ + + * Inventory tagging for the Gaudi accelerators (``compute_gpu_intel``) + + * Monitoring for the Gaudi accelerators via: + + * Omnia telemetry + * iDRAC telemetry + * Kubernetes telemetry via Prometheus exporter + + * Visualization of the Kubernetes telemetry and Intel Gaudi accelerator metrics using Grafana + + * AI tools enablement: + + * DeepSpeed + * Kubeflow + * vLLM + +* Sample playbook for a pre-trained Generative AI model - Llama 3.1 + +* CSI drivers for Kubernetes to access PowerScale storage with an option to enable the SmartConnect feature (without SSL certificates) + +* Added support for NVIDIA container toolkit for NVIDIA accelerators in a Kubernetes cluster + +* Added support for corporate proxy on RHEL, Rocky Linux, and Ubuntu clusters + +* Set OS Kernel command-line parameters and/or configure additional NICs on the nodes using a single playbook + +* The internal OpenLDAP server can now be configured as a proxy server + 1.6.1 ------- - -Omnia v1.6.1 addresses an issue caused due to the unavailability of the dependent package 'libssl1.1_1.1.1f-1ubuntu2.22_amd64' required by Omnia v1.6 for the Ubuntu 22.04 operating system. The focus of this release is to resolve this issue and ensure the proper functionality of Omnia on Ubuntu 22.04 OS. +Omnia v1.6.1 addresses an issue caused due to the unavailability of the dependent package ‘libssl1.1_1.1.1f-1ubuntu2.22_amd64’ required by Omnia v1.6 for the Ubuntu 22.04 operating system. The focus of this release is to resolve this issue and ensure the proper functionality of Omnia on Ubuntu 22.04 OS. 1.6 ---- @@ -138,7 +182,7 @@ Omnia v1.6.1 addresses an issue caused due to the unavailability of the dependen 1.4.3 ------ -* XE 9640, R760 XA, R760 XD2 are now supported as control planes or target nodes with NVIDIA H100 accelerators. +* XE9640, R760xa, R760xd2 are now supported as control planes or target nodes with NVIDIA H100 accelerators. * Added ability for split port configuration on NVIDIA Quantum-2-based QM9700 (NVIDIA InfiniBand NDR400 switches). 
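To make the inventory group names used in ``omnia_explained.rst`` concrete, here is a minimal, hypothetical sketch of an Ansible inventory for a cluster that runs both Kubernetes and Slurm; the host names are placeholders, and only the group names come from the documentation: ::

    # head node for the AI (Kubernetes) side
    [kube_control_plane]
    node001

    # Kubernetes compute nodes
    [kube_node]
    node002
    node003

    # at least 3 servers are recommended for etcd failover
    [etcd]
    node001
    node002
    node003

    # head node for the HPC (Slurm) side
    [slurm_control_node]
    node004

    # Slurm compute nodes
    [slurm_node]
    node005
    node006

    # optional gateway node for user access
    [login]
    node007

Without the optional ``login`` group, only users with access to the head node can submit Slurm jobs.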
@@ -152,7 +196,7 @@ Omnia v1.6.1 addresses an issue caused due to the unavailability of the dependen 1.4.2 ------- -* XE9680, R760, R7625, R6615, R7615 are now supported as control planes or target nodes. +* XE9680, R760, R7625, R6615, R7615 are now supported as control planes or target nodes. * Added ability for switch-based discovery of remote servers and PXE provisioning. diff --git a/docs/source/Roles/Network/index.rst b/docs/source/Roles/Network/index.rst index 7527b9b60..10e2c9c3b 100644 --- a/docs/source/Roles/Network/index.rst +++ b/docs/source/Roles/Network/index.rst @@ -5,7 +5,7 @@ In your HPC cluster, connect the Mellanox InfiniBand switches using the Fat-Tree .. note:: - * From Omnia 1.4, the Subnet Manager runs on the target Infiniband switches and not the control plane. + * From Omnia 1.4, the Subnet Manager runs on the target InfiniBand switches and not the OIM. * When ``ib_nic_subnet`` is provided in ``input/provision_config.yml``, the InfiniBand NICs on target nodes are assigned IPv4 addresses within the subnet without user intervention during the execution of ``discovery_provision.yml``. diff --git a/docs/source/Roles/Storage/index.rst b/docs/source/Roles/Storage/index.rst index 7a38cbf94..0254c43c6 100644 --- a/docs/source/Roles/Storage/index.rst +++ b/docs/source/Roles/Storage/index.rst @@ -120,7 +120,7 @@ To open the ports required, use the following steps: cd omnia/storage ansible-playbook storage.yml -i inventory -(Where inventory refers to the `inventory file <../../samplefiles.html>`_ listing kube_control_plane, login_node and compute nodes.) +(Where inventory refers to the `inventory file <../../samplefiles.html>`_ listing the ``kube_control_plane``, ``login_node``, and compute nodes.) .. note:: If a subsequent run of ``storage.yml`` fails, the ``storage_config.yml`` file will be unencrypted. diff --git a/docs/source/Roles/Utils/index.rst b/docs/source/Roles/Utils/index.rst deleted file mode 100644 index f9c360aef..000000000 --- a/docs/source/Roles/Utils/index.rst +++ /dev/null @@ -1,12 +0,0 @@ -Additional utilities -===================== - -The Utilities role allows users to set up certain tasks such as - -.. toctree:: - software_update - KernelUpdate - kernel_param_update - portcleanup - timescaledb_utility - freeipa_installation diff --git a/docs/source/Roles/index.rst b/docs/source/Roles/index.rst index d60b2d62f..33af058e3 100644 --- a/docs/source/Roles/index.rst +++ b/docs/source/Roles/index.rst @@ -8,9 +8,6 @@ Below is a list of all Omnia's features: .. toctree:: - Security/index - Storage/index - Accelerator/index Utils/index Telemetry/index diff --git a/docs/source/SecurityConfigGuide/Preface.rst b/docs/source/SecurityConfigGuide/Preface.rst index 55d8452b6..43773fe2d 100644 --- a/docs/source/SecurityConfigGuide/Preface.rst +++ b/docs/source/SecurityConfigGuide/Preface.rst @@ -13,7 +13,7 @@ THE INFORMATION IN THIS PUBLICATION IS PROVIDED "AS-IS." DELL MAKES NO REPRESENT Scope of the document ---------------------- -This document covers the security features supported by Omnia 1.6.1. +This document covers the security features supported by Omnia 1.7.
Document references -------------------- diff --git a/docs/source/SecurityConfigGuide/ProductSubsystemSecurity.rst b/docs/source/SecurityConfigGuide/ProductSubsystemSecurity.rst index 81394b271..1caba630c 100644 --- a/docs/source/SecurityConfigGuide/ProductSubsystemSecurity.rst +++ b/docs/source/SecurityConfigGuide/ProductSubsystemSecurity.rst @@ -4,7 +4,7 @@ Product and Subsystem Security Security controls map ---------------------- -.. image:: ../images/securityControlsMap.jpg +.. image:: ../images/SecurityControlsMap.png Omnia performs bare metal configuration to enable AI/HPC workloads. It uses Ansible playbooks to perform installations and configurations. iDRAC is supported for provisioning bare metal servers. Omnia installs xCAT to enable provisioning of clusters via PXE in different ways: @@ -14,7 +14,7 @@ Omnia performs bare metal configuration to enable AI/HPC workloads. It uses Ansi - Switch **[default]**: To discover the cluster by routing communication through particular switch ports over SNMPv3, non-admin switch credentials must be provided. -.. note:: IPMI is not required on the control plane. However, compute nodes (iDRACs in the cluster/private network) require IPMI to be enabled for BMC discovery. +.. note:: IPMI is not required on the OIM. However, compute nodes (iDRACs in the cluster/private network) require IPMI to be enabled for BMC discovery. Omnia can be installed via CLI only. Slurm and Kubernetes are deployed and configured on the cluster. FreeIPA or OpenLDAP is installed for providing authentication. @@ -35,6 +35,8 @@ To perform these configurations and installations, a secure SSH channel is estab Authentication --------------- +Omnia adheres to a subset of the specifications of NIST 800-53 and NIST 800-171 guidelines on the OIM and login node. + Omnia does not have its own authentication mechanism because bare metal installations and configurations take place using root privileges. Post the execution of Omnia, third-party tools are responsible for authentication to the respective tool. Cluster authentication tool @@ -97,7 +99,7 @@ Configuring remote connections When setting up BeeGFS client services on the cluster, a connection authentication file is used to maintain the security of the communications between server and client. 1. Generate the connection authentication file (connAuth) and use it to set up BeeGFS meta, server and storage services. - 2. Copy the connAuth file to the control plane and note the filepath. + 2. Copy the connAuth file to the OIM and note the filepath. 3. Populate the value of ``beegfs_secret_storage_filepath`` in ``input/storage_config.yml`` with the filepath from the previous step. Omnia will configure the BeeGFS clients on the cluster using the provided file. BeeGFS is responsible for maintaining and securing connAuthFile. For more information, `click here `_. @@ -329,7 +331,7 @@ Omnia configures the following ports for use by third-party tools installed by O Data security ------------- -Omnia does not store data. The passwords Omnia accepts as input to configure the third party tools are validated and then encrypted using Ansible Vault. Run the following commands routinely on the control plane for the latest security updates. +Omnia does not store data. The passwords Omnia accepts as input to configure the third-party tools are validated and then encrypted using Ansible Vault. Run the following commands routinely on the OIM for the latest security updates.
* For RHEL/Rocky Linux OS diff --git a/docs/source/Tables/ControlPlaneLogs.csv b/docs/source/Tables/ControlPlaneLogs.csv index adf8de3b6..d4175dcbe 100644 --- a/docs/source/Tables/ControlPlaneLogs.csv +++ b/docs/source/Tables/ControlPlaneLogs.csv @@ -3,7 +3,7 @@ Omnia Logs,/var/log/omnia.log,Omnia Log,"This log is configured by Default. This .. Note:: User can view the combined logs for ``discovery_provision.yml`` and ``omnia.yml`` execution in ``omnia.log`` file." Accelerator Logs,/var/log/omnia/accelerator.log,Accelerator Log,This log is configured by Default. -Monitor Logs,/var/log/omnia/monitor.log,Monitor Log,This log is configured by Default. +kubespray_telemetry Logs,/var/log/omnia/kubespray_omnia.log,Kubespray Telemetry Log,This log is configured by Default. Network Logs,/var/log/omnia/network.log,Network Log,This log is configured by Default. Platform Logs,/var/log/omnia/platforms.log,Platform Log,This log is configured by Default. Provision Logs,/var/log/omnia/provision.log,Provision Log,This log is configured by Default. @@ -27,3 +27,13 @@ DNF logs,/var/log/dnf.log,Installation Logs,This log is configured on Rocky OS. BeeGFS Logs,/var/log/beegfs-client.log,BeeGFS Logs,This log is configured on BeeGFS client nodes. Compute Logs,/var/log/xcat/computes.log,Logs system messages from all cluster nodes.,This log is configured by Default. Cluster deployment logs,/var/log/xcat/cluster.log,Logs deployment messages from all cluster nodes.,This log is configured by Default. +Server Network Logs,/var/log/omnia/server_spec_update.log,Server network log,This log is configured by Default. +Local Repository Logs,/var/log/omnia/local_repo.log,Local repository log,This log is configured by Default. +Discovery Logs,/var/log/omnia/discovery.log,Discovery log,This log is configured by Default. +Prepare OIM Logs,/var/log/omnia/prepare_oim.log,Preparing OIM log,This log is configured by Default. +Tools Logs,/var/log/omnia/tools.log,Tools log,This log is configured by Default. +Upgrade Logs,/var/log/omnia/upgrade.log,Upgrade log,This log is configured by Default. +IP Rule Assignment Logs,/var/log/omnia/ip_rule_assignment.log,IP rule assignment log,This log is configured by Default. +Software Update Logs,/var/log/omnia/software_update.log,Software update log,This log is configured by Default. +Performance Profile Logs,/var/log/omnia/performance_profile.log,Performance profile log,This log is configured by Default. +Benchmark Logs,/var/log/omnia/benchmarks.log,Benchmark messages log,This log is configured by Default. diff --git a/docs/source/Tables/FAQ_provision.csv b/docs/source/Tables/FAQ_provision.csv new file mode 100644 index 000000000..b6eb4e381 --- /dev/null +++ b/docs/source/Tables/FAQ_provision.csv @@ -0,0 +1,7 @@ +Potential Cause,Resolution +Disk partition may not have enough storage space per the requirements specified in ``input/provision_config`` (under ``disk_partition``).,Add more space to the server or modify the requirements specified in ``input/provision_config`` (under ``disk_partition``) and run ``discovery_provision.yml`` playbook. +The provided ISO may be corrupt/incomplete.,"Download the ISO again, verify the checksum/download size and run the ``discovery_provision.yml`` playbook." +"Hardware issues such as faulty disk, cable connectivity issues, or firmware issues present on the server","Resolve the hardware issues and PXE boot the node. For example, for a faulty disk, replace the disk or create a RAID1 virtual disk.
In case of a firmware issue, ensure that the latest firmware is applied." +A virtual disk may not have been created,Create a virtual disk and PXE boot the node. +Re-run of the ``discovery_provision.yml`` playbook on the OIM while provisioning is in progress on the remote nodes.,Initiate PXE boot on the remote node after completion of the ``discovery_provision.yml`` playbook execution. +The ``nfs-server`` service may not be running. This failure can occur due to low memory.,Increase memory if it is low and restart the ``nfs-server`` service using the ``systemctl start nfs-server`` command and then PXE boot the node. diff --git a/docs/source/Tables/Metrics_GPU.csv b/docs/source/Tables/Metrics_GPU.csv index a702ffe70..37fe3fe7b 100644 --- a/docs/source/Tables/Metrics_GPU.csv +++ b/docs/source/Tables/Metrics_GPU.csv @@ -1,13 +1,13 @@ Metric Name,Unit,Possible value(s),Potential error cause(s) gpu_temperature:gpu,C,"* Metric value -* No data","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. +* No data","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed. " gpu_utilization,percent,"* Metric value -* No data","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. +* No data","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed. " gpu_utilization:average,percent,"* Metric value -* No data","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. +* No data","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed. " diff --git a/docs/source/Tables/Metrics_Health.csv b/docs/source/Tables/Metrics_Health.csv index 77ad1c196..76c015ed6 100644 --- a/docs/source/Tables/Metrics_Health.csv +++ b/docs/source/Tables/Metrics_Health.csv @@ -9,32 +9,32 @@ beegfs -beegfsstat,"* Unknown * [Fail] The BeeGFS client service has failed or the node is not present in reachable lists of BeeGFS clients." gpu_driver_health:gpu,"* Unknown * Fail -* Pass","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. " +* Pass","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed. " gpu_health_nvlink:gpu [1]_,"* Unknown * Fail -* Pass","* AMD/NVIDIA accelerators are not present. +* Pass","* AMD/NVIDIA/Intel accelerators are not present. * NVLinks are not present. -* GPU drivers are not installed including Rocm and CUDA. +* GPU drivers are not installed. " gpu_health_pcie:gpu,"* Unknown * Fail -* Pass","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. +* Pass","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed. " gpu_health_pmu:gpu,"* Unknown * Fail -* Pass","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. +* Pass","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed. " gpu_health_power:gpu,"* Unknown * Fail -* Pass","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. +* Pass","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed. " gpu_health_thermal:gpu,"* Unknown -* Metric Value","* AMD/NVIDIA accelerators are not present. -* GPU drivers are not installed including Rocm and CUDA. +* Metric Value","* AMD/NVIDIA/Intel accelerators are not present. +* GPU drivers are not installed.
" Kubernetespodsstatus,"* Unknown * Fail diff --git a/docs/source/Tables/Omnia_nodes.csv b/docs/source/Tables/Omnia_nodes.csv new file mode 100644 index 000000000..382fa51fc --- /dev/null +++ b/docs/source/Tables/Omnia_nodes.csv @@ -0,0 +1,10 @@ +Component,Description +Control Plane,"A server (a.k.a. node) where provisioning software is installed and used as the single point to perform system management over the entire cluster. On this node, a database is configured to store the cluster details. Network services (dhcp, tftp, http, etc) are enabled to respond in Operating system deployment." +Cluster Node,Any node provisioned by Omnia apart from the control plane can be termed as a cluster node. +Kube Control Plane,"Servers where kubernetes control plane components (apiserver, scheduler, controller) will run." +Kube Node,Kubernetes node where the pods will run. +etcd,Servers to compose the etcd server in a Kubernetes cluster. You should have at least 3 servers for failover purpose. +Slurm Control Node,Servers where Slurm control node components (slurmctld and slurmdbd) will run. +Slurm Node,Slurm node where the slurmd services will run. +Login Node,A slurm node used to run a slurm job on the cluster. +NFS Node,Used to configure NFS server on a node connected to the PowerVault storage array. diff --git a/docs/source/Tables/Provision_config.csv b/docs/source/Tables/Provision_config.csv index c6c112201..101059dd4 100644 --- a/docs/source/Tables/Provision_config.csv +++ b/docs/source/Tables/Provision_config.csv @@ -3,7 +3,7 @@ ``string`` -Required",Path where user has placed the iso image that needs to be provisioned on target nodes. Accepted files are Rocky8-DVD or RHEL-8.x-DVD (full OS). iso_file_path should contain the provision_os and provision_os_version values in the filename. +Required",Path where user has placed the iso image that needs to be provisioned on target nodes. Accepted files are Rocky8-DVD or RHEL-8.x-DVD (full OS). ``iso_file_path`` should contain the ``provision_os`` and ``provision_os_version`` values in the filename. "**node_name** ``string`` @@ -73,8 +73,8 @@ Optional","* This variable is required when nodes are to be discovered via switc ``JSON list`` Optional","* User defined disk partition applied to remote servers. -* The disk partition desired_capacity has to be provided in MB. -* Valid mount_point values accepted for disk partition are /home, /var, /tmp, /usr, swap. +* The disk partition ``desired_capacity`` has to be provided in MB. +* Valid ``mount_point`` values accepted for disk partition are /home, /var, /tmp, /usr, swap. * Default partition size provided for RHEL/Rocky is /boot: 1024MB, /boot/efi: 256MB and remaining space to / partition. * Default partition size provided for Ubuntu is /boot: 2148MB, /boot/efi: 1124MB and remaining space to / partition. * Values are accepted in the form of JSON list such as: , - { mount_point: ""/home"", desired_capacity: ""102400"" } @@ -85,7 +85,7 @@ Optional","* User defined disk partition applied to remote servers. ``string`` -Required","Timezone to be used during OS provisioning. Available timezones are provided `here <../../Appendices/Timezones.html>`_. +Required","Timezone to be used during OS provisioning. Available timezones are provided `here <../../../Appendices/Timezones.html>`_. Choices: @@ -101,14 +101,14 @@ Choices: Required","Language to be used during OS provisioning. 
-**Default values**: ``en-US``" +**Default value**: ``en-US``" "**default_lease_time** ``integer`` Required","Default lease time for IPs assigned by DHCP. Range: 21600-86400 -**Default values**: ``86400``" +**Default value**: ``86400``" "**ubuntu_kernel_flavor** ``string`` @@ -116,4 +116,13 @@ Required","Default lease time for IPs assigned by DHCP. Range: 21600-86400 Optional","* This is a user-configured variable that is used to specify the kernel flavor on Ubuntu 22.04. * This variable is only supported when the cluster runs Ubuntu 22.04. * To specify this variable, ensure to append the string ``ubuntu_kernel_flavor: generic`` or ``ubuntu_kernel_flavor: hwe`` to the end of the ``input/provision_config.yml`` file. -* **Accepted values**: **generic**, hwe." +* **Accepted values**: ``generic``, ``hwe``." +"**ntp_support** + +``boolean`` + +Required","* The ``ntp_support`` variable denotes whether the cluster will have a Network Time Protocol (NTP) server configured on the OIM or not. +* If ``ntp_support`` is set to ``true``, an NTP server will be configured on the OIM and the time will be synchronized across the cluster nodes during provisioning. +* If ``ntp_support`` is set to ``false``, the NTP server will not be configured and time will not be synchronized across the cluster nodes. +* **Default value**: ``true`` +.. caution:: In a restricted network or proxy environment, it is not recommended to set up an NTP server on the OIM due to potential failures in connecting to the public NTP pools." diff --git a/docs/source/Tables/Provision_creds.csv b/docs/source/Tables/Provision_creds.csv index 481ec82a8..0f4be6857 100644 --- a/docs/source/Tables/Provision_creds.csv +++ b/docs/source/Tables/Provision_creds.csv @@ -12,7 +12,7 @@ Required","* Password set for the root account of target nodes during provisioni Required","* Password set for the postgresDB on target nodes during provisioning. * Length >= 8 characters -* Password must not contain -,\, ',""" +* The password must only contain alphanumeric characters." "**bmc_username** ``string`` @@ -53,4 +53,4 @@ Optional","* Username for Dockerhub account used for Docker logins. ``string`` Optional","* Password for Dockerhub account used for Docker logins. -* This value is mandatory if ``docker_username`` is provided." \ No newline at end of file +* This value is mandatory if ``docker_username`` is provided." 
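Tying the ``provision_config.yml`` parameters documented above together, here is a minimal, hypothetical sketch of the relevant portion of ``input/provision_config.yml``; the ISO path, timezone, and partition entry are illustrative placeholders, while ``language``, ``default_lease_time``, and ``ntp_support`` reflect the documented defaults: ::

    # filename must contain the provision_os and provision_os_version values
    iso_file_path: "/root/RHEL-8.8.0-x86_64-dvd.iso"
    timezone: "GMT"                # pick from the documented timezone list
    language: "en-US"              # default value
    default_lease_time: 86400      # default; accepted range is 21600-86400
    disk_partition:                # desired_capacity is in MB
      - { mount_point: "/home", desired_capacity: "102400" }
    ntp_support: true              # set to false in restricted or proxy environments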
diff --git a/docs/source/Tables/RHEL_space_req.csv b/docs/source/Tables/RHEL_space_req.csv new file mode 100644 index 000000000..d43634713 --- /dev/null +++ b/docs/source/Tables/RHEL_space_req.csv @@ -0,0 +1,23 @@ +Software,Disk utilization (size) +AMD ROCM,4 GB +CUDA,4 GB +OFED,1 GB +FreeIPA,NA [1]_ +OpenLDAP,6 MB +Secure login node,65 MB +NFS,NA [1]_ +BeeGFS,3 MB +Slurm,190 MB +Kubernetes,5 GB +Jupyterhub,3.5 GB +Kubeflow,96 GB +Kserve,2 GB +Pytorch,111 GB +Tensorflow,57 GB +vLLM,77 GB +Telemetry,1 GB +Intel benchmarks,28 GB +AMD benchmarks,NA [1]_ +Utils,200 KB +UCX,3 MB +OpenMPI,17 MB diff --git a/docs/source/Tables/Ubuntu_space_req.csv b/docs/source/Tables/Ubuntu_space_req.csv new file mode 100644 index 000000000..922caacd6 --- /dev/null +++ b/docs/source/Tables/Ubuntu_space_req.csv @@ -0,0 +1,19 @@ +Software,Disk utilization (size) +AMD ROCM,4 GB +CUDA,3.1 GB +Intel Gaudi driver,345 MB +OFED,286 MB +OpenLDAP,5 MB +Secure login node,70 MB +NFS,NA [1]_ +BeeGFS,19 MB +Kubernetes,5 GB +Jupyterhub,3 GB +Kubeflow,107 GB +Kserve,22 GB +Pytorch,145 GB +Tensorflow,68 GB +vLLM,108 GB +Telemetry,4 GB +UCX,3 MB +OpenMPI,17 MB diff --git a/docs/source/Tables/intel_gaudi_metrics.csv b/docs/source/Tables/intel_gaudi_metrics.csv new file mode 100644 index 000000000..49a88928a --- /dev/null +++ b/docs/source/Tables/intel_gaudi_metrics.csv @@ -0,0 +1,29 @@ +Intel Gaudi Metrics +habanalabs_clock_soc_max_mhz +habanalabs_clock_soc_mhz +habanalabs_device_config +habanalabs_ecc_feature_mode +habanalabs_energy +habanalabs_kube_info +habanalabs_memory_free_bytes +habanalabs_memory_total_bytes +habanalabs_memory_used_bytes +habanalabs_pci_link_speed +habanalabs_pci_link_width +habanalabs_pcie_receive_throughput +habanalabs_pcie_replay_count +habanalabs_pcie_rx +habanalabs_pcie_transmit_throughput +habanalabs_pcie_tx +habanalabs_pending_rows_state +habanalabs_pending_rows_with_double_bit_ecc_errors +habanalabs_pending_rows_with_single_bit_ecc_errors +habanalabs_power_default_limit_mW +habanalabs_power_mW +habanalabs_temperature_onboard +habanalabs_temperature_onchip +habanalabs_temperature_threshold_gpu +habanalabs_temperature_threshold_memory +habanalabs_temperature_threshold_shutdown +habanalabs_temperature_threshold_slowdown +habanalabs_utilization diff --git a/docs/source/Tables/local_repo_config.csv b/docs/source/Tables/local_repo_config_rhel.csv similarity index 56% rename from docs/source/Tables/local_repo_config.csv rename to docs/source/Tables/local_repo_config_rhel.csv index 93be0bb72..058134d19 100644 --- a/docs/source/Tables/local_repo_config.csv +++ b/docs/source/Tables/local_repo_config_rhel.csv @@ -7,16 +7,16 @@ Required","* The intended file path for offline repository data. * Ensure the disk partition has enough space. * Ensure that 755 permission is given to ``repo_store_path`` if user intends to use nfs share mount for ``repo_store_path``. -**Default value**: ``""/omnia_repo""``" +**Default value**: ``""/opt/omnia_repo""``" "**user_repo_url** ``JSON List`` Optional","* This variable accepts the repository urls of the user which contains the packages required for the cluster. -* When ``repo_config`` is always, the given list will be configured on the control plane and packages required for cluster will be downloaded into a local repository. +* When ``repo_config`` is always, the given list will be configured on the OIM and packages required for cluster will be downloaded into a local repository.
-* When ``repo_config`` is partial, a local repository is created on the control plane containing packages that are not part of the user's repository. +* When ``repo_config`` is partial, a local repository is created on the OIM containing packages that are not part of the user's repository. * When ``repo_config`` is never, no local repository is created and packages are downloaded on all cluster nodes. @@ -24,19 +24,16 @@ Optional","* This variable accepts the repository urls of the user which contain * 'gpgkey' defines gpgkey for the repository. If 'gpgkey' is omitted then gpgcheck=0 is set for that repository. -* **Sample value**: ``- {url: ""http://crb.com/CRB/x86_64/os/"",gpgkey: ""http://crb.com/CRB/x86_64/os/RPM-GPG-KEY""}`` - - -" +* **Sample value**: ``- {url: ""http://crb.com/CRB/x86_64/os/"",gpgkey: ""http://crb.com/CRB/x86_64/os/RPM-GPG-KEY""}``" "**user_registry** ``JSON List`` Optional","* This variable accepts the registry url along with port of the user which contains the images required for cluster. -* When ``repo_config`` is always, the list given in ``user_registry`` will be configured on the control plane and packages required for cluster will be downloaded into a local repository. If the same repository is available in both the ``user_repo_url`` and the ``user_registry``, the repository will be configured using the values in ``user_registry``. +* When ``repo_config`` is always, the list given in ``user_registry`` will be configured on the OIM and packages required for cluster will be downloaded into a local repository. If the same repository is available in both the ``user_repo_url`` and the ``user_registry``, the repository will be configured using the values in ``user_registry``. -* When ``repo_config`` is partial, a local registry is created on the control plane containing packages that are not part of the ``user_registry``. Images listed in ``user_registry`` are directly configured as a mirror on compute nodes. Compute nodes are expected to connect to the URLs in the ``user_registry`` via http_proxy. +* When ``repo_config`` is partial, a local registry is created on the OIM containing packages that are not part of the ``user_registry``. Images listed in ``user_registry`` are directly configured as a mirror on compute nodes. Compute nodes are expected to connect to the URLs in the ``user_registry`` via http_proxy. * When ``repo_config`` is never, no local registry is created and packages/images are downloaded on all cluster nodes. @@ -47,40 +44,20 @@ Optional","* This variable accepts the registry url along with port of the user * **Sample value**: :: - { host: 10.11.0.100:5001, cert_path: ""/home/ca.crt"" } - - { host: registryhostname.registry.test, cert_path: """" } - - - - - - - -" + - { host: registryhostname.registry.test, cert_path: """" }" "**rhel_os_url** ``string`` -Optional","* Mandatory when ``cluster_os_type`` is rhel in ``softwares_config.json``. -* This variable will be ignored when ``cluster_os_type`` is ubuntu or rocky. +Required","* Mandatory when ``cluster_os_type`` is rhel in ``softwares_config.json``. * User has to provide the code ready builder url in order to download the packages. -* When ``repo_config`` is ""always"", the given ``rhel_os_url`` will be configured in the control plane and packages required for cluster will be downloaded. +* When ``repo_config`` is ""always"", the given ``rhel_os_url`` will be configured in the OIM and packages required for cluster will be downloaded. 
* When repo_config is ""partial"" or ""never"", the packages required for cluster which were coming from ``rhel_repo_url`` will not be downloaded. * The ``rhel_os_url`` is configured via proxy in compute nodes. -* **Example**: If ``cluster_os_type`` is rhel, ``rhel_os_url`` might be ""- {url: ""http://crb.com/CRB/x86_64/os/"", gpgkey: ""http://crb.com/CRB/x86_64/os/RPM-GPG-KEY""}""" -"**ubuntu_os_url** - -``string`` - -Optional","* Mandatory when ``cluster_os_type`` is ubuntu in ``softwares_config.json``. -* This variable will be ignored when ``cluster_os_type`` is rhel or rocky. -* This variables defines the repos to be configured on all the compute nodes. -* When ``repo_config`` is always, partial or never, the given ubuntu_os_url configured via proxy in compute nodes. -* Online ``ubuntu_os_url`` for Ubuntu 22.04 is ""http://in.archive.ubuntu.com/ubuntu"". -* Online ``ubuntu_os_url`` for Ubuntu 20.04 is ""http://archive.ubuntu.com/ubuntu"". - +* **Example**: If ``cluster_os_type`` is rhel, ``rhel_os_url`` might be ""- {url: ""http://crb.com/CRB/x86_64/os/"", gpgkey: ""http://crb.com/CRB/x86_64/os/RPM-GPG-KEY""}"" -* **Example**: When cluster_os_type is Ubuntu 22.04, ``ubuntu_os_url`` should be ""http://in.archive.ubuntu.com/ubuntu""" +.. caution:: Omnia does not support adding a Red Hat subscription URL to the ``rhel_os_url`` parameter. Adding it results in an error during ``local_repo.yml`` playbook execution. For more information, `click here <../../../Troubleshooting/KnownIssues/RHEL/local_repo.html>`_." "**omnia_repo_url_rhel** ``JSON List`` @@ -89,7 +66,7 @@ Required","* A list of all the repo urls from where rpms will be downloaded for * 'url' defines the baseurl for the repository. * 'gpgkey' defines gpgkey for the repository. If 'gpgkey' is omitted, then gpgcheck=0 is set for that repository * This value is not validated by Omnia. Any errors can cause Omnia to fail. -* Ensure that all URLs listed below are reachable to the control plane. +* Ensure that all URLs listed below are reachable to the OIM. **Default value**: :: @@ -109,7 +86,7 @@ Required","* A list of all the repo urls from where rpms will be downloaded for * 'url' defines the baseurl for the repository. * 'gpgkey' defines gpgkey for the repository. If 'gpgkey' is omitted, then gpgcheck=0 is set for that repository * This value is not validated by Omnia. Any errors can cause Omnia to fail. -* Ensure that all URLs listed below are reachable to the control plane. +* Ensure that all URLs listed below are reachable to the OIM. **Default value**: :: @@ -129,26 +106,3 @@ Required","* A list of all the repo urls from where rpms will be downloaded for " -"**omnia_repo_url_ubuntu** - -``JSON List`` - -Required","* A list of all the repo urls from where deb packages will be downloaded for Omnia features on Ubuntu clusters. -* 'url' defines the baseurl for the repository. -* 'gpgkey' defines gpgkey for the repository. If 'gpgkey' is omitted, the repository will be marked as ""trusted"". -* On clusters running Ubuntu, if gpgkeys are not available, public keys are accepted in place of gpgkeys. However, the field public key cannot be left blank. -* This value is not validated by Omnia. Any errors can cause Omnia to fail. -* Ensure that all URLs listed below are reachable to the control plane. 
- - -**Default value**: :: - - - { url: ""https://download.docker.com/linux/ubuntu {{ os_release }} stable"", gpgkey: ""https://download.docker.com/linux/ubuntu/gpg"" } - - { url: ""https://repo.radeon.com/rocm/apt/{{ rocm_version }} {{ os_release }} main"", gpgkey: ""https://repo.radeon.com/rocm/rocm.gpg.key"" } - - { url: ""https://www.beegfs.io/release/beegfs_{{beegfs_version}} {{ os_release }} non-free"", gpgkey: ""https://www.beegfs.io/release/beegfs_{{beegfs_version}}/gpg/GPG-KEY-beegfs"" } - - { url: ""https://repo.radeon.com/amdgpu/{{ amdgpu_version }}/ubuntu {{ os_release }} main"", gpgkey: ""https://repo.radeon.com/rocm/rocm.gpg.key"" } - - { url: ""https://ltb-project.org/debian/openldap25/jammy jammy main"", publickey: ""https://ltb-project.org/documentation/_static/RPM-GPG-KEY-LTB-project"" } - - { url: ""https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /"", gpgkey: ""https://nvidia.github.io/libnvidia-container/gpgkey"" } - - { url: ""http://ppa.launchpad.net/deadsnakes/ppa/ubuntu {{ os_release }} main"", gpgkey: """" } - - { url: ""https://a2o.github.io/snoopy-packages/repo/ubuntu {{ os_release }} stable"", publickey: ""https://a2o.github.io/snoopy-packages/snoopy-packages-key.pub"" } - " diff --git a/docs/source/Tables/local_repo_config_ubuntu.csv b/docs/source/Tables/local_repo_config_ubuntu.csv new file mode 100644 index 000000000..4b9a6d8b2 --- /dev/null +++ b/docs/source/Tables/local_repo_config_ubuntu.csv @@ -0,0 +1,83 @@ +Parameter,Details +"**repo_store_path** + +``string`` + +Required","* The intended file path for offline repository data. +* Ensure the disk partition has enough space. +* Ensure that 755 permission is given to ``repo_store_path`` if user intends to use nfs share mount for ``repo_store_path``. + +**Default value**: ``""/opt/omnia_repo""``" +"**user_repo_url** + +``JSON List`` + +Optional","* This variable accepts the repository urls of the user which contains the packages required for the cluster. + +* When ``repo_config`` is always, the given list will be configured on the OIM and packages required for cluster will be downloaded into a local repository. + +* When ``repo_config`` is partial, a local repository is created on the OIM containing packages that are not part of the user's repository. + +* When ``repo_config`` is never, no local repository is created and packages are downloaded on all cluster nodes. + +* 'url' defines the baseurl for the repository. + +* 'gpgkey' defines gpgkey for the repository. If 'gpgkey' is omitted then gpgcheck=0 is set for that repository. + +* **Sample value**: ``- {url: ""http://crb.com/CRB/x86_64/os/"",gpgkey: ""http://crb.com/CRB/x86_64/os/RPM-GPG-KEY""}``" +"**user_registry** + +``JSON List`` + +Optional","* This variable accepts the registry url along with port of the user which contains the images required for cluster. + +* When ``repo_config`` is always, the list given in ``user_registry`` will be configured on the OIM and packages required for cluster will be downloaded into a local repository. If the same repository is available in both the ``user_repo_url`` and the ``user_registry``, the repository will be configured using the values in ``user_registry``. + +* When ``repo_config`` is partial, a local registry is created on the OIM containing packages that are not part of the ``user_registry``. Images listed in ``user_registry`` are directly configured as a mirror on compute nodes. Compute nodes are expected to connect to the URLs in the ``user_registry`` via http_proxy. 
+ +* When ``repo_config`` is never, no local registry is created and packages/images are downloaded on all cluster nodes. + +* 'host' defines the URL and path to the registry. + +* 'cert_path' defines the absolute path where the security certificates for each registry are stored. If this path is not provided, insecure registries are configured. + +* **Sample value**: :: + + - { host: 10.11.0.100:5001, cert_path: ""/home/ca.crt"" } + - { host: registryhostname.registry.test, cert_path: """" }" +"**ubuntu_os_url** + +``string`` + +Required","* Mandatory when ``cluster_os_type`` is ubuntu in ``softwares_config.json``. +* This variable defines the repos to be configured on all the compute nodes. +* When ``repo_config`` is always, partial or never, the given ubuntu_os_url is configured via proxy in compute nodes. +* Online ``ubuntu_os_url`` for Ubuntu 22.04 is ""http://in.archive.ubuntu.com/ubuntu"". +* Online ``ubuntu_os_url`` for Ubuntu 20.04 is ""http://archive.ubuntu.com/ubuntu"". + + +* **Example**: When cluster_os_type is Ubuntu 22.04, ``ubuntu_os_url`` should be ""http://in.archive.ubuntu.com/ubuntu""" +"**omnia_repo_url_ubuntu** + +``JSON List`` + +Required","* A list of all the repo urls from where deb packages will be downloaded for Omnia features on Ubuntu clusters. +* 'url' defines the baseurl for the repository. +* 'gpgkey' defines gpgkey for the repository. If 'gpgkey' is omitted, the repository will be marked as ""trusted"". +* On clusters running Ubuntu, if gpgkeys are not available, public keys are accepted in place of gpgkeys. However, the field public key cannot be left blank. +* This value is not validated by Omnia. Any errors can cause Omnia to fail. +* Ensure that all URLs listed below are reachable from the OIM. + + +**Default value**: :: + + - { url: ""https://download.docker.com/linux/ubuntu {{ os_release }} stable"", gpgkey: ""https://download.docker.com/linux/ubuntu/gpg"" } + - { url: ""https://repo.radeon.com/rocm/apt/{{ rocm_version }} {{ os_release }} main"", gpgkey: ""https://repo.radeon.com/rocm/rocm.gpg.key"" } + - { url: ""https://www.beegfs.io/release/beegfs_{{beegfs_version}} {{ os_release }} non-free"", gpgkey: ""https://www.beegfs.io/release/beegfs_{{beegfs_version}}/gpg/GPG-KEY-beegfs"" } + - { url: ""https://repo.radeon.com/amdgpu/{{ amdgpu_version }}/ubuntu {{ os_release }} main"", gpgkey: ""https://repo.radeon.com/rocm/rocm.gpg.key"" } + - { url: ""https://ltb-project.org/debian/openldap25/jammy jammy main"", publickey: ""https://ltb-project.org/documentation/_static/RPM-GPG-KEY-LTB-project"" } + - { url: ""https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /"", gpgkey: ""https://nvidia.github.io/libnvidia-container/gpgkey"" } + - { url: ""http://ppa.launchpad.net/deadsnakes/ppa/ubuntu {{ os_release }} main"", gpgkey: """" } + - { url: ""https://a2o.github.io/snoopy-packages/repo/ubuntu {{ os_release }} stable"", publickey: ""https://a2o.github.io/snoopy-packages/snoopy-packages-key.pub"" } + - { url: ""https://vault.habana.ai/artifactory/debian {{ os_release }} main"", publickey: ""https://vault.habana.ai/artifactory/api/gpg/key/public"" } + " diff --git a/docs/source/Tables/network_spec.csv b/docs/source/Tables/network_spec.csv new file mode 100644 index 000000000..8e48cb899 --- /dev/null +++ b/docs/source/Tables/network_spec.csv @@ -0,0 +1,101 @@ +Network Name,Parameters for the network,Parameter details +"``admin_network`` + + + + +.. note:: This name cannot be modified.
This is mandatory for discovery and provisioning of the cluster nodes.","**nic_name** + +``string`` + +Mandatory",The name of the NIC on which the administrative network is accessible to the OIM. +,"**netmask_bits** + +``integer`` + +Mandatory",The 32-bit “mask” used to divide an IP address into subnets and specify the network’s available hosts. +,"**static_range** + +``IP address range`` + +Mandatory",The static range of IPs to be provisioned on target nodes. +,"**dynamic_range** + +``IP address range`` + +Mandatory",The dynamic range of IPs to be provisioned on target nodes. +,"**correlation_to_admin** + +``boolean`` + +Mandatory","* Boolean value used to indicate whether all other networks specified in the file (for example: bmc_network) should be correlated to the admin network. For example, if a target node is assigned the IP xx.yy.0.5 on the admin network, it will be assigned the IP aa.bb.0.5 on the BMC network. +* This value is irrelevant when discovering nodes using a mapping file." +,"**admin_uncorrelated_node_start_ip** + +``IP address`` + +Optional","* If ``correlation_to_admin`` is set to true but correlated IPs are not available on non-admin networks, provide an IP within the ``static_range`` of the admin network that can be used to assign admin static IPs to uncorrelated nodes. If this is empty, then the first IP in the ``static_range`` of the admin network is taken by default. +* This value is irrelevant when discovering nodes using a mapping file." +,"**network_gateway** + +``IP address`` + +Mandatory",The network gateway IP is the assigned IP address of the NIC that connects a local network to external networks. +,"**DNS** + +``IP address`` + +Optional","The IP of the external DNS server. A DNS server, or Domain Name System server, translates domain names into IP addresses that computers use to identify each other on the network." +,"**MTU** + +``integer`` + +Mandatory",Maximum transmission unit (MTU) is a measurement in bytes of the largest data packets that an Internet-connected device can accept. +"``bmc_network`` + + + +.. note:: This name cannot be modified. This is mandatory while using the BMC discovery mechanism.","**nic_name** + +``string`` + +Mandatory",The name of the NIC on which the BMC network is accessible to the OIM. +,"**netmask_bits** + +``integer`` + +Mandatory",The 32-bit “mask” used to divide an IP address into subnets and specify the network’s available hosts. +,"**static_range** + +``IP address range`` + +Mandatory",The static range of IPs to be provisioned on target nodes. +,"**dynamic_range** + +``IP address range`` + +Mandatory",The dynamic range of IPs to be provisioned on target nodes. +,"**reassignment_to_static** + +``boolean`` + +Optional","* If iDRACs are set to DHCP mode and Omnia has assigned the IPs, then the user can reassign the IP within the ``bmc_network`` static range by setting this value to ``true``. +* If this value is not provided or set to ``false`` while the iDRACs are in DHCP mode, they will obtain IPs from the ``bmc_network`` dynamic range, and these IPs will then be converted to static IPs for the iDRACs." +,"**discover_ranges** + +``IP address range`` + +Mandatory","* If some iDRAC IP ranges are reachable from the OIM but are not in ``bmc_network``, then the user can provide those IP ranges here. +* Discovery of a single IP is not possible. User must provide a range. +* User can also provide comma-separated ranges. +.. note:: This is an optional field. User must not remove any of the fields even though it is optional."
+,"**network_gateway** + +``IP address`` + +Mandatory",The network gateway IP is the assigned IP address of the NIC that connects a local network to external networks. +,"**MTU** + +``integer`` + +Mandatory",Maximum transmission unit (MTU) is a measurement in bytes of the largest data packets that an Internet-connected device can accept. diff --git a/docs/source/Tables/omnia_installed_software.csv b/docs/source/Tables/omnia_installed_software.csv index c8acca828..e079ac738 100644 --- a/docs/source/Tables/omnia_installed_software.csv +++ b/docs/source/Tables/omnia_installed_software.csv @@ -1,100 +1,113 @@ OSS Title,License Name/Version #,Description,Version Slurm Workload manager,GNU General Public License,HPC Workload Manager,20.11.9 -Kubernetes Controllers,Apache-2.0,HPC Workload Manager,1.26.12 -MariaDB,GPL 2.0,Relational database used by Slurm,10.3 -Docker CE,Apache-2.0,Docker Service,20.10.2 -NVidia container runtime,Apache-2.0,Nvidia container runtime library,3.4.2 -Python-pip,MIT License,Python Package,24 -kubelet,Apache-2.0,"Provides external, versioned ComponentConfig API types for configuring the kubelet",1.26.12 -kubeadm,Apache-2.0,Provides “fast paths” for creating Kubernetes clusters.,1.26.12 -etcd,Apache-2.0,Relational database used by Kubernetes,3.5.10 -kubectl,Apache-2.0,Command line tool for Kubernetes,1.26.12 -jupyterhub,BSD-3Clause New or Revised License,Multi-user hub,4.0.2 -kubeflow,Apache-2.0,Cloud Native platform for machine learning,1.8.0 -helm,Apache-2.0,Kubernetes Package Manager,3.12.3 -tensorflow,Apache-2.0,Tensorflow is a GPU accelerated tensor computational framework for Machine Learning.,2.1 -horovod,Apache-2.0,Distributed deep learning training framework for Tensorflow,0.21.1 -MPI,3Clause BSD License,HPC library,0.2.3 -spark,Apache-2.0,Unified engine for large-scale data analytics.,2.4.7 -coreDNS,Apache-2.0,DNS server that chains plugins,1.9.3 -cni,Apache-2.0,Networking for Linux containers,1.4.0 -dellemc.openmanage,GNU-General Public License v3.0,"It is a systems management and monitoring application that provides a comprehensive view of the Dell EMC servers, chassis, storage, and network switches on the enterprise network",4.4.0 +Kubernetes Controllers,Apache 2.0,Orchestration tool,1.29.5 +MariaDB,GPL 2.0,Open source relational database used by Slurm,10.3 +Docker CE,Apache 2.0,Docker Service,20.10.20 +NVIDIA container runtime,"Apache 2.0, GPL-3.0, LGPL-3.0",NVIDIA container runtime library,3.4.2 +libnvidia-container,Apache 2.0,NVIDIA container runtime library,1.16.2 +habanalabs-container-runtime,Apache 2.0,"Intel Habana container runtime library + +.. 
note:: This is preliminary code and may change before official release.",1.18.0-524 +habanalabs-k8s-device-plugin,Apache 2.0,HABANA device plugin for Kubernetes,1.18.0-524 +Python-pip,MIT License,Python Package,24.2 +kubelet,Apache 2.0,"Provides external, versioned ComponentConfig API types for configuring the kubelet",1.29.5 +kubeadm,Apache 2.0,Provides “fast paths” for creating Kubernetes clusters.,1.29.5 +kubectl,Apache 2.0,Command line tool for Kubernetes,1.29.5 +etcd,Apache 2.0,Relational database used by Kubernetes,3.5.12 +jupyterhub,"BSD-3Clause ""New"" or ""Revised"" License",Multi-user hub,3.2.0 +kubeflow,Apache 2.0,A repository for Kustomize manifests,1.9.1 +helm,Apache 2.0,Kubernetes Package Manager,3.14.2 +helm-charts,Apache 2.0,The source for Dell Helm charts.,csi-isilon-2.11.0 +mpi-operator,Apache 2.0,"Kubernetes Operator for MPI-based applications (distributed training, HPC, etc.)",0.5.0 +spark-operator,Apache 2.0,Unified analytics engine for large-scale data processing.,v1beta2-1.3.8-3.1.1 +coreDNS,Apache 2.0,DNS server that chains plugins,1.11.1 +cni,Apache 2.0,Networking for Linux containers,3.27.3 +dellemc.openmanage,GNU-General Public License v3.0,"It is a systems management and monitoring application that provides a comprehensive view of the Dell servers, chassis, storage, and network switches on the enterprise network",9.6.0 dellemc.os10,GNU-General Public License v3.0,It provides networking hardware abstraction through a common set of APIs,1.1.1 -community.general ansible,GNU-General Public License v3.0,The collection is a part of the Ansible package and includes many modules and plugins supported by Ansible community which are not part of more specialized community collections.,4.8.7 -cri-o,Apache-2.0,Container service,1.21 -OpenSM,GNU General Public License 2,InfiniBand compliant Subnet Manager.,3.3.21 -omsdk,Apache-2.0,Dell EMC OpenManage Python SDK (OMSDK) is a python library that helps developers and customers to automate the lifecycle management of PowerEdge Servers,1.2.456 -freeipa,GNU General Public License v3,Authentication system used on the login node,4.9 -bind-dyndb-ldap,GNU General Public License v2,LDAP driver for BIND9. 
It allows you to read data and also write data back (DNS Updates) to an LDAP backend.,11.1 -prometheus-slurm-exporter,GNU General Public License v3,Prometheus collector and exporter for metrics extracted from the Slurm resource scheduling system.,0.20.0 -prometheus,Apache-2.0,"Open-source monitoring system with a dimensional data model, flexible query language, efficient time series database and modern alerting approach.",2.23.0 -singularity,BSD License,It is a container platform,3.8.0 -loki,GNU AFFERO GENERAL PUBLIC LICENSE v3.0,Loki is a log aggregation system designed to store and query logs from all your applications and infrastructure,2.4.1 -promtail,Apache-2.0,Promtail is an agent which ships the contents of local logs to a private Grafana Loki instance,2.4.1 -Kube prometheus stack,Apache-2.0,"Kube Prometheus Stack is a collection of Kubernetes manifests, Grafana dashboards, and Prometheus rules.",25.0.0 -mailx,MIT License,mailx is a Unix utility program for sending and receiving mail.,12.5 -xorriso,GPL 3.0,It can load the management information of existing ISO images and it writes the session results to optical media or to filesystem objects,1.4.8 -openshift-restclient-python,Apache-2.0,A python library for interacting with OpenShift via the OpenShift client binary,0.13.2 +community.general ansible,GNU-General Public License v3.0,Ansible Community General Collection,9.4.0 +cri-o,Apache 2.0,Container service,1.21 +OpenSM,GNU-General Public License v2.0,Provides an implementation for InfiniBand Subnet Manager and Administrator to initialize IB hardware.,3.3.21 +omsdk,Apache 2.0,Dell OpenManage Python SDK (OMSDK) is a python library that helps developers and customers to automate the lifecycle management of Dell PowerEdge servers,1.2.518 +freeIPA,GNU-General Public License v3.0,"FreeIPA is an integrated security information management solution combining Linux (Fedora), 389 Directory Server, MIT Kerberos, NTP, DNS, and Dogtag (Certificate System)",1.9.2 +bind-dyndb-ldap,GNU-General Public License v2.0,Dynamic LDAP back-end is a plugin for BIND that provides back-end capabilities to an LDAP database,11.6.4 +prometheus-slurm-exporter,GNU-General Public License v3,Prometheus collector and exporter for metrics extracted from the Slurm resource scheduling system.,0.20.0 +prometheus,Apache 2.0,"Open-source monitoring system with a dimensional data model, flexible query language, efficient time series database and modern alerting approach.",2.23.0 +loki,GNU AFFERO GENERAL PUBLIC LICENSE v3.0,"Loki is a horizontally scalable, highly available, multi-tenant log aggregation system",2.4.1 +promtail,Apache 2.0,Promtail is an agent which ships the contents of local logs to a private Grafana Loki instance,2.4.1 +Kube prometheus stack,Apache 2.0,"Kube Prometheus Stack is a collection of Kubernetes manifests, Grafana dashboards, and Prometheus rules.",62.3.0 +mailx,MIT License,mailx is a Unix utility for sending and receiving mail.,12.5 +openshift,Apache 2.0,An on-premise platform-as-a-service (PAAS) built around Linux containers orchestrated and managed by Kubernetes,0.13.2 +openshift-restclient-python,Apache 2.0,A python library for interacting with OpenShift via the OpenShift client binary,0.13.2 grafana,GNU AFFERO GENERAL PUBLIC LICENSE,Grafana is the open source analytics & monitoring solution for every database.,8.3.2 -kubernetes.core,GPL 3.0,Performs CRUD operations on K8s objects,2.2.3 -community.grafana,GPL 3.0,Technical Support for open source grafana.,1.3 -activemq,Apache-2.0,"Most popular 
multi protocol, message broker.",5.10.0 +kubernetes.core,GPL 3.0,Performs CRUD operations on K8s objects,5.0.0 +community.grafana,GPL 3.0,Technical Support for open source grafana.,2.1.0 +activemq,Apache 2.0,"Most popular open source, multi-protocol, and Java-based message broker.",5.10.0 golang,BSD-3-Clause License,"Go is a statically typed, compiled programming language designed at Google.",1.17 mysql,GPL 2.0,MySQL is an open-source relational database management system.,8 postgresSQL,PostgresSQL License,"PostgreSQL, also known as Postgres, is a free and open-source relational database management system emphasizing extensibility and SQL compliance.","10.15, 12, 10.21" -idrac-telemetry-reference tools,Apache-2.0,Reference toolset for PowerEdge telemetry metric collection and integration with analytics and visualization solutions.,0.1 +idrac-telemetry-reference tools,Apache 2.0,Reference toolset for PowerEdge telemetry metric collection and integration with analytics and visualization solutions.,0.1 nsfcac/grafana-plugin,MIT License,Machine Learning Framework,2.1.0 jansson,MIT License,"C library for encoding, decoding and manipulating JSON data",2.14 libjwt,Mozilla Public License-2.0 License,JWT C Library,1.12.0 -389-ds,GPL,Light weight directory access protocol,2.4.7 +389-ds,GPL,Light weight directory access protocol,1.4.3.32 apparmor,GNU General Public License,Controls access based on paths of the program files,3.0.3 snoopy,GPL 2.0,Snoopy is a small library that logs all program executions on your Linux/BSD system,2.4.15 -timescaledb,Apache-2.0,"TimescaleDB is a time-series SQL database providing fast analytics, scalability, with automated data management on a proven storage engine.",2.6.0 +timescaledb,Apache 2.0,"TimescaleDB is a time-series SQL database providing fast analytics, scalability, with automated data management on a proven storage engine.",2.6.0 Beegfs-Client,GPLv2,BeeGFS is a high-performance parallel file system with easy management. 
The distributed metadata architecture of BeeGFS has been designed to provide the scalability and flexibility that is required to run today’s and tomorrow’s most demanding HPC applications.,7.4.2
Lmod,MIT License,"An Environment Module System based on Lua, Reads TCL Modules, Supports a Software Hierarchy","8.4.28-1.1, 8.2.7"
Lua,MIT License,"Lua is a lightweight, high-level, multi-paradigm programming language designed primarily for embedded use in applications.",5.3.4
-ansible posix,GNU General Public License,Ansible Collection targeting POSIX and POSIX-ish platforms.,1.4.0
-xCAT,Eclipse Public License 1.0,Provisioning tool ,2.16.5
-CUDA Toolkit,NVIDIA License,The NVIDIA® CUDA® Toolkit provides a development environment for creating high performance GPU-accelerated applications.,12.3.2 (Default)
-MLNX-OFED,BSD License,MLNX_OFED is an NVIDIA tested and packaged version of OFED that supports two interconnect types using the same RDMA (remote DMA) and kernel bypass APIs called OFED verbs – InfiniBand and Ethernet.,24.01-0.3.3.1 (Default)
-ansible pylibssh,LGPL 2.1,Python bindings to client functionality of libssh specific to Ansible use case.,1.0.0
-perl-DBD-Pg,GNU General Public License v3,DBD::Pg - PostgreSQL database driver for the DBI module,3.7.4
-ansible.utils ansible collection,GPL 3.0,"Ansible Collection with utilities to ease the management, manipulation, and validation of data within a playbook",4.8.7
+ansible posix,GNU General Public License,An Ansible Collection of modules and plugins that target POSIX UNIX/Linux and derivative Operating Systems.,1.4.0
+xCAT,Eclipse Public License 1.0,"xCAT is an open-source tool for automating deployment, scaling, and management of bare metal servers and virtual machines",2.16.5
+CUDA Toolkit,NVIDIA License,CUDA (or Compute Unified Device Architecture) is a parallel computing platform and application programming interface (API) that allows software to use certain types of graphics processing units (GPUs) for general purpose processing,12.3.2
+MLNX_OFED,BSD License,"MLNX_OFED is based on the OpenFabrics Enterprise Distribution (OFED™), an open-source software for RDMA and kernel bypass applications provided by Open Fabric Alliance",24.01-0.3.3.1
+ansible pylibssh,LGPL 2.1,Python bindings specific to Ansible use case for libssh,1.2.2
+perl-DBD-Pg,GNU-General Public License v3.0,Perl module that works with the DBI module to provide access to PostgreSQL databases.,3.7.4
+ansible.utils ansible collection,GPL 3.0,"Ansible Collection with utilities to ease the management, manipulation, and validation of data within a playbook",2.5.2
pandas,BSD-3-Clause License,"pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.",2.2.1
python3-netaddr,BSD License,A Python library for representing and manipulating network addresses.,0.8.0
-psycopg2-binary,GNU Lesser General Public License,Psycopg is the most popular PostgreSQL database adapter for the Python programming language.,2.9.5
-python.requests,Apache-2.0,Makes HTTP requests simpler and more human-friendly.,2.28.1
-apptainer,BSD 3-Clause License,Apptainer: Application containers for Linux,1.2
-metallb,Apache-2.0,A network load-balancer implementation for Kubernetes using standard routing protocols,1.3
-smartmontools,GNU GPL,Official read only mirror of the smartmontools project SVN,7.4
-pyinstaller,"GNU General Public License, Apache 2.0",Freeze (package) Python programs into stand-alone executables,6
-psutil,BSD 
3-Clause License,Cross-platform lib for process and system monitoring in Python,05:09:05 +psycopg2-binary,GNU Lesser General Public License,Psycopg is a PostgreSQL database adapter for the Python programming language.,2.9.5 +python.requests,Apache 2.0,"A simple, yet elegant, HTTP library.",2.28.1 +apptainer,BSD 3-Clause License,Apptainer creates application containers for Linux,1.2 +metallb,Apache 2.0,A network load-balancer implementation for Kubernetes using standard routing protocols,0.13.9 +smartmontools,GNU GPL,Official read only mirror of the smartmontools project SVN,7.1.1 +pyinstaller,"GNU General Public License, Apache 2.0",Freeze (package) Python programs into stand-alone executables,6.11.1 +psutil,BSD 3-Clause License,Cross-platform library for process and system monitoring in Python,6.1.1 +community.crypto ansible collection,GNU General Public License v3.0,The community.crypto collection for Ansible,2.12.0 LDAP ToolBox,GNU General Public License v3.0 ,OpenLDAP server installation,2.5.16 -squid - ubuntu 20.04,Apache License 2.0,http proxy server,4.1 -squid - ubuntu 22.04,Apache License 2.0,http proxy server,5.7 -squid,Apache License 2.0,http proxy server,4.15 -jq,MIT License,Software for verifying syntax of json files,1.7.1 -community.crypto ansible collection,GNU General Public License v3.0,The community.crypto collection for Ansible.,2.14.0 -pyopenssl,Apache License 2.0,pyOpenSSL is a rather thin wrapper around (a subset of) the OpenSSL library.,24.1.0 -urllib3,MIT License,"urllib3 is a powerful, user-friendly HTTP client for python",2.2.1 -containerd,Apache License 2.0,An open and reliable container runtime,1.6.13-3.1 -nerdctl,Apache License 2.0,Omnia Local Registry,1.5.0 -kubespray collection,Apache License 2.0,Deploy a Production Ready Kubernetes Cluster,v2.23.2 -cryptography,"Apache Software License, BSD License (Apache-2.0 OR BSD-3-Clause)",cryptography is a package designed to expose cryptographic primitives and recipes to Python developers.,41.0.1 +squid - ubuntu 20.04,Apache 2.0,http proxy server,4.1 +squid - ubuntu 22.04,Apache 2.0,http proxy server,5.7 +squid,Apache 2.0,http proxy server,4.15 +jq,MIT License,Software for verifying syntax of json files,1.6.6 +pyopenssl,Apache 2.0,pyOpenSSL is a rather thin wrapper around (a subset of) the OpenSSL library.,21.0.0 +urllib3,MIT License,"urllib3 is a powerful, user-friendly HTTP client for python",1.26.5 +containerd,Apache 2.0,An open and reliable container runtime,1.6.13-3.1 +nerdctl,Apache 2.0,"contaiNERD CTL - Docker-compatible CLI for containerd, with support for Compose, Rootless, eStargz, OCIcrypt, IPFS, etc.",1.7.4 +kubespray collection,Apache 2.0,Deploy a Production Ready Kubernetes Cluster,2.25 +cryptography,Apache 2.0,cryptography is a package designed to expose cryptographic primitives and recipes to Python developers.,44.0.0 jmespath,MIT License,JMESPath is a query language for JSON.,1.0.1 -MarkupSafe,BSD-3-Clause,Safely add untrusted strings to HTML/XML markup.,2.1.3 -netaddr,BSD License,A Python library for representing and manipulating network addresses.,0.8.0 -pbr,Apache Software License,Python Build Reasonableness. Mirror of code maintained at opendev.org.,5.11.1 -ruamel.yaml,MIT License,It is a YAML 1.2 loader/dumper package for Python.,0.17.31 +MarkupSafe,"BSD 3-Clause ""New"" or ""Revised"" License",Safely add untrusted strings to HTML/XML markup.,2.1.5 +netaddr,BSD License,A Python library for representing and manipulating network addresses.,1.2.1 +pbr,Apache 2.0,Python Build Reasonableness. 
Mirror of code maintained at opendev.org.,6.0.0 +ruamel.yaml,MIT License,It is a YAML 1.2 loader/dumper package for Python.,0.18.6 ruamel.yaml.clib,MIT License,It is the C based reader/scanner and emitter for ruamel.yaml,0.2.8 -Kserve,Apache License 2.0,"Deploys Kserve as inference platform on K8s cluster. Kserve installation installs dependencies istio1.17.0, certificate manager (1.13.0) and Knative(1.11.0) ",0.11.2 -pyarrow,Apache License 2.0,Discovery provisioning,15.0.2 -vllm,Apache License 2.0,A high-throughput and memory-efficient inference and serving engine for LLMs,0.4.0 -ucx,BSD 3 Clause License,Benchmarks tools,1.15.0 -openmpi,3-clause BSD license,Benchmarks tools,4.1.6 -Pytorch,Apache-2.0,PyTorch is a GPU accelerated tensor computational framework for Machine Learning.,latest image -Pytorch Nvidia,NVIDIA DEEP LEARNING CONTAINER LICENSE,"The PyTorch NGC Container is optimized for GPU acceleration, and contains a validated set of libraries that enable and optimize GPU performance",23.12-py3 -Tensorflow Nvidia,NVIDIA DEEP LEARNING CONTAINER LICENSE,Tensorflow is a GPU accelerated tensor computational framework for Machine Learning.,23.12-tf2-py3 -Kustomize,Apache License 2.0,Customization of kubernetes YAML configurations,5.0.3 -xilinx-device-plugin,Apache License 2.0,he AMD-Xilinx device plugin for Kubernetes is a Daemonset deployed on the Kubernetes(k8s) cluster,1.3.0 -nfs-subdir-external-provisioner,Apache License 2.0,Dynamic sub-dir volume provisioner on a remote NFS server.,4.0.18 +Kserve,Apache 2.0,"Deploys Kserve as inference platform on K8s cluster. Kserve installation installs dependencies istio1.17.0, certificate manager (1.13.0) and Knative(1.11.0) ",0.13.0 +pyarrow,Apache 2.0,Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing,17.0.0 +vLLM NVIDIA,Apache 2.0,A high-throughput and memory-efficient inference and serving engine for LLMs,0.4.0 +vLLM AMD,Apache 2.0,vLLM is a toolkit and library for large language model (LLM) inference and serving.,0.2.4 +vLLM Intel,Apache 2.0,vLLM is a toolkit and library for large language model (LLM) inference and serving.,0.5.3.post1+Gaudi-1.18.0 +ucx,BSD 3 Clause License,Unified Communication X,1.15.0 +openmpi,BSD 3 Clause License,Open MPI main development repository,4.1.6 +Pytorch CPU/AMD,BSD 3 Clause License,Tensors and Dynamic neural networks in Python with strong GPU acceleration,latest +Pytorch NVIDIA,NVIDIA DEEP LEARNING CONTAINER LICENSE,"The PyTorch NGC Container is optimized for GPU acceleration, and contains a validated set of libraries that enable and optimize GPU performance",23.12-py3 +Pytorch Intel ,HABANA LABS SYNAPSEAI SOFTWARE SUITE OUTBOUND SOFTWARE LICENSE AGREEMENT,The Intel Gaudi software is integrated with PyTorch and optimized for Intel Gaudi AI accelerators.,2.4.0 +Tensorflow AMD/CPU,Apache 2.0,Machine Learning framework.,latest +Tensorflow NVIDIA,Apache 2.0,An Open Source Machine Learning Framework for Everyone,23.12-tf2-py3 +Kustomize,Apache 2.0,Customization of kubernetes YAML configurations,5.0.3 +xilinx-device-plugin,Apache 2.0,The AMD-Xilinx device plugin for Kubernetes is a Daemonset deployed on the Kubernetes(k8s) cluster,1.3.0 +nfs-subdir-external-provisioner,Apache 2.0,Dynamic sub-dir volume provisioner on a remote NFS server.,4.0.18 +calico,Apache 2.0,"Calico is an open-source networking and security solution for containers and microservices, widely used in Kubernetes for efficient networking and robust security.",3.27.3 +flannel,Apache 2.0,Flannel is a 
simple and easy-to-configure network fabric for containers. It is part of the Kubernetes ecosystem and primarily used to provide a network overlay that connects containers across multiple nodes.",0.22.0
+csi-powerscale,Apache 2.0,CSI Driver for Dell PowerScale devices,2.11.0
+external-snapshotter,Apache 2.0,Sidecar container that watches Kubernetes Snapshot CRD objects and triggers CreateSnapshot/DeleteSnapshot against a CSI endpoint.,8.0.1
+Prometheus Gaudi metric exporter,Apache 2.0,This is a Prometheus exporter implementation that enables the collection of Intel Gaudi AI accelerator metrics in a container cluster for compute workload.,1.18.0-524
+DeepSpeed Intel,Apache 2.0,"DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.",v2Beta1
diff --git a/docs/source/Tables/performance_config.csv b/docs/source/Tables/performance_config.csv
new file mode 100644
index 000000000..ecaac6293
--- /dev/null
+++ b/docs/source/Tables/performance_config.csv
@@ -0,0 +1,34 @@
+Parameters,Details
+"**performance_profile**
+
+String
+
+Required","The field captures the performance profile that will be configured on the nodes. To view a brief summary of all the available profiles, use the ``tuned-adm list`` command.
+
+
+
+**Example**: ``accelerator-performance``
+"
+"**performance_profile_plugin**
+
+String
+
+Optional","If you want to customize the default performance profiles, you can do so by adding multiple plugins/parameters or by altering the default values present here. If there is no need to modify the profile, the ``performance_profile_plugin`` section can be left as it is. By default, it takes the pre-defined plugin values placed in the profile.
+
+
+
+**Example**: If you have selected ``accelerator-performance`` as your performance profile, then you can alter the values of parameters such as ``force_latency`` and ``vm.swappiness``. If required, you can also add other supported parameters such as ``vm.nr_hugepages`` with their desired values.
+
+"
+"**reboot_required**
+
+String
+
+Required","The value provided for this field denotes whether the node should be rebooted after performance profile configuration.
+
+**Choices**:
+
+``yes``
+
+``no`` -> default
+"
diff --git a/docs/source/Tables/scheduler_k8s.csv b/docs/source/Tables/scheduler_k8s_rhel.csv
similarity index 74%
rename from docs/source/Tables/scheduler_k8s.csv
rename to docs/source/Tables/scheduler_k8s_rhel.csv
index 98c2e9073..a55d50b10 100644
--- a/docs/source/Tables/scheduler_k8s.csv
+++ b/docs/source/Tables/scheduler_k8s_rhel.csv
@@ -10,8 +10,7 @@
   * ``""calico""`` <- default
   * ``""flannel""``
-
-.. note:: While setting up Kubernetes for the RoCE NIC, ensure that the value for this parameter is always set to ``flannel``. For more information, `click here `_. "
+ "
 "**pod_external_ip_range**
 
  ``string``
 
@@ -41,3 +40,21 @@
  Optional ","* Kubernetes pod network CIDR for internal network. When used, it will assign IP addresses from this range to individual pods.
* This network must be unused in your network infrastructure.
* **Default value**: ""10.233.64.0/18"""
+"**topology_manager_policy**
+
+ ``string``
+
+ Optional ","* Kubernetes Topology manager policies.
+* Accepted values are ``none``, ``best-effort``, ``restricted``, or ``single-numa-node``.
+* **Default value**: ``none``
+
+**Example**: ``topology_manager_policy: ""none""``"
+"**topology_manager_scope**
+
+ ``string``
+
+ Optional ","* Kubernetes Topology manager scope. 
+* Accepted values are ``container`` or ``pod``. +* **Default value**: ``container`` + +**Example**: ``topology_manager_scope: ""container""``" diff --git a/docs/source/Tables/scheduler_k8s_ubuntu.csv b/docs/source/Tables/scheduler_k8s_ubuntu.csv new file mode 100644 index 000000000..14ae7b1e4 --- /dev/null +++ b/docs/source/Tables/scheduler_k8s_ubuntu.csv @@ -0,0 +1,71 @@ +Variables,Details +"**k8s_cni** + + ``string`` + + Required ","* Kubernetes SDN network. +* Required when ``scheduler_type: ""k8s""`` + + Choices: + + * ``""calico""`` <- default + * ``""flannel""`` + +.. note:: While setting up Kubernetes for the RoCE NIC, ensure that the value for this parameter is always set to ``flannel``. For more information, `click here <../AdvancedConfigurationsUbuntu/k8s_plugin_roce_nic.html>`_. " +"**pod_external_ip_range** + + ``string`` + + Required ","* These addresses will be used by Loadbalancer for assigning External IPs to K8s services +* Make sure the IP range is not assigned to any node in the cluster. +* **Example**: ""10.11.0.100-10.11.0.150"" , ""10.11.0.0/16"" " +"**ansible_config_file_path** + + ``string`` + + Required ","* Path to directory hosting ansible config file (ansible.cfg file) +* This directory is on the host running ansible, if ansible is installed using dnf +* If ansible is installed using pip, this path should be set +* **Default value**: ``/etc/ansible`` " +"**k8s_service_addresses** + + ``string`` + + Optional ","* Kubernetes internal network for services. +* This network must be unused in your network infrastructure. +* **Default value**: ""10.233.0.0/18"" " +"**k8s_pod_network_cidr** + + ``string`` + + Optional ","* Kubernetes pod network CIDR for internal network. When used, it will assign IP addresses from this range to individual pods. +* This network must be unused in your network infrastructure. +* **Default value**: ""10.233.64.0/18""" +"**topology_manager_policy** + + ``string`` + + Optional ","* Kubernetes Topology manager policies. +* Accepted values are ``none``, ``best-effort``, ``restricted``, or ``single-numa-node``. +* **Default value**: ``none`` + +**Example**: ``topology_manager_policy: ""none""``" +"**topology_manager_scope** + + ``string`` + + Optional ","* Kubernetes Topology manager scope. +* Accepted values are ``container`` or ``pod``. +* **Default value**: ``container`` + +**Example**: ``topology_manager_scope: ""container""``" +"**run_intel_gaudi_tests** + + ``boolean`` + + Optional ","* If value is ``true``, Omnia performs validation tests on the nodes containing Intel Gaudi accelerators. It uses the `Habana Collective Communications Library (HCCL) `_ and `Qualification tool `_. +* If value is ``false``, Omnia does not perform any validation tests on the nodes. +* Accepted values are ``true`` or ``false``. +* **Default value**: ``false`` + +**Example**: ``run_intel_gaudi_tests: ""false""``" diff --git a/docs/source/Tables/security_config.csv b/docs/source/Tables/security_config.csv index cb5ad1f77..ebfb393cd 100644 --- a/docs/source/Tables/security_config.csv +++ b/docs/source/Tables/security_config.csv @@ -1,7 +1,9 @@ Parameter,Details "domain_name - ``string`` - Required ","* Sets the intended domain name. + + ``string`` + + Required ","* Sets the intended domain name. 
* If dc=omnia,dc=test, Provide ``omnia.test``
 
 * If dc=dell,dc=omnia,dc=com Provide ``dell.omnia.com``
 
diff --git a/docs/source/Tables/software_config.csv b/docs/source/Tables/software_config_rhel.csv
similarity index 59%
rename from docs/source/Tables/software_config.csv
rename to docs/source/Tables/software_config_rhel.csv
index 5ccd9e763..606d696ab 100644
--- a/docs/source/Tables/software_config.csv
+++ b/docs/source/Tables/software_config_rhel.csv
@@ -3,28 +3,25 @@
 
 ``string``
 
-Required","* The operating system running on the cluster (``rhel``, ``rocky``, and ``ubuntu``).
-**Default value**: ``ubuntu``."
+Required","* The operating system running on the cluster.
+* **Value**: ``rhel``, ``rocky``"
 "**cluster_os_version**
 
 ``string``
 
 Required","* The OS Version that will be provisoned on compute nodes.
-* For RHEL, the accepted values are 8.6, 8.7, and 8.8.
-* For Rocky, the accepted values are 8.6, 8.7, and 8.8.
-* For Ubuntu, the accepted values are 20.04, 22.04.
-* **Default value**: 22.04"
+* For RHEL and Rocky Linux, the accepted value is ``8.8``."
 "**repo_config**
 
 ``string``
 
 Required","* The type of offline configuration user needs.
-* When the value is set to ``always``, Omnia creates a local repository/registry on the Control plane hosting all the packages/images required for the cluster.
-* When the value is set to ``partial``, Omnia creates a local repository/registry on the Control plane hosting all the packages/images except those listed in the ``user_repo_url/user_registry`` in ``input/local_repo_config.yml``.
+* When the value is set to ``always``, Omnia creates a local repository/registry on the OIM hosting all the packages/images required for the cluster.
+* When the value is set to ``partial``, Omnia creates a local repository/registry on the OIM hosting all the packages/images except those listed in the ``user_repo_url/user_registry`` in ``input/local_repo_config.yml``.
 * When the value is set to ``never``, Omnia does not create a local repository/registry. All the packages/images are directly downloaded on the cluster.
 
 .. note::
-    * After ``local_repo.yml`` has run, the value of ``repo_config`` in ``input/software_config.json`` cannot be updated without running the `control_plane_cleanup.yml <../CleanUpScript.html>`_ script first.
-    * Irrespective of the value of ``repo_config``, all local repositories that are not available as images, debian packages, or RPMs will be downloaded and configured locally on the control plane. Additionally, AMD GPU drivers, CUDA, and OFED are downloaded by default.
+    * After ``local_repo.yml`` has run, the value of ``repo_config`` in ``input/software_config.json`` cannot be updated without running the `oim_cleanup.yml <../../Maintenance/cleanup.html>`_ script first.
+    * Irrespective of the value of ``repo_config``, all local repositories that are not available as images, debian packages, or RPMs will be downloaded and configured locally on the OIM. Additionally, AMD GPU drivers, CUDA, and OFED are also downloaded by default if they are mentioned in ``input/software_config.json``.
 
 * **Accepted values**:
 
@@ -36,11 +33,11 @@
 
 ``JSON list``
 
-Required","* A JSON list of required software and (optionally) the software revision.
+Required","* A JSON list of required software and (optionally) the software version.
 * The following software should be listed with a version in the list: BeeGFS, AMD GPU, Kubernetes, CUDA, OFED, UCX, and ROCm. 
* A minimum of one software should be provided in the list for ``local_repo.yml`` to execute correctly.
* The ``_software_config.json`` will have the basic softwares present in it. To add additional software stacks, add the software under ``input/software_config.json``.
-* For the list of all applicable softwares based on your , refer the templates at ``examples/template__software_config.json``. Supported ```` are RHEL, Rocky, and Ubuntu.
+* For the list of all applicable softwares based on your ````, refer to the templates at ``examples/template__software_config.json``. For example, ``examples/template_rhel_software_config.json``.
 
 .. note:: The accepted names for software is taken from ``input/config//``.
 "
diff --git a/docs/source/Tables/software_config_ubuntu.csv b/docs/source/Tables/software_config_ubuntu.csv
new file mode 100644
index 000000000..aa2003eb2
--- /dev/null
+++ b/docs/source/Tables/software_config_ubuntu.csv
@@ -0,0 +1,53 @@
+Parameter,Details
+"**cluster_os_type**
+
+``string``
+
+Required","* The operating system running on the cluster.
+
+* **Value**: ``ubuntu``."
+"**cluster_os_version**
+
+``string``
+
+Required","* The OS Version that will be provisioned on compute nodes.
+
+* For Ubuntu, the accepted value is 22.04.
+
+* **Default value**: 22.04"
+"**repo_config**
+
+``string``
+
+Required","* The type of offline configuration user needs.
+
+* When the value is set to ``always``, Omnia creates a local repository/registry on the OIM hosting all the packages/images required for the cluster.
+
+* When the value is set to ``partial``, Omnia creates a local repository/registry on the OIM hosting all the packages/images except those listed in the ``user_repo_url/user_registry`` in ``input/local_repo_config.yml``.
+
+* When the value is set to ``never``, Omnia does not create a local repository/registry. All the packages/images are directly downloaded on the cluster.
+
+.. note::
+    * After ``local_repo.yml`` has run, the value of ``repo_config`` in ``input/software_config.json`` cannot be updated without running the `oim_cleanup.yml <../../Maintenance/cleanup.html>`_ script first.
+    * Irrespective of the value of ``repo_config``, all local repositories that are not available as images, debian packages, or RPMs will be downloaded and configured locally on the OIM. Additionally, AMD GPU drivers, Intel Gaudi drivers, CUDA, and OFED are downloaded by default if they are mentioned in ``input/software_config.json``.
+
+* **Accepted values**:
+
+    * ``always``
+    * ``partial`` <- Default
+    * ``never`` "
+"**softwares**
+
+``JSON list``
+
+Required","* A JSON list of required software and (optionally) the software version.
+
+* The following software should be listed with a version in the list: BeeGFS, AMD GPU, Intel Gaudi, Kubernetes, CUDA, OFED, UCX, and ROCm.
+
+* A minimum of one software should be provided in the list for ``local_repo.yml`` to execute correctly.
+
+* The ``_software_config.json`` will have the basic softwares present in it. To add additional software stacks, add the software under ``input/software_config.json``.
+
+* For the list of all applicable softwares based on your ````, refer to the templates at ``examples/template__software_config.json``. For example, ``examples/template_ubuntu_software_config.json``.
+
+.. note:: The accepted names for software are taken from ``input/config//``."
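As a quick cross-check of the two tables above, a minimal ``input/software_config.json`` for an Ubuntu cluster might look like the sketch below. The entry names and versions are illustrative only (``k8s`` and ``intelgaudi`` are assumed names; the accepted names come from ``input/config/<cluster_os_type>/<cluster_os_version>/`` and the templates under ``examples/``): ::

    {
        "cluster_os_type": "ubuntu",
        "cluster_os_version": "22.04",
        "repo_config": "partial",
        "softwares": [
            {"name": "k8s", "version": "1.29.5"},
            {"name": "intelgaudi", "version": "1.18.0-524"}
        ]
    }

Check ``examples/template_ubuntu_software_config.json`` for the authoritative entry names before copying this sketch.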
diff --git a/docs/source/Tables/storage_config.csv b/docs/source/Tables/storage_config.csv index 73806d4da..40bb1f5d4 100644 --- a/docs/source/Tables/storage_config.csv +++ b/docs/source/Tables/storage_config.csv @@ -1,13 +1,14 @@ -Parameters,Details +Variables,Details "**nfs_client_params** ``JSON List`` Required ","* This JSON list contains all parameters required to set up NFS. +* Indicates if ``k8s_share`` or ``slurm_share`` is true. * For a bolt-on set up where there is a pre-existing NFS export, set ``nfs_server`` to ``false``. -* When ``nfs_server`` is set to ``true``, an NFS share is created on the control plane for access by all cluster nodes. -* For more information on the different kinds of configuration available, `click here. `_" -"**beegfs_rdma_support** +* When ``nfs_server`` is set to ``true``, an NFS share is created on the OIM for access by all cluster nodes. +* For more information on the different kinds of configuration available, `click here. `_" +"beegfs_rdma_support ``boolean`` Optional","This variable is used if user has RDMA-capable network hardware (e.g., InfiniBand) @@ -16,27 +17,27 @@ Choices: * ``false`` <- Default * ``true``" -"**beegfs_ofed_kernel_modules_path** +"beegfs_ofed_kernel_modules_path ``string`` Optional ","* The path where separate OFED kernel modules are installed. * Ensure that the path provided here exists on all target nodes. **Default value**: ``""/usr/src/ofa_kernel/default/include""`` " -"**beegfs_mgmt_server** +"beegfs_mgmt_server ``string`` Required ","BeeGFS management server IP. .. note:: The provided IP should have an explicit BeeGFS management server running . " -"**beegfs_mounts** +"beegfs_mounts ``string`` Optional ","Beegfs-client file system mount location. If ``storage_yml`` is being used to change the BeeGFS mounts location, set ``beegfs_unmount_client`` to ``true``. **Default value**: ""/mnt/beegfs"" " -"**beegfs_unmount_client** +"beegfs_unmount_client ``boolean`` Optional ","Changing this value to true will unmount running instance of BeeGFS client and should only be used when decommisioning BeeGFS, changing the mount location or changing the BeeGFS version. @@ -47,7 +48,7 @@ Choices: * ``true`` " -"**beegfs_version_change** +"beegfs_version_change ``boolean`` Optional ","Use this variable to change the BeeGFS version on the target nodes. 
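To make the ``nfs_client_params`` JSON list above concrete, here is a minimal single-entry sketch for ``input/storage_config.yml``. Only ``nfs_server``, ``k8s_share``, and ``slurm_share`` are named in the table; the remaining field names and all values are illustrative assumptions: ::

    nfs_client_params:
      - server_ip: 10.5.0.101                        # assumed field: IP of the NFS server
        server_share_path: "/mnt/omnia_share"        # assumed field: exported path on the server
        client_share_path: "/mnt/omnia_share"        # assumed field: mount point on the cluster nodes
        client_mount_options: "nosuid,rw,sync,hard"  # assumed field: mount options
        nfs_server: true     # true: the share is created on the OIM; false: bolt-on, pre-existing export
        slurm_share: true    # the export is used by Slurm
        k8s_share: true      # the export is used by Kubernetes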
diff --git a/docs/source/Tables/supported-nics.csv b/docs/source/Tables/supported-nics.csv
index 14e68ccae..ef987e5c7 100644
--- a/docs/source/Tables/supported-nics.csv
+++ b/docs/source/Tables/supported-nics.csv
@@ -1,21 +1,27 @@
-NIC
-Intel(R) Ethernet 10G 4P X710/I350 rNDC
-Intel(R) Ethernet Converged Network Adapter X710
-Intel(R) 25GbE Ethernet Network Adapter E810
-NVIDIA ConnectX-5 Single Port 100 GbE QSFP+
-NVIDIA ConnectX-5 Single Port 0 GbE QSFP
-NVIDIA ConnectX-5 Single Port 56 GbE QSFP+
-NVIDIA ConnectX-5 Ex 100 GbE QSFP
-NVIDIA ConnectX-6 Single Port VPI HDR QSFP
-NVIDIA ConnectX-6 Single Port VPI HDR 100 QSFP
-NVIDIA ConnectX-7 NDR/NDR200 Adaptor
-"NVIDIA Network Adapter (10 Gb)
-"
-Broadcom 10GBASE-T Ethernet
-Broadcom Gigabit Ethetnet BCM5720
-Broadcom Adv Dual 10GBASE-t Ethernet
-Broadcom Adv Dual 25Gb Ethernet
-Broadcom NetXtreme Gigabit Ethernet
-I350GbE Controller
-QLogic 4X10GE
-Broadcom BCM5760x
+NIC Maker,Models supported by Omnia,Models validated with current version of Omnia
+Intel®,"* Ethernet 10G 4P X710/I350 rNDC
+* Ethernet Converged Network Adapter X710
+* 25GbE Ethernet Network Adapter E810
+* 100GbE Ethernet Network Adapter E810
+* I350GbE Ethernet Controller",* 100GbE Ethernet Network Adapter E810
+NVIDIA,"* ConnectX-5 Single Port 100 GbE QSFP+
+* ConnectX-5 Single Port 0 GbE QSFP
+* ConnectX-5 Single Port 56 GbE QSFP+
+* ConnectX-5 Ex 100 GbE QSFP
+* ConnectX-6 Single Port VPI HDR QSFP
+* ConnectX-6 Single Port VPI HDR 100 QSFP
+* ConnectX-7 NDR/NDR200 Adaptor
+* Network Adapter (10 Gb)","* ConnectX-6 Single Port VPI HDR QSFP
+* ConnectX-6 Single Port VPI HDR 100 QSFP
+* ConnectX-7 NDR/NDR200 Adaptor
+
+.. note:: The latest firmware for the ConnectX-6 and ConnectX-7 models has not been validated."
+Broadcom,"* 10GBASE-T Ethernet
+* Gigabit Ethernet BCM5720
+* Adv Dual 10GBASE-T Ethernet
+* Adv Dual 25Gb Ethernet
+* NetXtreme Gigabit Ethernet
+* BCM5760x","* 10GBASE-T Ethernet
+* Gigabit Ethernet BCM5720
+* BCM5760x"
+QLogic,FastLinQ 4X10GE,FastLinQ 4X10GE
diff --git a/docs/source/Tables/supported-poweredge-amd-servers.csv b/docs/source/Tables/supported-poweredge-amd-servers.csv
new file mode 100644
index 000000000..277c8fed3
--- /dev/null
+++ b/docs/source/Tables/supported-poweredge-amd-servers.csv
@@ -0,0 +1,4 @@
+Generation,Models supported by Omnia,Models validated with current version of Omnia
+14G,"R6415, R7415, R7425",R7425
+15G,"R6515, R6525, R7515, R7525, C6525, XE8545","C6525, R6525"
+16G,"R6625, R7625, R7615, R6615","R6625, R7625"
diff --git a/docs/source/Tables/supported-poweredge-intel-servers.csv b/docs/source/Tables/supported-poweredge-intel-servers.csv
new file mode 100644
index 000000000..74b9bfe9a
--- /dev/null
+++ b/docs/source/Tables/supported-poweredge-intel-servers.csv
@@ -0,0 +1,4 @@
+Generation,Models supported by Omnia,Models validated with current version of Omnia
+14G,"C4140, C6420, R240, R340, R440, R540, R640, R740, R740xd, R740xd2, R840, R940, R940xa","R740xd, R340, R440"
+15G,"C6520, R650, R750, R750xa, R750xs",R750xs
+16G,"C6620, R660, R760, XE8640, R760xa, R760xd2, XE9680","R760xa, R760xd2, R760, C6620, XE9680"
diff --git a/docs/source/Tables/supported-poweredge-servers.csv b/docs/source/Tables/supported-poweredge-servers.csv
deleted file mode 100644
index 10f6c2a55..000000000
--- a/docs/source/Tables/supported-poweredge-servers.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-Server Type,Server Model
-14G,"C4140, C6420, R240, R340, R440, R540, R640, R740, R740xd, R740xd2, R840, R940, R940xa"
-15G,"C6520, R650, R750, R750xa"
-16G,"C6620, R660, 
R6625, R760, XE8640, R760xa [1]_ , R760xd2, XE9680" diff --git a/docs/source/Tables/supported-switches.csv b/docs/source/Tables/supported-switches.csv new file mode 100644 index 000000000..856ee9b91 --- /dev/null +++ b/docs/source/Tables/supported-switches.csv @@ -0,0 +1,18 @@ +Switch make and type,Models supported by Omnia,Models validated with current version of Omnia +NVIDIA InfiniBand Switches,"* NVIDIA MQM8700-HS2F Quantum HDR InfiniBand Switch 40 QSFP56 +* NVIDIA QUANTUM-2 QM9700","* NVIDIA MQM8700-HS2F Quantum HDR InfiniBand Switch 40 QSFP56 +* NVIDIA QUANTUM-2 QM9700" +Dell Networking Switches,"* PowerSwitch S3048-ON +* PowerSwitch S5232F-ON +* PowerSwitch Z9264F-ON +* PowerSwitch N3248TE-ON +* PowerSwitch S4148 +* PowerSwitch Z9664F +* PowerSwitch Z9432-ON +* PowerSwitch Z9864F-ON","* PowerSwitch S3048-ON +* PowerSwitch S5232F-ON +* PowerSwitch Z9264F-ON +* PowerSwitch N3248TE-ON +* PowerSwitch S4148 +* PowerSwitch Z9432-ON +* PowerSwitch Z9864F-ON" diff --git a/docs/source/Tables/telemetry_config.csv b/docs/source/Tables/telemetry_config.csv index e082442c9..cd5832419 100644 --- a/docs/source/Tables/telemetry_config.csv +++ b/docs/source/Tables/telemetry_config.csv @@ -40,6 +40,37 @@ Required ","* Enables visualizations. .. note:: When ``visualization_support`` is true, ``grafana_username`` and ``grafana_password`` become mandatory." +"**k8s_prometheus_support** + +``boolean`` + +Optional ","* This variable signifies whether Kubernetes metrics will be collected by the Prometheus metrics exporter or not. +* If the variable value is ``true``, Kube Prometheus will be deployed on the ``kube_control_plane``. Kube Prometheus is a set of Kubernetes manifests, tools, and configurations that makes it easier to set up and manage Prometheus monitoring in a Kubernetes environment. +* For the complete list of Kubernetes metrics collected by Prometheus, `click here `_ +* **Values:** + * ``true`` + + * ``false`` <- **Default**" +"**prometheus_scrape_interval** + +``integer`` + +Optional ","* Providing values to this variable is mandatory if ``k8s_prometheus_support`` is ``true``. +* This variable determines how frequently (time interval in seconds) the Prometheus exporter gathers the metrics from the target nodes. +* This variable accepts values in seconds. +* **Default value:** 15" +"**prometheus_gaudi_support** + +``boolean`` + +Optional ","* This variable signifies whether Intel Gaudi metrics will be collected by the Gaudi Prometheus metrics exporter or not. +* The ``k8s_prometheus_support`` variable must be ``true`` for the metrics to be collected. +* **Values:** + * ``true`` + + * ``false`` <- **Default** + +.. note:: Support for Intel Gaudi metrics collection via Prometheus exporter is only available for clusters running on Ubuntu 22.04 OS." "**k8s_service_addresses** ``string`` @@ -65,7 +96,7 @@ Required ","* Kubernetes pod network CIDR for internal network. When used, it wi ``string`` -Required ","* These addresses will be used by Loadbalancer for assigning External IPs to K8s services running on control plane. +Required ","* These addresses will be used by Loadbalancer for assigning External IPs to K8s services running on the OIM. * Make sure the IP range is not assigned to any node in the cluster. * If ``admin_nic`` network provided in ``network_spec.yml`` is in ``""10.11.0.0""`` network, then ``pod_external_ip_range`` should be in same network such as ``""10.11.0.60-10.11.0.70""``. 
* Acceptable formats: ``""10.11.0.100-10.11.0.150""`` , ``""10.11.0.0/16""`` @@ -138,7 +169,7 @@ Required ","* This variable denotes the time interval (seconds) of telemetry dat ``boolean`` [1]_ Required ","* This variable is used to enable metric collection part of the regular metric group. -* For a list of regular metrics collected, `click here. <../../Roles/Telemetry/TelemetryMetrics.html#regular-metrics>`_ +* For a list of regular metrics collected, `click here. `_ * **Values:** * ``true`` <- **Default** @@ -148,7 +179,7 @@ Required ","* This variable is used to enable metric collection part of the regu ``boolean`` [1]_ Required ","* This variable is used to enable metric collection part of the health check metric group. -* For a list of health metrics collected, `click here. <../../Roles/Telemetry/TelemetryMetrics.html#health-metrics>`_ +* For a list of health metrics collected, `click here. `_ * **Values:** * ``true`` <- **Default** @@ -158,7 +189,7 @@ Required ","* This variable is used to enable metric collection part of the heal ``boolean`` [1]_ Required ","* This variable is used to enable metric collection related to GPU. -* For a list of GPU metrics collected, `click here. <../../Roles/Telemetry/TelemetryMetrics.html#gpu-metrics>`_ +* For a list of GPU metrics collected, `click here. `_ * **Values:** diff --git a/docs/source/Telemetry/Gaudi_metrics.rst b/docs/source/Telemetry/Gaudi_metrics.rst new file mode 100644 index 000000000..b62b2edfe --- /dev/null +++ b/docs/source/Telemetry/Gaudi_metrics.rst @@ -0,0 +1,9 @@ +Intel Gaudi metrics collected by the Prometheus exporter +=========================================================== + +The supported Intel Gaudi metrics are: + +.. csv-table:: Intel Gaudi metrics for Prometheus + :file: ../Tables/intel_gaudi_metrics.csv + :header-rows: 1 + :keepspace: \ No newline at end of file diff --git a/docs/source/Roles/Telemetry/MetricInfo.rst b/docs/source/Telemetry/MetricInfo.rst similarity index 94% rename from docs/source/Roles/Telemetry/MetricInfo.rst rename to docs/source/Telemetry/MetricInfo.rst index 1bc23eb22..4117d92be 100644 --- a/docs/source/Roles/Telemetry/MetricInfo.rst +++ b/docs/source/Telemetry/MetricInfo.rst @@ -2,7 +2,7 @@ Additional metric information ------------------------------- .. csv-table:: Omnia telemetry metrics - :file: ../../Tables/Metrics.csv + :file: ../Tables/Metrics.csv :header-rows: 1 :keepspace: diff --git a/docs/source/Telemetry/Prometheus_k8s.rst b/docs/source/Telemetry/Prometheus_k8s.rst new file mode 100644 index 000000000..d077624f6 --- /dev/null +++ b/docs/source/Telemetry/Prometheus_k8s.rst @@ -0,0 +1,91 @@ +View the Kubernetes and Intel Gaudi metrics from the Prometheus UI and Grafana +==================================================================================== + +Prometheus metrics visualization refers to the process of displaying the metrics collected by the Prometheus exporter in a visual format, enabling easier analysis and interpretation. Using the Prometheus UI and integration with tools like Grafana, users can create custom dashboards, graphs, and charts to visualize metric trends and monitor system health. + +**Supported metrics** + +* The list of Kubernetes metrics collected by the Prometheus exporter can be found `here `_. +* The list of Intel Gaudi metrics collected by the Prometheus exporter is `linked here `_. 
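+
+For example, a minimal sketch of the ``input/telemetry_config.yml`` values that enable both exporters is shown below (illustrative values only; the prerequisites that follow list the authoritative variables): ::
+
+    k8s_prometheus_support: true      # deploys Kube Prometheus on the kube_control_plane
+    prometheus_scrape_interval: 15    # scrape interval in seconds (default: 15)
+    prometheus_gaudi_support: true    # requires k8s_prometheus_support to be true
+    visualization_support: true       # required to view the metrics in Grafana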
+
+**Prerequisites**
+
+* To view the Kubernetes and Intel Gaudi metrics from the Prometheus UI, the ``k8s_prometheus_support`` and ``prometheus_gaudi_support`` variables in ``input/telemetry_config.yml`` must be set to ``true``. All the variables and their related information for the configuration file can be found `here `_.
+* To enable visualization for the supported metrics using Grafana, the ``visualization_support`` variable in ``input/telemetry_config.yml`` must be set to ``true`` in addition to the above-mentioned variables.
+
+**Execute the telemetry playbook**
+
+With the above-mentioned variable values provided to the ``input/telemetry_config.yml`` file, execute the ``telemetry.yml`` playbook using the below command: ::
+
+    cd telemetry
+    ansible-playbook telemetry.yml -i inventory
+
+.. note:: The provided inventory file must contain a ``kube_control_plane``, one or more ``kube_node`` entries, and an ``etcd`` node.
+
+Accessing the Prometheus server for Kubernetes and Gaudi metrics
+------------------------------------------------------------------
+
+**Access the Prometheus server from the** ``kube_control_plane`` **or** ``kube_node``
+
+1. After you have executed the ``telemetry.yml`` playbook, run the following command on the ``kube_control_plane`` to bring up all the services that are currently running on the Kubernetes cluster: ::
+
+    kubectl get svc -A
+
+2. Locate the ``prometheus-kube-prometheus-prometheus`` service under the ``monitoring`` namespace. You can access the Prometheus server with the corresponding ``CLUSTER-IP`` of the Prometheus service.
+
+**Access the Prometheus server from the Omnia OIM**
+
+3. After you have executed the ``telemetry.yml`` playbook, run the following command on the ``kube_control_plane`` to bring up all the services that are currently running on the Kubernetes cluster: ::
+
+    kubectl get svc -A
+
+4. Locate the ``prometheus-kube-prometheus-prometheus`` service under the ``monitoring`` namespace.
+
+5. Update the Prometheus service ``TYPE``:
+
+   - Use the following command to change the Prometheus service type to ``LoadBalancer`` and automatically assign an ``EXTERNAL-IP``: ::
+
+       kubectl patch service prometheus-kube-prometheus-prometheus -n monitoring -p '{"spec": {"type": "LoadBalancer"}}'
+
+   - Or, use the following command to change the Prometheus service type to ``NodePort``: ::
+
+       kubectl patch service prometheus-kube-prometheus-prometheus -n monitoring -p '{"spec": {"type": "NodePort"}}'
+
+6. To access the Prometheus server from any browser, you can use ``<Prometheus service IP>:9090`` from the Omnia OIM or ``kube_control_plane``, and ``<kube_node IP>:<NodePort>`` from the ``kube_node``.
+
+7. Once you navigate to ``<Prometheus service IP>:9090`` from your browser, the Prometheus UI is displayed. A sample image of the UI is provided below:
+
+   .. image:: ../images/Prometheus_ui.png
+
+8. From the UI, you can click the "Globe" button (highlighted in yellow in the following image) to open up the Metrics Explorer.
+
+   .. image:: ../images/Prometheus_ui_2.png
+
+9. Use the **Metrics Explorer** to get all the available Kubernetes and Intel Gaudi metrics. A sample image of the Metrics Explorer is provided below:
+
+   .. image:: ../images/Prometheus_ui_3.png
+
+Visualize the Kubernetes and Intel Gaudi metrics using Grafana
+----------------------------------------------------------------
+
+1. Find the IP address of the Grafana service using ``kubectl get svc -n grafana``
+
+   .. image:: ../images/grafanaIP.png
+
+2. 
Login to the Grafana UI by connecting to the cluster IP of grafana service obtained above via port 5000, that's ``http://xx.xx.xx.xx:5000/login`` + + .. image:: ../images/Grafana_login.png + +3. Add the Prometheus data source to Grafana + + .. image:: ../images/Prometheus_datasource.png + +4. Add the Prometheus server URL to the datasource configuration window, for example - ``http://10.50.3.101:9090`` + + .. image:: ../images/Prometheus_datasource2.png + +5. Click ``Save & test``. A green checkbox pops up signifying successful configuration of the Prometheus datasource. + +6. From the dashboard menu on the left, create a dashboard with your own settings or import an existing one from `Grafana dashboards `_. Set the datasource to ``Prometheus`` while configuring the dashboard. For more information on how to import dashboards, `click here `_ + +7. Click ``Load`` to bring up the Grafana dashboard with the Prometheus metrics. \ No newline at end of file diff --git a/docs/source/Roles/Telemetry/TelemetryMetrics.rst b/docs/source/Telemetry/TelemetryMetrics.rst similarity index 85% rename from docs/source/Roles/Telemetry/TelemetryMetrics.rst rename to docs/source/Telemetry/TelemetryMetrics.rst index bab126264..1d0e45d67 100644 --- a/docs/source/Roles/Telemetry/TelemetryMetrics.rst +++ b/docs/source/Telemetry/TelemetryMetrics.rst @@ -7,7 +7,7 @@ Regular metrics **Your cluster in numbers**: Regular metrics include information such as CPU, memory, packets errors, drives etc. .. csv-table:: Regular metrics - :file: ../../Tables/Metrics_Regular.csv + :file: ../Tables/Metrics_Regular.csv :header-rows: 1 :keepspace: @@ -18,7 +18,7 @@ Health metrics **The health of your cluster**: Health metrics include key performance indicators. .. csv-table:: Health metrics - :file: ../../Tables/Metrics_Health.csv + :file: ../Tables/Metrics_Health.csv :header-rows: 1 :keepspace: @@ -31,7 +31,7 @@ GPU metrics **The GPUs of your cluster**: GPU metrics include information about GPUs in the cluster .. csv-table:: GPU metrics - :file: ../../Tables/Metrics_GPU.csv + :file: ../Tables/Metrics_GPU.csv :header-rows: 1 :keepspace: diff --git a/docs/source/Roles/Telemetry/TimescaleDB.rst b/docs/source/Telemetry/TimescaleDB.rst similarity index 86% rename from docs/source/Roles/Telemetry/TimescaleDB.rst rename to docs/source/Telemetry/TimescaleDB.rst index 66d1e27da..6526b47f5 100644 --- a/docs/source/Roles/Telemetry/TimescaleDB.rst +++ b/docs/source/Telemetry/TimescaleDB.rst @@ -3,14 +3,14 @@ Timescale DB **Accessing the timescale DB** -1. Check the IP of the control plane (``ifconfig``): :: +1. Check the IP of the OIM (``ifconfig``): :: 3: eno8403: mtu 1500 qdisc mq state UP group default qlen 1000 link/ether b4:45:06:eb:da:4e brd ff:ff:ff:ff:ff:ff inet 198.168.0.11/24 brd 198.168.0.255 scope global dynamic noprefixroute eno8403 validlft 30884289sec preferred_lft 30884289sec ínet6 fe80::b645:6ff:feeb:da4e/64 scope link noprefixroute validlft forever preferredlft forever 2. Check the external port on which timescaleDB is running (``kubectl get svc -A``): - .. image:: ../../images/TimescaleDB_Ports.png + .. image:: ../images/TimescaleDB_Ports.png 3. Connect to DB (``psql -h -p -U -d telemetry_metrics``) @@ -31,11 +31,11 @@ For iDRAC telemetry: :: *Query for Omnia telemetry* - .. image:: ../../images/TimescaleDB_table.png + .. image:: ../images/TimescaleDB_table.png *Query for iDRAC telemetry* - .. image:: ../../images/publictimeseries.png + .. 
image:: ../images/publictimeseries.png For the entire set of iDRAC telemetry metrics, `click here `_. diff --git a/docs/source/Roles/Telemetry/Visualizations/ParallelCoordinates.rst b/docs/source/Telemetry/Visualizations/ParallelCoordinates.rst similarity index 76% rename from docs/source/Roles/Telemetry/Visualizations/ParallelCoordinates.rst rename to docs/source/Telemetry/Visualizations/ParallelCoordinates.rst index 7b08058f8..8436f9512 100644 --- a/docs/source/Roles/Telemetry/Visualizations/ParallelCoordinates.rst +++ b/docs/source/Telemetry/Visualizations/ParallelCoordinates.rst @@ -3,38 +3,38 @@ Parallel coordinates Parallel coordinates are a great way to visualize multiple metric dimensions simultaneously to see trends and spot outlier activity. Metrics like CPU temp, Fan Speed, Memory Usage etc. can be added or removed as an additional vertical axis. This implementation of parallel coordinate graphing includes a display of metric value distribution in the form of a violin plot along vertical axes and the ability to interact with the graph to perform filtering. Metric range filtering on one or more axes automatically filters the node and sample list in the top left-hand panel to the nodes and samples that fit the filtering criteria. -.. image:: ../../../images/Visualization/ParallelCoordinates_InitialView_Collapsed.png +.. image:: ../../images/Visualization/ParallelCoordinates_InitialView_Collapsed.png In the above image, both left-hand panels are collapsed to allow for a better view of the graph. They can be expanded by clicking on the arrows highlighted in the picture. The expanded panels can be used to customize the graph. -.. image:: ../../../images/Visualization/ParallelCoordinates_InitialView_Expanded.png +.. image:: ../../images/Visualization/ParallelCoordinates_InitialView_Expanded.png In the above image, both left-hand panels are expanded and can be minimized by clicking on the minimize arrows on the right of each panel. These panels can be used to customize the graphs by: * Filtering by node and node metrics * Assigning colors to different node metrics -.. image:: ../../../images/Visualization/ParallelCoordinates_Recoloration.png +.. image:: ../../images/Visualization/ParallelCoordinates_Recoloration.png In the above image, the metric **Power Consumption** has been assigned a color to highlight the metric. -.. image:: ../../../images/Visualization/ParallelCoordinates_NodeSelection.png +.. image:: ../../images/Visualization/ParallelCoordinates_NodeSelection.png In the above image, data has been filtered by **Node** to get insights into different metrics about specific nodes. -.. image:: ../../../images/Visualization/ParallelCoordinates_TopLeftPanel_NodeHighlight.png +.. image:: ../../images/Visualization/ParallelCoordinates_TopLeftPanel_NodeHighlight.png In the above image, data for a single node has been highlighted using the top-left panel. -.. image:: ../../../images/Visualization/ParallelCoordinates_MetricFiltering.png +.. image:: ../../images/Visualization/ParallelCoordinates_MetricFiltering.png In the above image, metric filters were applied on **Power Consumption** by clicking on the vertical axis and dragging a filter box over the range of values required. The top left panel will display nodes and samples that fit the filter. Filters are removed by clicking on the vertical dimension axis again. -.. image:: ../../../images/Visualization/ParallelCoordinates_DoubleMetricFiltering.png +.. 
image:: ../../images/Visualization/ParallelCoordinates_DoubleMetricFiltering.png
 
 In the above image, metric filters were applied on **Power Consumption** and **NIC temperature** . Using more than one filter will result in fewer nodes and telemetry samples that meet the filtering criteria.
 
-.. image:: ../../../images/Visualization/ParallelCoordinates_TimeFiltering.png
+.. image:: ../../images/Visualization/ParallelCoordinates_TimeFiltering.png
 
 In the above image, the top-right panel was used to filter data by time, this can be done in 2 ways:
 
diff --git a/docs/source/Roles/Telemetry/Visualizations/index.rst b/docs/source/Telemetry/Visualizations/index.rst
similarity index 92%
rename from docs/source/Roles/Telemetry/Visualizations/index.rst
rename to docs/source/Telemetry/Visualizations/index.rst
index f87522699..24c074c0b 100644
--- a/docs/source/Roles/Telemetry/Visualizations/index.rst
+++ b/docs/source/Telemetry/Visualizations/index.rst
@@ -11,15 +11,15 @@
 
 i. Find the IP address of the Grafana service using ``kubectl get svc -n grafana``
 
-   .. image:: ../../../images/grafanaIP.png
+   .. image:: ../../images/grafanaIP.png
 
 ii. Login to the Grafana UI by connecting to the cluster IP of grafana service obtained above via port 5000. That is ``http://xx.xx.xx.xx:5000/login``
 
-   .. image:: ../../../images/Grafana_login.png
+   .. image:: ../../images/Grafana_login.png
 
 iii. Enter the ``grafana_username`` and ``grafana_password`` as mentioned in ``input/telemetry_config.yml``.
 
-   .. image:: ../../../images/Grafana_Dashboards.png
+   .. image:: ../../images/Grafana_Dashboards.png
 
 **All your data in a glance**:
 
diff --git a/docs/source/Roles/Telemetry/index.rst b/docs/source/Telemetry/index.rst
similarity index 79%
rename from docs/source/Roles/Telemetry/index.rst
rename to docs/source/Telemetry/index.rst
index f57cd5155..1544eff81 100644
--- a/docs/source/Roles/Telemetry/index.rst
+++ b/docs/source/Telemetry/index.rst
@@ -8,12 +8,16 @@ The telemetry feature allows the set up of Omnia telemetry (to poll values from
 
 To initiate telemetry support, fill out the following parameters in ``input/telemetry_config.yml``:
 
 .. csv-table:: Parameters
-   :file: ../../Tables/telemetry_config.csv
+   :file: ../Tables/telemetry_config.csv
    :header-rows: 1
    :keepspace:
 
 .. [1] Boolean parameters do not need to be passed with double or single quotes.
 
+.. note:: The ``input/telemetry_config.yml`` file is encrypted during the execution of the ``omnia.yml`` playbook. Use the below command to edit the encrypted input file:
+    ::
+        ansible-vault edit telemetry_config.yml --vault-password-file .telemetry_vault_key
+
 Once you have executed ``discovery_provision.yml`` and has also provisioned the cluster, initiate telemetry on the cluster as part of ``omnia.yml``, which configures the cluster with scheduler, storage and authentication using the below command. ::
 
     ansible-playbook omnia.yml -i inventory
 
@@ -24,11 +28,15 @@
 
 .. note::
 
+    * To run the ``telemetry.yml`` playbook independently from the ``omnia.yml`` playbook on Intel Gaudi nodes, start by executing the ``performance_profile.yml`` playbook. Once that’s done, you can run the ``telemetry.yml`` playbook separately. 
+ * Depending on the type of telemetry initiated, include the following possible groups in the inventory: - * omnia_telemetry: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd + * omnia_telemetry: ``slurm_control_node``, ``slurm_node``, ``login``, ``kube_control_plane``, ``kube_node``, ``etcd``, ``auth_server`` - * idrac_telemetry: idrac + * idrac_telemetry: ``idrac`` + + * k8s_telemetry on Prometheus: ``kube_control_plane``, ``kube_node``, ``etcd`` * If you would like a local backup of the timescaleDB used to store telemetry data, `click here <../Utils/timescaledb_utility.html>`_. @@ -47,13 +55,13 @@ To modify how data is collected from the cluster, modify the variables in ``omni .. note:: * Currently, changing the ``grafana_username`` and ``grafana_password`` values is not supported via ``telemetry.yml``. * The passed inventory should have an idrac group, if ``idrac_telemetry_support`` is true. - * If ``omnia_telemetry_support`` is true, then the inventory should have control plane and cluster node groups (as specified in the sample files) along with optional login group. + * If ``omnia_telemetry_support`` is true, then the inventory should have OIM and cluster node groups (as specified in the sample files) along with optional login group. * Rocky Linux 8.7 is not compatible with the Kubernetes installed by ``telemetry.yml`` due to known issues with cri-o. For more information, `click here `_. * If a subsequent run of ``telemetry.yml`` fails, the ``telemetry_config.yml`` file will be unencrypted. **To access the Grafana UI** -*Pre requisites* +*Prerequisites* * ``visualisation_support`` should be set to true when running ``telemetry.yml`` or ``omnia.yml``. @@ -61,31 +69,31 @@ To modify how data is collected from the cluster, modify the variables in ``omni i. Find the IP address of the Grafana service using ``kubectl get svc -n grafana`` -.. image:: ../../images/grafanaIP.png +.. image:: ../images/grafanaIP.png -ii. Login to the Grafana UI by connecting to the cluster IP of grafana service obtained above via port 5000. That is ``http://xx.xx.xx.xx:5000/login`` +ii. Login to the Grafana UI by connecting to the cluster IP of grafana service obtained above via port 5000, that's ``http://xx.xx.xx.xx:5000/login`` -.. image:: ../../images/Grafana_login.png +.. image:: ../images/Grafana_login.png iii. Enter the ``grafana_username`` and ``grafana_password`` as mentioned in ``input/telemetry_config.yml``. -.. image:: ../../images/Grafana_Dashboards.png +.. image:: ../images/Grafana_Dashboards.png Loki log collections can viewed on the explore section of the grafana UI. -.. image:: ../../images/Grafana_Loki.png +.. image:: ../images/Grafana_Loki.png Datasources configured by Omnia can be viewed as seen below. -.. image:: ../../images/GrafanaDatasources.png +.. image:: ../images/GrafanaDatasources.png **To use Loki for log filtering** @@ -93,7 +101,7 @@ Datasources configured by Omnia can be viewed as seen below. ii. In the Explore page, select **control-plane-loki**. - .. image:: ../../images/Grafana_ControlPlaneLoki.png + .. image:: ../images/Grafana_ControlPlaneLoki.png iii. The log browser allows you to filter logs by job, node and/or user. @@ -109,7 +117,7 @@ Example :: ii. In the Explore page, select **telemetry-postgres**. - .. image:: ../../images/Grafana_Telemetry_PostGRES.png + .. image:: ../images/Grafana_Telemetry_PostGRES.png iii. The query builder allows you to create SQL commands that can be used to query the ``omnia_telemetry.metrics`` table. 
Filter the data required using the following fields: @@ -124,7 +132,7 @@ Example :: *iDRAC telemetry data from Grafana* - .. image:: ../../images/idractelemetry.png + .. image:: ../images/idractelemetry.png .. note:: If you are more comfortable using SQL queries over the query builder, click on **Edit SQL** to directly provide your query. Optionally, the data returned from a query can be viewed as a graph. @@ -137,7 +145,8 @@ If ``idrac_telemetry_support`` and ``visualisation_support`` is set to true, Par TelemetryMetrics MetricInfo TimescaleDB - + Prometheus_k8s + Gaudi_metrics diff --git a/docs/source/Troubleshooting/FAQ.rst b/docs/source/Troubleshooting/FAQ.rst deleted file mode 100644 index d0c2aaf7c..000000000 --- a/docs/source/Troubleshooting/FAQ.rst +++ /dev/null @@ -1,294 +0,0 @@ -Frequently asked questions -========================== - -⦾ **Why does the provisioning status of RHEL/Rocky Linux remote servers remain stuck at ‘installing’ in cluster.nodeinfo (omniadb)?** - -.. image:: ../images/InstallingStuckDB.png - -.. image:: ../images/InstallCorruptISO.png - -**Potential Causes**: - - * Disk partition may not have enough storage space per the requirements specified in ``input/provision_config`` (under ``disk_partition``). - - * The provided ISO may be corrupt/incomplete. - - * Hardware issues (Auto reboot may fail at POST) - - * A virtual disk may not have been created - - * Re-run of the ``discovery_provision.yml`` playbook on the control plane while provisioning is in-progress on the remote nodes. - - -**Resolution**: - - * Add more space to the server or modify the requirements specified in ``input/provision_config`` (under ``disk_partition``). - - * Download the ISO again, verify the checksum/ download size and re-run the provision tool. - - * Resolve/replace the faulty hardware and PXE boot the node. - - * Create a virtual disk and PXE boot the node. - - * Initiate PXE boot on the remote node after completion of the ``discovery_provision.yml`` playbook execution. - -⦾ **Why does the provisioning status of Ubuntu remote servers remain stuck at ‘bmcready’ or 'powering-on' in cluster.nodeinfo (omniadb)?** - -.. image:: ../images/ubuntu_pxe_failure.png - -**Potential Causes**: - - * Disk partition may not have enough storage space per the requirements specified in ``input/provision_config`` (under ``disk_partition``). - - * The provided ISO may be corrupt/incomplete. - - * Hardware issues (Auto reboot may fail at POST) - - * A virtual disk may not have been created - - * Re-run of the ``discovery_provision.yml`` playbook on the control plane while provisioning is in-progress on the remote nodes. - - -**Resolution**: - - * Add more space to the server or modify the requirements specified in ``input/provision_config`` (under ``disk_partition``). - - * Download the ISO again, verify the checksum/ download size and re-run the provision tool. - - * Resolve/replace the faulty hardware and PXE boot the node. - - * Create a virtual disk and PXE boot the node. - - * Initiate PXE boot on the remote node after completion of the ``discovery_provision.yml`` playbook execution. - - -⦾ **Why is the provisioning status of my target servers stuck at ‘powering-on’ in the cluster.info (omniadb)?** - -**Potential Cause**: - - * Hardware issues (Auto-reboot may fail due to hardware tests failing) - * The target node may already have an OS and the first boot PXE device is not configured correctly. - -**Resolution**: - - * Resolve/replace the faulty hardware and PXE boot the node. 
- * Target servers should be configured to boot in PXE mode with the appropriate NIC as the first boot device. - -⦾ **What to do if PXE boot fails while discovering target nodes via switch_based discovery with provisioning status stuck at 'powering-on' in cluster.nodeinfo (omniadb):** - -.. image:: ../images/PXEBootFail.png - -1. Rectify any probable causes like incorrect/unavailable credentials (``switch_snmp3_username`` and ``switch_snmp3_password`` provided in ``input/provision_config.yml``), network glitches, having multiple NICs with the same IP address as the control plane, or incorrect switch IP/port details. -2. Run the clean up script by: :: - - cd utils - ansible-playbook control_plane_cleanup.yml - -3. Re-run the provision tool (``ansible-playbook discovery_provision.yml``). - - -⦾ **What to do if playbook execution fails due to external (network, hardware etc) failure:** - -Re-run the playbook whose execution failed once the issue is resolved. - -⦾ **Why don't IPA commands work after setting up FreeIPA on the cluster?** - -**Potential Cause**: - - Kerberos authentication may be missing on the target node. - -**Resolution**: - - Run ``kinit admin`` on the node and provide the ``kerberos_admin_password`` when prompted. (This password is also entered in ``input/security_config.yml``.) - - -⦾ **Why am I unable to login using LDAP credentials after successfully creating a user account?** - -**Potential Cause**: - - Whitespaces in the LDIF file may have caused an encryption error. Verify whether there are any whitespaces in the file by running ``cat -vet ``. - - **Resolution:** - - Remove the whitespaces and re-run the LDIF file. - -⦾ **Why are the status and admin_mac fields not populated for specific target nodes in the cluster.nodeinfo table?** - -**Causes**: - - * Nodes do not have their first PXE device set as designated active NIC for PXE booting. - * Nodes that have been discovered via multiple discovery mechanisms may list multiple times. Duplicate node entries will not list MAC addresses. - -**Resolution**: - - * Configure the first PXE device to be active for PXE booting. - * PXE boot the target node manually. - * Duplicate node objects (identified by service tag) will be deleted automatically. To manually delete node objects, use ``utils/delete_node.yml``. - -⦾ **What to do if user login fails when accessing a cluster node?** - -.. image:: ../images/UserLoginError.png - -**Potential Cause**: - * ssh key on the control plane may be outdated. - -**Resolution**: - - * Refresh the key using ``ssh-keygen -R ``. - * Retry login. - -⦾ **Why does the 'Import SCP from a local path' task fail during idrac.yml?** - -.. image:: ../images/ImportSCPiDRAC_fail.png - -**Potential Cause**: The target server may be stalled during the booting process. - -**Resolution**: Bring the target node up and re-run the script. - -⦾ **Why is the node status stuck at 'powering-on' or 'powering-off' after a control plane reboot?** - -**Potential Cause**: The nodes were powering off or powering on during the control plane reboot/shutdown. - -**Resolution**: In the case of a planned shutdown, ensure that the control plane is shut down after the compute nodes. When powering back up, the control plane should be powered on and xCAT services resumed before bringing up the compute nodes. In short, have the control plane as the first node up and the last node down. 
- -For more information, `click here `_ - -⦾ **Why do subscription errors occur on RHEL control planes when rhel_repo_local_path (in input/provision_config.yml) is not provided and control plane does not have an active subscription?** - -.. image:: ../images/SubscriptionErrors.png - -For many of Omnia's features to work, RHEL control planes need access to the following repositories: - - 1. AppStream - 2. BaseOS - - -This can only be achieved using local repos specified in rhel_repo_local_path (``input/provision_config.yml``). - -.. note:: - To enable the repositories, run the following commands: :: - - subscription-manager repos --enable=codeready-builder-for-rhel-8-x86_64-rpms - subscription-manager repos --enable=rhel-8-for-x86_64-appstream-rpms - subscription-manager repos --enable=rhel-8-for-x86_64-baseos-rpms - - Verify your changes by running: :: - - yum repolist enabled - -⦾ **Why does the task: Initiate reposync of AppStream, BaseOS and CRB fail?** - -.. image:: ../images/RepoURLError.png - -**Potential Cause**: The ``repo_url``, ``repo_name`` or ``repo`` provided in ``rhel_repo_local_path`` (``input/provision_config.yml``) may not have been valid. - -Omnia does not validate the input of ``rhel_repo_local_path``. - -**Resolution**: Ensure the correct values are passed before re-running ``discovery_provision.yml``. - -⦾ **How to add a new node for provisioning** - - -1. Using a mapping file: - - * Update the existing mapping file by appending the new entry (without the disrupting the older entries) or provide a new mapping file by pointing ``pxe_mapping_file_path`` in ``provision_config.yml`` to the new location. - - * Run ``discovery_provision.yml``. - -2. Using the switch IP: - - * Run ``discovery_provision.yml`` once the switch has discovered the potential new node. - -⦾ **Why does the task: 'BeeGFS: Rebuilding BeeGFS client module' fail?** - -.. image:: ../images/BeeGFSFailure.png - -**Potential Cause**: BeeGFS version 7.3.0 is in use. - -**Resolution**: Use BeeGFS client version 7.3.1 when setting up BeeGFS on the cluster. - - -⦾ **Why does splitting an ethernet Z series port fail with "Failed. Either port already split with different breakout value or port is not available on ethernet switch"?** - - -**Potential Cause**: - - 1. The port is already split. - - 2. It is an even-numbered port. - -**Resolution**: - - Changing the ``breakout_value`` on a split port is currently not supported. Ensure the port is un-split before assigning a new ``breakout_value``. - - -⦾ **What to do if the LC is not ready:** - - -* Verify that the LC is in a ready state for all servers: ``racadm getremoteservicesstatus`` - -* PXE boot the target server. - -⦾ **Why does the task: 'Orchestrator: Deploy MetalLB IP Address pool' fail?** - -.. image:: ../images/Metallb_Telemetry_Apptainer_fail.png - -**Potential Cause**: ``/var`` partition is full (potentially due to images not being cleared after intel-oneapi images docker images are used to execute benchmarks on the cluster using apptainer support) . - -**Resolution**: Clear the ``/var`` partition and retry ``telemetry.yml``. - - -⦾ **Why does the task: [Telemetry]: TASK [grafana : Wait for grafana pod to come to ready state] fail with a timeout error?** - -**Potential Cause**: Docker pull limit exceeded. - -**Resolution**: Manually input the username and password to your docker account on the control plane. 
- -⦾ **Is provisioning servers using BOSS controller supported by Omnia?** - -From Omnia 1.2.1, provisioning a server using BOSS controller is supported. - -⦾ **What are the licenses required when deploying a cluster through Omnia?** - -While Omnia playbooks are licensed by Apache 2.0, Omnia deploys multiple softwares that are licensed separately by their respective developer communities. For a comprehensive list of software and their licenses, `click here <../Overview/SupportMatrix/omniainstalledsoftware.html>`_ . - -⦾ **Why does the task: TASK [hostname_validation : Verify the domain name is not blank in hostname] fail?** - -**Potential Cause**: Hostname is not configured properly with the domain name, on the target node. - -**Resolution**: Use the following commands to configure the hostname properly: :: - - - sysctl kernel.hostname=node001.omnia.test - hostnamectl set-hostname node001.omnia.test - - -.. note:: ``node001.omnia.test`` is a sample hostname. - -⦾ **local_repo.yml playbook execution fails at the TASK [parse_and_download : Display Failed Packages]** - -.. image:: ../images/package_failure_local_repo.png - -**Potential Cause**: This issue is encountered if Omnia fails to download any software package while executing ``local_repo.yml`` playbook. Download failures can occur if: - - * The URL to download the software packages mentioned in the ``//.json`` is incorrect or the repository is unreachable. - * The provided Docker credentials are incorrect or if you encounter a Docker pull limit issue. For more information, `click here `_. - * If the disk space is insufficient while downloading the package. - -**Resolution**: Re-run the ``local_repo.yml`` playbook while ensuring the following: - - * URL to download the software packages mentioned in ``//.json`` is correct, and the repository is reachable. - * Docker credentials provided in ``input/provision_config_credentials`` is correct. - * Sufficient disk space is available while downloading the package. For disk space considerations, see `local repo <../InstallationGuides/LocalRepo/Prerequisite.html>`_. - -If the ``local_repo.yml`` is executed successfully without any package download failures, a "success" message is displayed as shown below: - -.. image:: ../images/local_repo_success.png - -⦾ **Why does the provisioning status of Kubernetes RoCE pod remain stuck at 'Pending' or 'ContainerCreating' state?** - -.. image:: ../images/roce_pod_failure.png - -**Potential Cause**: This issue is encountered if incorrect parameter values are provided during the installation of the Kubernetes plugin for the RoCE NIC. For more information about the parameters and their accepted values, `click here <../InstallationGuides/BuildingClusters/k8s_plugin_roce_nic.html>`_. - -**Resolution**: If the RoCE pod is in 'Pending' or 'ContainerCreating' state, describe the pod to check for issues. If there is a mistake in the parameter values provided, use ``delete_roce_plugin.yml`` to delete the configurations made for the Kubernetes RoCE plugin, append the ``input/roce_plugin_config.yml`` with correct values and re-deploy the RoCE pod by executing ``deploy_roce_plugin.yml``. 
\ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/General_Query.rst b/docs/source/Troubleshooting/FAQ/Common/General_Query.rst new file mode 100644 index 000000000..0ab266735 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/General_Query.rst @@ -0,0 +1,10 @@ +General Query +============== + +⦾ **What to do if playbook execution fails due to external (network, hardware, etc.) failure?** + +**Resolution**: Re-run the playbook whose execution failed once the issue is resolved. + +⦾ **What are the licenses required when deploying a cluster through Omnia?** + +**Resolution**: While Omnia playbooks are licensed under Apache 2.0, Omnia deploys multiple software packages that are licensed separately by their respective developer communities. For a comprehensive list of software and their licenses, `click here <../../../Overview/SupportMatrix/omniainstalledsoftware.html>`_. \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/LocalRepo.rst b/docs/source/Troubleshooting/FAQ/Common/LocalRepo.rst new file mode 100644 index 000000000..7e0836514 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/LocalRepo.rst @@ -0,0 +1,22 @@ +Local Repository +=================== + +⦾ **Why does the** ``local_repo.yml`` **playbook execution fail at** ``TASK [parse_and_download : Display Failed Packages]`` **?** + +.. image:: ../../../images/package_failure_local_repo.png + +**Potential Cause**: This issue is encountered if Omnia fails to download any software package while executing the ``local_repo.yml`` playbook. Download failures can occur if: + + * The URL to download the software packages mentioned in the ``//.json`` is incorrect or the repository is unreachable. + * The provided Docker credentials are incorrect, or you encounter a Docker pull limit issue. For more information, `click here `_. + * The disk space is insufficient for downloading the packages. + +**Resolution**: Re-run the ``local_repo.yml`` playbook while ensuring the following: + + * The URL to download the software packages mentioned in ``//.json`` is correct, and the repository is reachable. + * Docker credentials provided in ``input/provision_config_credentials`` are correct. + * Sufficient disk space is available while downloading the packages. For disk space considerations, see the `Omnia installation guide <../../../OmniaInstallGuide/index.html>`_. + +If the ``local_repo.yml`` playbook executes successfully without any package download failures, a "success" message is displayed as shown below: + +.. image:: ../../../images/local_repo_success.png \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/Network.rst b/docs/source/Troubleshooting/FAQ/Common/Network.rst new file mode 100644 index 000000000..5dc7cd032 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/Network.rst @@ -0,0 +1,12 @@ +Network +========= + +⦾ **Why does splitting an ethernet Z series port fail with "Failed. Either port already split with different breakout value or port is not available on ethernet switch"?** + +**Potential Cause**: + + 1. The port is already split. + + 2. It is an even-numbered port. + +**Resolution**: Changing the ``breakout_value`` on a split port is currently not supported. Ensure the port is un-split before assigning a new ``breakout_value``.
\ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/Provision.rst b/docs/source/Troubleshooting/FAQ/Common/Provision.rst new file mode 100644 index 000000000..9c6db6481 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/Provision.rst @@ -0,0 +1,65 @@ +Provision +========== + +⦾ **Why is the provisioning status of my target servers stuck at ‘powering-on’ in the cluster.nodeinfo (omniadb)?** + +**Potential Cause**: + + * Hardware issues (Auto-reboot may fail due to hardware tests failing) + * The target node may already have an OS and the first boot PXE device is not configured correctly. + +**Resolution**: + + * Resolve/replace the faulty hardware and PXE boot the node. + * Target servers should be configured to boot in PXE mode with the appropriate NIC as the first boot device. + +⦾ **What to do if PXE boot fails while discovering target nodes via switch_based discovery with provisioning status stuck at 'powering-on' in cluster.nodeinfo (omniadb):** + +.. image:: ../../../images/PXEBootFail.png + +1. Rectify any probable causes like incorrect/unavailable credentials (``switch_snmp3_username`` and ``switch_snmp3_password`` provided in ``input/provision_config.yml``), network glitches, having multiple NICs with the same IP address as the OIM, or incorrect switch IP/port details. +2. Run the cleanup script: :: + + cd utils + ansible-playbook oim_cleanup.yml + +3. Re-run the provision tool (``ansible-playbook discovery_provision.yml``). + +⦾ **Why are the status and admin_mac fields not populated for specific target nodes in the cluster.nodeinfo table?** + +**Causes**: + + * Nodes do not have their first PXE device set as designated active NIC for PXE booting. + * Nodes that have been discovered via multiple discovery mechanisms may list multiple times. Duplicate node entries will not list MAC addresses. + +**Resolution**: + + * Configure the first PXE device to be active for PXE booting. + * PXE boot the target node manually. + * Duplicate node objects (identified by service tag) will be deleted automatically. To manually delete node objects, use ``utils/delete_node.yml``. + +⦾ **What to do if user login fails when accessing a cluster node?** + +.. image:: ../../../images/UserLoginError.png + +**Potential Cause**: The SSH key on the OIM may be outdated. + +**Resolution**: + + * Refresh the key using ``ssh-keygen -R ``. + * Retry login. + +⦾ **Why is the node status stuck at 'powering-on' or 'powering-off' after an OIM reboot?** + +**Potential Cause**: The nodes were powering off or powering on during the OIM reboot/shutdown. + +**Resolution**: In the case of a planned shutdown, ensure that the OIM is shut down after the compute nodes. When powering back up, the OIM should be powered on and xCAT services resumed before bringing up the compute nodes. In short, have the OIM as the first node up and the last node down. + +For more information, `click here `_ + +⦾ **What to do if the Lifecycle Controller (LC) is not ready?** + +**Resolution**: + +* Verify that the LC is in a ready state for all servers using: ``racadm getremoteservicesstatus`` +* PXE boot the target server.
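+
+While working through any of the fixes above, it can help to confirm the current provisioning status by querying the omniadb database directly. The following is a minimal sketch, assuming the default Postgres setup on the OIM and the ``cluster.nodeinfo`` table referenced throughout this page; adjust the column list to match your Omnia version: ::
+
+    psql -U postgres -d omniadb -c 'select node, admin_mac, status from cluster.nodeinfo;'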
\ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/Security.rst b/docs/source/Troubleshooting/FAQ/Common/Security.rst new file mode 100644 index 000000000..462bb0f5a --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/Security.rst @@ -0,0 +1,31 @@ +Centralized authentication +============================= + +⦾ **Why am I unable to login using LDAP credentials after successfully creating a user account?** + +**Potential Cause**: Whitespaces in the LDIF file may have caused an encryption error. Verify whether there are any whitespaces in the file by running ``cat -vet ``. + +**Resolution**: Remove the whitespaces and re-run the LDIF file. + + +⦾ **Why does the** ``TASK [hostname_validation : Verify the domain name is not blank in hostname]`` **fail?** + +**Potential Cause**: The hostname is not configured properly with the domain name on the target node. + +**Resolution**: Use the following commands to configure the hostname properly: :: + + + sysctl kernel.hostname=node001.omnia.test + hostnamectl set-hostname node001.omnia.test + + +.. note:: ``node001.omnia.test`` is an acceptable sample hostname. + + +⦾ **Why does the user login fail for an OpenLDAP user?** + +**Potential Cause**: Incorrect OpenLDAP service is running on the authentication server. + +**Resolution**: Ensure that ``slapd-ltb.service`` is running on the authentication server. Use the following command to check if the service is running: :: + + systemctl status slapd-ltb.service \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/Storage.rst b/docs/source/Troubleshooting/FAQ/Common/Storage.rst new file mode 100644 index 000000000..b836b3dc0 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/Storage.rst @@ -0,0 +1,10 @@ +Storage +========= + +⦾ **Why does the** ``TASK [beegfs : Rebuilding BeeGFS client module]`` **fail?** + +.. image:: ../../../images/BeeGFSFailure.png + +**Potential Cause**: BeeGFS version 7.3.0 is in use. + +**Resolution**: Use BeeGFS client version 7.3.1 while setting up BeeGFS on the cluster. \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/Telemetry.rst b/docs/source/Troubleshooting/FAQ/Common/Telemetry.rst new file mode 100644 index 000000000..869111ca7 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/Telemetry.rst @@ -0,0 +1,16 @@ +Telemetry +========== + +⦾ **Why does the** ``TASK [orchestrator : Deploy MetalLB IP Address pool]`` **fail?** + +.. image:: ../../../images/Metallb_Telemetry_Apptainer_fail.png + +**Potential Cause**: ``/var`` partition is full (potentially due to images not being cleared after intel-oneapi docker images are used to execute benchmarks on the cluster using apptainer support). + +**Resolution**: Clear the ``/var`` partition and retry ``telemetry.yml``. + +⦾ **Why does the** ``TASK [grafana : Wait for grafana pod to come to ready state]`` **fail with a timeout error?** + +**Potential Cause**: Docker pull limit exceeded. + +**Resolution**: Manually input the username and password to your Docker account on the OIM. \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/Virtual_Env.rst b/docs/source/Troubleshooting/FAQ/Common/Virtual_Env.rst new file mode 100644 index 000000000..d20c323af --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/Virtual_Env.rst @@ -0,0 +1,40 @@ +Virtual environment +===================== + +⦾ **After executing the** ``prereq.sh`` **script, why do I see the following error?** + +.. 
image:: ../../../images/virtual_env_1.png + :width: 600pt + +**Potential Cause**: The virtual environment created by the ``prereq.sh`` script is not activated. The activation of the virtual environment is completely user-driven. + +**Resolution**: + +* Executing ``./prereq.sh`` installs all the packages and sets up the virtual environment, but does not activate it. You can activate the Python virtual environment using the following command: :: + + source /opt/omnia/omnia17_venv/bin/activate + + .. image:: ../../../images/virtual_env_2.png + + +* To verify that the virtual environment is active, check if the following prompt is displayed: :: + + (omnia) [root@ omnia]# + + +⦾ **While executing any Omnia playbook, why do I encounter a "Command not found" or "Command 'ansible-playbook' not found" error?** + +.. image:: ../../../images/virtual_env_error_1.png + :width: 650pt + +.. image:: ../../../images/virtual_env_error_2.png + +**Potential Cause**: Omnia playbooks are being executed outside of the Omnia virtual environment. + +**Resolution**: Use the ``prereq.sh`` script to set up the Omnia virtual environment and install Ansible on the OIM. Activate the virtual environment following the steps provided `here <../../../OmniaInstallGuide/Ubuntu/Prereq.sh/index.html>`_ and then execute the playbooks. + +⦾ **Why does executing an Omnia playbook outside the git cloned Omnia repository folder lead to failure?** + +**Potential Cause**: Omnia does not support execution of playbooks outside of the git cloned Omnia repository folder. + +**Resolution**: Ensure that all playbooks are executed from inside the git cloned Omnia repository folder. \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Common/index.rst b/docs/source/Troubleshooting/FAQ/Common/index.rst new file mode 100644 index 000000000..24e6320e4 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Common/index.rst @@ -0,0 +1,13 @@ +Common FAQs +============ + +.. toctree:: + + General_Query + Virtual_Env + LocalRepo + Provision + Network + Security + Storage + Telemetry \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/RHEL/Provision.rst b/docs/source/Troubleshooting/FAQ/RHEL/Provision.rst new file mode 100644 index 000000000..4a1816232 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/RHEL/Provision.rst @@ -0,0 +1,35 @@ +Provision +========== + +⦾ **Why does the provisioning status of RHEL/Rocky Linux remote servers remain stuck at ‘installing’ in cluster.nodeinfo (omniadb)?** + +.. image:: ../../../images/InstallingStuckDB.png + +.. image:: ../../../images/InstallCorruptISO.png + +.. csv-table:: + :file: ../../../Tables/FAQ_provision.csv + :header-rows: 1 + :keepspace: + +⦾ **Why do subscription errors occur on a RHEL OIM when** ``rhel_repo_local_path`` **in** ``input/provision_config.yml`` **is not provided and the OIM does not have an active subscription?** + +.. image:: ../../../images/SubscriptionErrors.png + +For many of Omnia's features to work, RHEL OIMs need access to the following repositories: + + 1. AppStream + 2. BaseOS + +This can only be achieved using local repos specified in ``rhel_repo_local_path`` (``input/provision_config.yml``). + +.. 
note:: + To enable the repositories, run the following commands: :: + + subscription-manager repos --enable=codeready-builder-for-rhel-8-x86_64-rpms + subscription-manager repos --enable=rhel-8-for-x86_64-appstream-rpms + subscription-manager repos --enable=rhel-8-for-x86_64-baseos-rpms + + Verify your changes by running: :: + + yum repolist enabled \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/RHEL/Security.rst b/docs/source/Troubleshooting/FAQ/RHEL/Security.rst new file mode 100644 index 000000000..5ebb91f67 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/RHEL/Security.rst @@ -0,0 +1,9 @@ +Centralized authentication +============================= + +⦾ **Why don't IPA commands work after setting up FreeIPA on the cluster?** + +**Potential Cause**: Kerberos authentication may be missing on the target node. + +**Resolution**: Run ``kinit admin`` on the node and provide the ``kerberos_admin_password`` when prompted. (This password is also entered in ``input/security_config.yml``) + diff --git a/docs/source/Troubleshooting/FAQ/RHEL/index.rst b/docs/source/Troubleshooting/FAQ/RHEL/index.rst new file mode 100644 index 000000000..29623fe2a --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/RHEL/index.rst @@ -0,0 +1,8 @@ +FAQs related to RHEL/Rocky Linux clusters +=========================================== + +.. toctree:: + + Provision + Security + diff --git a/docs/source/Troubleshooting/FAQ/Ubuntu/Provision.rst b/docs/source/Troubleshooting/FAQ/Ubuntu/Provision.rst new file mode 100644 index 000000000..7562dcef9 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Ubuntu/Provision.rst @@ -0,0 +1,23 @@ +Provision +=========== + +⦾ **Why does the provisioning status of Ubuntu remote servers remain stuck at ‘bmcready’ or 'powering-on' in cluster.nodeinfo (omniadb)?** + +.. image:: ../../../images/ubuntu_pxe_failure.png + +.. csv-table:: + :file: ../../../Tables/FAQ_provision.csv + :header-rows: 1 + :keepspace: + +⦾ **Why does the provisioning status of the Kubernetes RoCE pod remain stuck in the 'Pending' or 'ContainerCreating' state?** + +**Potential Cause**: This issue is encountered if incorrect parameter values are provided during the installation of the Kubernetes plugin for the RoCE NIC. For more information about the parameters and their accepted values, `click here <../../../OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/k8s_plugin_roce_nic.html>`_. + +**Resolution**: If the RoCE pod is in the 'Pending' or 'ContainerCreating' state, describe the pod to check for issues. If there is a mistake in the parameter values provided, use ``delete_roce_plugin.yml`` to delete the configurations made for the Kubernetes RoCE plugin, append the ``input/roce_plugin_config.yml`` with correct values and re-deploy the RoCE pod by executing ``deploy_roce_plugin.yml``. + +⦾ **Why does the node get stuck in "standby" status and continuously PXE boot during the installation of AMD ROCm drivers in the cluster provisioning process?** + +**Potential Cause**: This can happen due to any hardware or firmware issues on the node. + +**Resolution**: Resolve the underlying hardware or firmware issues and re-run the ``discovery_provision.yml`` playbook. \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/Ubuntu/index.rst b/docs/source/Troubleshooting/FAQ/Ubuntu/index.rst new file mode 100644 index 000000000..a53380b68 --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/Ubuntu/index.rst @@ -0,0 +1,6 @@ +FAQs related to Ubuntu clusters +================================= + +.. 
toctree:: + + Provision \ No newline at end of file diff --git a/docs/source/Troubleshooting/FAQ/index.rst b/docs/source/Troubleshooting/FAQ/index.rst new file mode 100644 index 000000000..a9fd33b8f --- /dev/null +++ b/docs/source/Troubleshooting/FAQ/index.rst @@ -0,0 +1,8 @@ +Frequently Asked Questions (FAQs) +=================================== + +.. toctree:: + + RHEL/index + Ubuntu/index + Common/index \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Common/AITools.rst b/docs/source/Troubleshooting/KnownIssues/Common/AITools.rst new file mode 100644 index 000000000..a806c84e6 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/AITools.rst @@ -0,0 +1,47 @@ +AI Tools +========= + +⦾ **What to do if pulling the Kserve inference model fails with** ``Unable to fetch image "kserve/sklearnserver:v0.11.2": failed to resolve image to digest: Get "https://index.docker.io/v2/": dial tcp 3.219.239.5:443: i/o timeout.`` **?** + +**Resolution**: + +1. Edit the Kubernetes configuration map: :: + + kubectl edit configmap -n knative-serving config-deployment + +2. Add docker.io and index.docker.io as part of the ``registries-skipping-tag-resolving`` setting. + +For more information, `click here. `_ + + +⦾ **What to do when Kubeflow pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing the** ``kubeflow.yml`` **playbook?** + +**Potential Cause**: Your Docker pull limit has been exceeded. For more information, `click here. `_ + +**Resolution**: + +1. Delete the Kubeflow deployment by executing the following command from the ``kube_control_plane``: :: + + kfctl delete -V -f /root/k8s/omnia-kubeflow/kfctl_k8s_istio.v1.0.2.yaml + +2. Re-execute ``kubeflow.yml`` after 8-9 hours. + + +⦾ **What to do if the JupyterHub or Prometheus UI is not accessible?** + +**Resolution**: Run the command ``kubectl get pods --namespace default`` to ensure that the **nfs-client** pod and all Prometheus server pods are in the **Running** state. + + +⦾ **What to do if JupyterHub pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing the** ``jupyterhub.yml`` **playbook?** + +**Potential Cause**: Your Docker pull limit has been exceeded. For more information, `click here `_. + +**Resolution**: + +1. Delete the JupyterHub deployment by executing the following command on the ``kube_control_plane``: :: + + helm delete jupyterhub -n jupyterhub + +2. Re-execute ``jupyterhub.yml`` after 8-9 hours. + + diff --git a/docs/source/Troubleshooting/KnownIssues/Common/General_Query.rst b/docs/source/Troubleshooting/KnownIssues/Common/General_Query.rst new file mode 100644 index 000000000..e74965ea0 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/General_Query.rst @@ -0,0 +1,26 @@ +General Query +============== + +⦾ **Why does the** ``TASK [gather_facts_from_all_the_nodes]`` **get stuck while re-running the** ``omnia.yml`` **playbook?** + +**Potential Cause**: Corrupted entries in the ``/root/.ansible/oim/`` folder. For more information on this issue, `check this out `_! + +**Resolution**: Clear the directory ``/root/.ansible/oim/`` using the following commands: :: + + cd /root/.ansible/oim/ + + rm -rf * + +Alternatively, run the task manually: :: + + cd omnia/utils/cluster + ansible-playbook gather_facts_resolution.yml + + +⦾ **What to do if** ``omnia.yml`` **execution fails with a** ``403: Forbidden`` **error when an NFS share is provided as the** ``repo_store_path`` **?** + +.. 
image:: ../../../images/omnia_NFS_403.png + +**Potential Cause**: For ``omnia.yml`` execution, the NFS share folder provided in ``repo_store_path`` must have 755 permissions. + +**Resolution**: Ensure that the NFS share folder provided as the ``repo_store_path`` has 755 permissions, and re-run ``omnia.yml``. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Common/Kubernetes.rst b/docs/source/Troubleshooting/KnownIssues/Common/Kubernetes.rst new file mode 100644 index 000000000..9b3e9040f --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/Kubernetes.rst @@ -0,0 +1,130 @@ +Kubernetes +=========== + +⦾ **Why do Kubernetes Pods show "ImagePullBack" or "ErrPullImage" errors in their status?** + +**Potential Cause**: The errors occur when the Docker pull limit is exceeded. + +**Resolution**: + + * Ensure that the ``docker_username`` and ``docker_password`` are provided in ``input/provision_config_credentials.yml``. + + * For an HPC cluster, during ``omnia.yml`` execution, a Kubernetes secret 'dockerregcred' will be created in the default namespace and patched to the service account. Users need to patch this secret in their respective namespaces while deploying custom applications, and use the secret as ``imagePullSecrets`` in the YAML file to avoid ErrImagePull. `Click here for more info. `_ + +.. note:: If the playbook is already executed and the pods are in **ImagePullBack** state, then run ``kubeadm reset -f`` in all the nodes before re-executing the playbook with the docker credentials. + + +⦾ **What to do if the nodes in a Kubernetes cluster reboot?** + +**Resolution**: Wait for 15 minutes after the Kubernetes cluster reboots. Next, verify the status of the cluster using the following commands: + +* ``kubectl get nodes`` on the kube_control_plane to get the real-time Kubernetes cluster status. + +* ``kubectl get pods --all-namespaces`` on the kube_control_plane to check which pods are in the **Running** state. + +* ``kubectl cluster-info`` on the kube_control_plane to verify that both the Kubernetes master and kubeDNS are in the **Running** state. + + +⦾ **What to do when the Kubernetes services are not in "Running" state:** + +**Resolution**: + +1. Run ``kubectl get pods --all-namespaces`` to verify that all pods are in the **Running** state. + +2. If the pods are not in the **Running** state, delete the pods using the command: ``kubectl delete pods `` + +3. Run the corresponding playbook that was used to install Kubernetes: ``omnia.yml``, ``jupyterhub.yml``, or ``kubeflow.yml``. + + +⦾ **Why do Kubernetes Pods stop communicating with the servers when the DNS servers are not responding?** + +**Potential Cause**: The host network is faulty, causing DNS to be unresponsive. + +**Resolution**: + +1. In your Kubernetes cluster, run ``kubeadm reset -f`` on all the nodes. + +2. On the management node, edit the ``omnia_config.yml`` file to change the Kubernetes Pod Network CIDR. The suggested IP range is 192.168.0.0/16. Ensure that the IP provided is not in use on your host network. + +3. List ``k8s`` in ``input/software_config.json`` and re-run ``omnia.yml``. + + +⦾ **Why does the 'Initialize Kubeadm' task fail with 'nnode.Registration.name: Invalid value: \"\"'?** + +**Potential Cause**: The OIM does not support hostnames with an underscore in it, such as 'mgmt_station'. + +**Resolution**: As defined in RFC 822, the only legal characters are the following: + +1. 
Alphanumeric (a-z and 0-9): Both uppercase and lowercase letters are acceptable, and the hostname is not case-sensitive. In other words, omnia.test is identical to OMNIA.TEST and Omnia.test. + +2. Hyphen (-): Neither the first nor the last character in a hostname field should be a hyphen. + +3. Period (.): The period should be used only to delimit fields in a hostname (for example, dvader.empire.gov). + + +⦾ **What to do if** ``omnia.yml`` **playbook execution fails with MetalLB, a load-balancer for bare-metal Kubernetes clusters?** + +**Potential Cause**: This failure is caused due to an issue with Kubespray, a third-party software. For more information about this issue, `click here `_. + +**Resolution**: If your ``omnia.yml`` playbook execution fails while waiting for the MetalLB controller to be up and running, you need to wait for the MetalLB pods to come to the running state and then re-run ``omnia.yml``/``scheduler.yml``. + + +⦾ **Why does the** ``omnia.yml`` **or** ``scheduler.yml`` **playbook execution fail with an** ``Unable to retrieve file contents`` **error?** + +.. image:: ../../../images/kubespray_error.png + +**Potential Cause**: This error occurs when the Kubespray collection is not installed during the execution of ``prepare_oim.yml``. + +**Resolution**: Re-run ``prepare_oim.yml``. + + +⦾ **Why does the NFS-client provisioner go to a "ContainerCreating" or "CrashLoopBackOff" state?** + +.. image:: ../../../images/NFS_container_creating_error.png + +.. image:: ../../../images/NFS_crash_loop_back_off_error.png + +**Potential Cause**: This issue usually occurs when ``server_share_path`` given in ``storage_config.yml`` for ``k8s_share`` does not have an NFS server running. + +**Resolution**: + + * Ensure that ``storage.yml`` is executed on the same inventory which is being used for ``scheduler.yml``. + * Ensure that ``server_share_path`` mentioned in ``storage_config.yml`` for ``k8s_share: true`` has an active NFS server running on it. + +⦾ **If the NFS-client provisioner is in "ContainerCreating" or "CrashLoopBackOff" state, why does the** ``kubectl describe `` **command show the following output?** + +.. image:: ../../../images/NFS_helm_23743.png + +**Potential Cause**: This is a known issue. For more information, click `here. `_ + +**Resolution**: + + 1. Wait for some time for the pods to come up. **or** + 2. Do the following: + + * Run the following command to delete the pod: :: + + kubectl delete pod -n + + * Post deletion, the pod will be restarted and it will come to the running state. + + +⦾ **Why do the nvidia-device-plugin pods in ContainerCreating status fail with a** ``no runtime for "nvidia" is configured`` **error?** + +.. image:: ../../../images/nvidia_noruntime.png + +**Potential Cause**: nvidia-container-toolkit is not installed on GPU nodes. + +**Resolution**: Install Kubernetes, download nvidia-container-toolkit, and perform the necessary configurations based on the OS running on the cluster. + +⦾ **After running the** ``reset_cluster_configuration.yml`` **playbook on a Kubernetes cluster, which should ideally delete all Kubernetes services and files, it is observed that some Kubernetes logs and configuration files are still present on the** ``kube_control_plane``. **However, these left-over files do not cause any issues for Kubernetes re-installation on the cluster. 
The files are present under the following directories:** + +* ``/var/log/containers/`` +* ``/sys/fs/cgroup/`` +* ``etc/system`` +* ``/run/systemd/transient/`` +* ``/tmp/releases`` + +**Potential Cause**: When ``reset_cluster_configuration.yml`` is executed on a Kubernetes cluster, it triggers the Kubespray playbook ``kubernetes_sigs.kubespray.reset`` internally, which is responsible for removing Kubernetes configuration and services from the cluster. However, this Kubespray playbook doesn't delete all Kubernetes services and files, resulting in some files being left behind on the ``kube_control_plane``. + +**Workaround**: After running the ``reset_cluster_configuration.yml`` playbook on a Kubernetes cluster, users can choose to remove the files from the directories mentioned above if they wish to do so. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Common/LocalRepo.rst b/docs/source/Troubleshooting/KnownIssues/Common/LocalRepo.rst new file mode 100644 index 000000000..b74f1a7c5 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/LocalRepo.rst @@ -0,0 +1,73 @@ +Local Repositories +=================== + +⦾ **Why does running** ``local_repo.yml`` **fail with connectivity errors?** + +**Potential Cause**: The OIM was unable to reach a required online resource due to a network glitch. + +**Resolution**: Verify all connectivity and re-run the playbook. + + +⦾ **Why does any script that installs software fail with a** ``The checksum for did not match.`` **error?** + +**Potential Cause**: A local repository for the software has not been configured by the ``local_repo.yml`` playbook. + +**Resolution**: + + * Delete the tarball/image/deb of the software from ``/cluster/tarball``. + * Re-run ``local_repo.yml``. + * Re-run the script to install the software. + + +⦾ **Why does the** ``TASK [configure_registry : Start and enable nerdctl-registry service]`` **fail with** ``Job for nerdctl-registry.service failed because the control process exited with error code`` **?** + +.. image:: ../../../images/nerdctlError.png + +**Potential Causes**: + + * The subnet 10.4.0.0/24 has been assigned to the admin, bmc, or additional network. nerdctl uses this subnet by default, so it cannot be assigned to any other interface in the system. + * The docker pull limit has been breached. + +**Resolutions**: + + * Reassign the conflicting network to a different subnet. + * Update ``input/provision_config_credentials.yml`` with the ``docker_username`` and ``docker_password``. + + +⦾ **Why does the** ``TASK [parse_and_download : Get libssl package]`` **fail during** ``local_repo.yml`` **execution?** + +.. image:: ../../../images/local_repo_permissions_error.png + +**Potential Cause**: ``local_repo.yml`` was executed with ``repo_store_path`` set to an NFS share, but without the necessary permissions to access it from the OIM. + +**Resolution**: Provide the required (read, write, and execute) permissions for the NFS share. Verify the permissions of the NFS share as the root user of the OIM. + + +⦾ **Why does the** ``TASK [parse_and_download : Display Failed Packages]`` **fail during** ``prepare_upgrade.yml`` **execution?** + +.. image:: ../../../images/upgrade_failed_packages.png + +**Potential Cause**: This issue may arise while setting up the local repo for Omnia v1.6 and can occur due to internet connectivity issues on the OIM. + +**Resolution**: Verify that the internet connectivity on the OIM is stable and re-run the ``prepare_upgrade.yml`` playbook. 
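+
+Before re-running ``prepare_upgrade.yml`` (or ``local_repo.yml``), it can help to sanity-check connectivity from the OIM. The following is a minimal sketch, assuming ``curl`` is available on the OIM and using ``index.docker.io`` as a representative endpoint; substitute the repository URLs your cluster actually pulls from: ::
+
+    # Any HTTP status line (even 401) indicates the endpoint is reachable; a timeout indicates a connectivity issue
+    curl -sI https://index.docker.io/v2/ | head -n 1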
+ + +⦾ **Why does the** ``TASK [configure_repos : Generate metadata for repositories]`` **fail during the execution of** ``local_repo.yml`` **on RHEL clusters if the Epel repository is unstable?** + +**Potential Cause**: If the external Epel repository link mentioned in ``omnia_repo_url_rhel`` is not stable, it can cause failures during ``local_repo.yml`` playbook execution. + +**Resolution**: + +1. Check if the Epel repository link mentioned in ``omnia_repo_url_rhel`` is accessible. + +2. Verify the required software listed in ``software_config.json``, by examining the corresponding ``.json`` files located in the ``input/config/rhel/`` directory. Users can do either of the following, based on the findings: + + - If none of the packages are dependent on the Epel repository, users can remove the Epel repository URL from ``omnia_repo_url_rhel``. + + - If any package required from the Epel repository is listed in the ``software_config.json`` file, it's advisable to either wait for the Epel repository to stabilize or host those Epel repository packages locally. Afterward, remove the Epel repository link from ``omnia_repo_url_rhel`` and provide the locally hosted URL for the Epel repository packages via the ``user_repo_url`` variable. + +⦾ **Why does** ``omnia.yml`` **execution fail during the** ``TASK [kubernetes_sigs.kubespray.container-engine/runc : Download_file | Create dest directory on node]`` **?** + +**Potential Cause**: This issue may arise if the directory path specified as the ``repo_store_path`` in the ``input/local_repo_config.yml`` does not have 755 permissions. + +**Resolution**: Ensure that not only the ``omnia_repo`` folder, but also the entire ``repo_store_path``, has 755 permissions. For example, if you specify ``/root/opt/omnia_repo`` as the ``repo_store_path``, the ``/root`` directory also must have 755 permissions. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Common/Network.rst b/docs/source/Troubleshooting/KnownIssues/Common/Network.rst new file mode 100644 index 000000000..bc7a25506 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/Network.rst @@ -0,0 +1,22 @@ +Network +======== + +⦾ **Why does the** ``TASK [infiniband_switch_config : Authentication failure response]`` **fail with the message:** ``Status code was -1 and not [302]: Request failed: `` **on Infiniband switches when executing the** ``infiniband_switch_config.yml`` **playbook?** + +**Potential Cause**: To configure a new Infiniband switch, the HTTP and JSON gateways must be enabled. To verify that they are enabled, run: + +* Check if HTTP is enabled: ``show web`` + +* Check if JSON Gateway is enabled: ``show json-gw`` + +**Resolution**: To correct the issue, run: + +* Enable the HTTP gateway: ``web http enable`` + +* Enable the JSON gateway: ``json-gw enable`` + +⦾ **During consecutive runs of the** ``server_spec_update.yml`` **playbook, the additional NICs may not be configured according to the inputs provided in the** ``input/server_spec.yml`` **file, or any unexpected behavior may occur.** + +**Potential Cause**: Omnia does not support modifying the category definitions (for example, ``nic_name``, ``nicnetwork``, or ``nictype``) in ``input/server_spec.yml`` or changing the category details in the inventory file provided, during consecutive runs of the ``server_spec_update.yml`` playbook. + +**Resolution**: Re-provision the nodes, as shown in the sketch below. 
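+
+Re-provisioning is triggered from the OIM by re-running the provision playbook, as used elsewhere in this guide: ::
+
+    ansible-playbook discovery_provision.yml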
diff --git a/docs/source/Troubleshooting/KnownIssues/Common/Provision.rst b/docs/source/Troubleshooting/KnownIssues/Common/Provision.rst new file mode 100644 index 000000000..6205338cc --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/Provision.rst @@ -0,0 +1,95 @@ +Provision +========== + +⦾ **Why doesn't my newly discovered server list a MAC ID in the cluster.nodeinfo table?** + +Due to internal MAC ID conflicts on the target nodes, the MAC address will be listed against the target node using this format ``MAC ADDRESS 1 | MAC ADDRESS 2! *NOIP*`` in the xCAT node object. + +.. image:: ../../../images/MACConflict.png + + +⦾ **Why do we see** ``TASK [provision_validation : Failed - Assign admin nic IP]`` **while executing the** ``discovery_provision.yml`` **playbook?** + +.. image:: ../../../images/AdminNICErrors.png + +**Potential Cause:** Omnia validates the admin NIC IP on the OIM. If the user has not assigned an admin NIC IP in the case of the dedicated network interface type, an error message is returned. Parsing logic is applied to the blank IP and hence the error displays twice. + +**Resolution**: Ensure an IP is assigned to the admin NIC on the OIM. + + +⦾ **Why are some target servers not reachable after PXE booting them?** + +**Potential Causes**: + +1. The server hardware does not allow for auto rebooting. + +2. The process of PXE booting the node has stalled. + +**Resolution**: + +1. Log in to the iDRAC console to check if the server is stuck in boot errors (F1 prompt message). If true, clear the hardware error or disable POST (PowerOn Self Test). + +2. Hard-reboot the server to bring up the server and verify that the boot process runs smoothly. (If it gets stuck again, disable PXE and try provisioning the server via iDRAC.) + + +⦾ **Why does PXE boot fail with tftp timeout or service timeout errors?** + +**Potential Causes**: + +* RAID is configured on the server. + +* Two or more servers in the same network have xCAT services running. + +* The target cluster node does not have a configured PXE device with an active NIC. + +**Resolution**: + +* Create a Non-RAID or virtual disk on the server. + +* Check if other systems except for the OIM have ``xcatd`` running. If yes, then stop the xCAT service using the following command: ``systemctl stop xcatd``. + +* On the server, go to **BIOS Setup -> Network Settings -> PXE Device**. For each listed device (typically 4), configure an active NIC under ``PXE device settings``. + + +⦾ **The** ``discovery_provision.yml`` **playbook fails to check for duplicate** ``disk_partition`` **values in** ``input/provision_config.yml`` **.** + +**Resolution**: Users need to ensure that there are no duplicate entries for the same partition in ``provision_config.yml``. + + +⦾ **After executing** ``discovery_provision.yml`` **, why is the node status in OmniaDB displayed as "standingby"?** + +**Resolution**: For any discovery mechanism other than switch-based, do the following: + + 1. Execute the following command: :: + + chdef status="" + + 2. Then run: :: + + rinstall + + Where refers to the node column in the OmniaDB, which has a "standingby" status. + + +⦾ **Why does the** ``discovery_provision.yml`` **playbook execution fail at task: "prepare_oim needs to be executed"?** + +**Potential Cause**: Invalid input provided in ``network_spec.yml`` for ``admin_network`` or ``bmc_network`` fields. 
+ +**Resolution**: Perform a cleanup using ``oim_cleanup.yml`` with ``--tags provision`` and then re-run the ``discovery_provision.yml`` playbook. Execute the following commands: + + :: + + ansible-playbook utils/oim_cleanup.yml --tags provision + ansible-playbook discovery_provision.yml + + +⦾ **While executing the** ``discovery_provision.yml`` **playbook from the OIM, some of the cluster nodes fail to boot up and omniadb captures the node status as "failed".** + +.. image:: ../../../images/waco_node_boot_failure.png + +**Potential Cause**: This issue is encountered due to any configuration failure during node provisioning. + +**Resolution**: Perform the following steps: + + 1. Delete the failed node from the DB using the ``delete_node.yml`` playbook utility. For more information, `click here <../../../OmniaInstallGuide/Maintenance/deletenode.html>`_. + 2. Re-provision the node by re-running the ``discovery_provision.yml`` playbook. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Common/Security.rst b/docs/source/Troubleshooting/KnownIssues/Common/Security.rst new file mode 100644 index 000000000..ba74c238c --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/Security.rst @@ -0,0 +1,30 @@ +Centralized authentication +============================= + +⦾ **What to do when** ``omnia.yml`` **fails while completing the security role, and returns the following error message: 'Error: kinit: Connection refused while getting default cache'?** + +**Resolution**: + +1. Start the sssd-kcm.socket: ``systemctl start sssd-kcm.socket`` + +2. Re-run ``omnia.yml`` + + +⦾ **Why does the task 'security: Authenticate as admin' fail?** + +**Potential Cause**: The required services are not running on the node. Verify the service status using: :: + + systemctl status sssd-kcm.socket + systemctl status sssd.service + +**Resolution**: + +1. Restart the services using: :: + + systemctl start sssd-kcm.socket + systemctl start sssd.service + +2. Re-run ``omnia.yml`` using: :: + + ansible-playbook omnia.yml + diff --git a/docs/source/Troubleshooting/KnownIssues/Common/Storage.rst b/docs/source/Troubleshooting/KnownIssues/Common/Storage.rst new file mode 100644 index 000000000..820f23930 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/Storage.rst @@ -0,0 +1,96 @@ +Storage +======== + +⦾ **Why does the task 'nfs_client: Mount NFS client' fail with** ``Failed to mount NFS client. Make sure NFS Server is running on IP xx.xx.xx.xx`` **?** + +**Potential Cause**: The required services for NFS may not have been running: + + - nfs + - rpc-bind + - mountd + +**Resolution**: Enable the required services using ``firewall-cmd --permanent --add-service=`` and then reload the firewall using ``firewall-cmd --reload``. + + +⦾ **What to do when** ``omnia.yml`` **execution fails with "nfs-server.service might not be running on NFS Server. Please check or start services"?** + +**Potential Cause**: nfs-server.service is not running on the target node. + +**Resolution**: Use the following commands to bring up the service: :: + + systemctl start nfs-server.service + + systemctl enable nfs-server.service + + +⦾ **Why does the task 'Install Packages' fail on the NFS node with the message: Failure in talking to yum: Cannot find a valid baseurl for repo: base/7/x86_64.** + +**Potential Cause**: There are connections missing on the NFS node. + +**Resolution**: Ensure that there are 3 NICs being used on the NFS node: + +1. For provisioning the OS +2. For connecting to the internet (Management purposes) +3. 
For connecting to PowerVault (Data Connection) + + +⦾ **What to do if PowerVault throws the error: The specified disk is not available. - Unavailable disk (0.x) in disk range '0.x-x':** + +**Resolution**: + +1. Verify that the disk in question is not part of any pool using: ``show disks`` + +2. If the disk is part of a pool, remove it and try again. + + +⦾ **Why does PowerVault throw the error: You cannot create a linear disk group when a virtual disk group exists on the system?** + +**Potential Cause**: At any given time only one type of disk group can be created on the system. That is, all disk groups on the system have to exclusively be linear or virtual. + +**Resolution**: To fix the issue, either delete the existing disk group or change the type of pool you are creating. + + +⦾ **Why does the task 'nfs_client: Mount NFS client' fail with the message "No route to host"?** + +**Potential Cause**: There's a mismatch in the share path listed in ``/etc/exports`` and in ``omnia_config.yml`` under ``nfs_client_params``. + +**Resolution**: Ensure that the input paths are a perfect match to avoid any errors. + + +⦾ **Why is my NFS mount not visible on the client?** + +**Potential Cause**: The directory being used by the client as a mount point is already in use by a different NFS export. + +**Resolution**: Verify that the directory being used as a mount point is empty by using ``cd | ls`` or ``mount | grep ``. If empty, re-run the playbook. + +.. image:: ../../../images/omnia_NFS_mount_fcfs.png + + +⦾ **Why does the "BeeGFS-client" service fail?** + +**Potential Causes**: + +1. SELINUX may be enabled. (use ``sestatus`` to diagnose the issue) + +2. Ports 8008, 8003, 8004, 8005 and 8006 may be closed. (use ``systemctl status beegfs-mgmtd, systemctl status beegfs-meta, systemctl status beegfs-storage`` to diagnose the issue) + +3. The BeeGFS setup may be incompatible with RHEL. + +**Resolutions**: + +1. If SELinux is enabled, update the file ``/etc/sysconfig/selinux`` and reboot the server. + +2. Open all ports required by BeeGFS: 8008, 8003, 8004, 8005 and 8006. + +3. Check the `support matrix for RHEL or Rocky Linux <../../../Overview/SupportMatrix/OperatingSystems/index.html>`_ to verify your setup. + +4. For further insight into the issue, check out ``/var/log/beegfs-client.log`` on nodes where the BeeGFS client is running. + + +⦾ **What to do if NFS clients are unable to access the share after an NFS server reboot?** + +Reboot the NFS server (external to the cluster) to bring up the services again: :: + + systemctl disable nfs-server + systemctl enable nfs-server + systemctl restart nfs-server \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Common/Telemetry.rst b/docs/source/Troubleshooting/KnownIssues/Common/Telemetry.rst new file mode 100644 index 000000000..40ca4468f --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/Telemetry.rst @@ -0,0 +1,15 @@ +Telemetry +========== + +⦾ **Why does the telemetry service fail on a compute node and the telemetry database remain empty after executing** ``omnia.yml`` **in a cross-OS setup where the OIM is running Ubuntu 22.04 and the compute nodes are on Ubuntu 20.04?** + +**Potential Cause**: This issue is encountered when there is a mismatch of libc version between the OIM (running on Ubuntu 22.04) and the compute node (running on Ubuntu 20.04). + +**Resolution**: To ensure proper functioning of the telemetry service, ensure that the same libc version is present on the OIM and the compute nodes, as shown in the sketch below. 
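+
+A quick way to compare the libc versions on the OIM and a compute node is shown in the following minimal sketch; ``<node_ip>`` is an illustrative placeholder for a compute node reachable over SSH from the OIM: ::
+
+    # glibc version on the OIM
+    ldd --version | head -n 1
+    # glibc version on a compute node (illustrative address)
+    ssh root@<node_ip> "ldd --version | head -n 1"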
+ +⦾ **Why are there no telemetry metrics available for the AMD accelerators in Omnia telemetry?** + +.. image:: ../../../images/telemetry_mi300.png + +**Potential Cause**: This issue is encountered due to a change in the API response format introduced in the latest AMD ROCm 6.2.2 driver. + diff --git a/docs/source/Troubleshooting/KnownIssues/Common/Upgrade_issues.rst b/docs/source/Troubleshooting/KnownIssues/Common/Upgrade_issues.rst new file mode 100644 index 000000000..b4691a225 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/Upgrade_issues.rst @@ -0,0 +1,10 @@ +Upgrade +========== + +⦾ **The cryptography software is reporting a security vulnerability after upgrading from Omnia 1.6.1 to 1.7.** + +**Potential Cause**: This issue occurs if the cryptography version on the login nodes is lower than 44.0.0. + +**Resolution**: After upgrading your Omnia OIM from 1.6.1 to 1.7, make sure to update the cryptography version on the login nodes to 44.0.0 using the following command: :: + + pip install cryptography==44.0.0 \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Common/index.rst b/docs/source/Troubleshooting/KnownIssues/Common/index.rst new file mode 100644 index 000000000..75b131f4f --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Common/index.rst @@ -0,0 +1,17 @@ +Common Known Issues +====================== + +This topic highlights the common known issues related to: + +.. toctree:: + + General_Query + Upgrade_issues + LocalRepo + Provision + Network + Kubernetes + Security + Storage + Telemetry + AITools \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/RHEL/AITools.rst b/docs/source/Troubleshooting/KnownIssues/RHEL/AITools.rst new file mode 100644 index 000000000..5b8257189 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/RHEL/AITools.rst @@ -0,0 +1,11 @@ +AI Tools +========= + +⦾ **Kserve deployment occasionally fails on RHEL 8.8 clusters.** + +**Potential Cause**: This is a known issue. For more information, check the links below: + + 1. `Reference 1 `_ + 2. `Reference 2 `_ + +**Resolution**: Reprovision the cluster and re-deploy Kserve. The steps to deploy Kserve are located `here <../../../OmniaInstallGuide/Ubuntu/InstallAITools/kserve.html>`_. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/RHEL/Security.rst b/docs/source/Troubleshooting/KnownIssues/RHEL/Security.rst new file mode 100644 index 000000000..a18cb6de6 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/RHEL/Security.rst @@ -0,0 +1,10 @@ +Centralized authentication +============================= + +⦾ **Why would FreeIPA server/client installation fail? (version 1.5 and below)** + +**Potential Cause**: The hostnames of the auth server nodes are not configured in the correct format. + +**Resolution**: If you have enabled the option to install the login node in the cluster, set the hostnames of the nodes in the format: *hostname.domainname*. For example, *authserver-node.omnia.test* is a valid hostname for the auth server node (note that underscores are not valid hostname characters). + +.. note:: To find the cause for the failure of the FreeIPA server and client installation, see *ipaserver-install.log* on the auth server.
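+ +For example, to set a compliant hostname on the auth server node before re-running the installation (a sketch; the hostname and domain shown are placeholders), run: + + :: + + hostnamectl set-hostname authserver-node.omnia.test + # verify the fully qualified domain name + hostname -f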
\ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/RHEL/Slurm.rst b/docs/source/Troubleshooting/KnownIssues/RHEL/Slurm.rst new file mode 100644 index 000000000..d3fa46178 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/RHEL/Slurm.rst @@ -0,0 +1,98 @@ +Slurm +====== + +⦾ **What to do if slurmd services do not start after executing the** ``omnia.yml`` **playbook?** + +**Resolution**: Run the following command to manually restart the slurmd services on the nodes: :: + + systemctl restart slurmd + + +⦾ **What to do when Slurm services do not start automatically after the cluster reboots:** + +**Resolution**: + +* Manually restart the Slurm services on the slurm_control_node by running the following commands: :: + + systemctl restart slurmdbd + systemctl restart slurmctld + systemctl restart prometheus-slurm-exporter + +* Run ``systemctl restart slurmd`` to manually restart the slurmd service on all the cluster nodes, and verify it using ``systemctl status slurmd``. + + +⦾ **What to do if a new Slurm node is not added to the** ``sinfo`` **output of the Slurm control node when** ``restart_slurm_services`` **in the** ``omnia_config.yml`` **is set to** ``false`` **?** + +**Resolution**: + +* Run the following command on the Slurm control node: :: + + systemctl restart slurmctld + +* Verify if the Slurm node was added, using: :: + + sinfo + + +⦾ **Why do Slurm services fail?** + +**Potential Cause**: The ``slurm.conf`` is not configured properly. + +**Resolution**: + +1. Run the following commands: :: + + slurmdbd -Dvvv + slurmctld -Dvvv + +2. Refer to the ``/var/lib/log/slurmctld.log`` file for more information. + + +⦾ **What causes the** ``Ports are Unavailable`` **error?** + +**Potential Cause:** Slurm database connection fails. + +**Resolution:** + +1. Run the following commands: :: + + slurmdbd -Dvvv + slurmctld -Dvvv + +2. Refer to the ``/var/lib/log/slurmctld.log`` file. + +3. Check the output of ``netstat -antp | grep LISTEN`` for PIDs in the listening state. + +4. If PIDs are in the **Listening** state, kill the processes of that specific port. + +5. Restart all Slurm services: :: + + systemctl restart slurmctld on slurm_control_node + systemctl restart slurmdbd on slurm_control_node + systemctl restart slurmd on slurm_node + + +⦾ **What to do if the slurmctld service fails during** ``omnia.yml`` **execution, when** ``slurm_installation_type`` **is** ``nfs_share`` **?** + +**Potential Cause**: This issue may arise due to internal network issues. + +**Resolution**: Re-run the playbook with the same configuration and verify the status of the slurmctld service on the slurm control node. + +⦾ **Why does the** ``TASK: Install packages for slurm`` **fail with the following error message?** + +.. image:: ../../../images/slurm_epel.png + +**Potential Cause**: This error can happen: + + * Due to intermittent connectivity issues with the EPEL8 repositories from where the Slurm packages are downloaded. + * Due to Slurm packages not being downloaded successfully during ``local_repo.yml`` execution. + +**Resolution**: + + * While installing Slurm, Omnia recommends that users proceed with the ``always`` or ``partial`` scenarios of ``repo_config`` in ``input/software_config.json``. + * If the user still wants to proceed with the ``never`` scenario, they must wait for the EPEL8 repositories to be reachable and then re-run the ``local_repo.yml`` playbook to download and install the Slurm packages. + * If the user doesn't want to wait, they can change ``repo_config`` in ``input/software_config.json`` to ``always`` or ``partial``, execute ``oim_cleanup.yml``, and then re-run ``local_repo.yml`` to download the Slurm packages. After the packages are downloaded successfully, users need to provision the cluster and run ``omnia.yml`` to install the Slurm packages on the cluster nodes (see the sketch after this list).
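+ +A minimal sketch of that recovery flow, assuming the ``always`` scenario (run each playbook from its directory in the Omnia repository; exact paths may differ in your checkout): + + :: + + # 1. Set "repo_config": "always" (or "partial") in input/software_config.json + # 2. Clean up and rebuild the local repository: + ansible-playbook utils/oim_cleanup.yml + ansible-playbook local_repo.yml + # 3. After the packages download successfully, provision the cluster, then: + ansible-playbook omnia.yml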
+ +⦾ **Why does the** ``job_based_user_access.yml`` **playbook fail while configuring the** `Slurm PAM module `_ **in either configless or NFS mode?** + +**Resolution**: This is a known issue, and the Omnia team is actively working on a solution. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/RHEL/index.rst b/docs/source/Troubleshooting/KnownIssues/RHEL/index.rst new file mode 100644 index 000000000..58595f4e8 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/RHEL/index.rst @@ -0,0 +1,11 @@ +Known Issues for RHEL/Rocky Linux clusters +============================================== + +This topic highlights the known issues in RHEL clusters related to: + +.. toctree:: + + local_repo + Slurm + Security + AITools \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/RHEL/local_repo.rst b/docs/source/Troubleshooting/KnownIssues/RHEL/local_repo.rst new file mode 100644 index 000000000..8d8b6907b --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/RHEL/local_repo.rst @@ -0,0 +1,8 @@ +Local Repositories +====================== + +⦾ **During the execution of the** ``local_repo.yml`` **playbook, if the** ``rhel_os_url`` **parameter in** ``local_repo_config.yml`` **is set to a Red Hat subscription URL, the playbook execution encounters an error and fails while attempting to contact the subscription URL.** + +**Potential Cause**: To connect to a Red Hat subscription URL, the local repository configuration requires Red Hat subscription authentication, which Omnia does not support. + +**Workaround**: The user is expected to provide a RHEL OS URL for the ``codereadybuilder`` (CRB) repository that does not require a Red Hat subscription. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Ubuntu/Provision.rst b/docs/source/Troubleshooting/KnownIssues/Ubuntu/Provision.rst new file mode 100644 index 000000000..bbda69fcb --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Ubuntu/Provision.rst @@ -0,0 +1,8 @@ +Provision +========== + +⦾ **While provisioning a node in an Ubuntu cluster, the** ``Installing`` **status is not displayed in the cluster.nodeinfo table.** + +**Potential Cause**: This failure is caused by an issue with xCAT, a third-party software. For more information about this issue, `click here `_. + +**Resolution**: Users can track provisioning progress by checking the supported status types. If the status shows ``bmcready`` or ``powering-on``, users can infer that the node is being provisioned. Once the node has been provisioned successfully, it will reflect a ``booted`` status in the OmniaDB. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Ubuntu/index.rst b/docs/source/Troubleshooting/KnownIssues/Ubuntu/index.rst new file mode 100644 index 000000000..8501cfb40 --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Ubuntu/index.rst @@ -0,0 +1,9 @@ +Known Issues for Ubuntu clusters +=================================== + +This topic highlights the known issues in Ubuntu clusters related to: + +.. 
toctree:: + + Provision + intel_gaudi \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/Ubuntu/intel_gaudi.rst b/docs/source/Troubleshooting/KnownIssues/Ubuntu/intel_gaudi.rst new file mode 100644 index 000000000..9aeca16eb --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/Ubuntu/intel_gaudi.rst @@ -0,0 +1,20 @@ +Intel Gaudi accelerators +========================== + +⦾ **Why does the** ``hl-smi`` **command fail to detect the Intel Gaudi drivers installed during provisioning?** + +.. image:: ../../../images/intel_known_issue.png + +**Potential Cause**: This occurs when the Intel Gaudi node has internet access during provisioning. If the node has internet access, the OS kernel gets updated during provisioning, which impacts the Gaudi driver installation. + +**Resolution**: If you encounter the above-mentioned error, run the ``accelerator.yml`` playbook to fix the issue. Omnia recommends installing the Intel Gaudi driver post-provisioning using the ``accelerator.yml`` playbook in case the node has internet connectivity during provisioning. For more information, `click here <../../../OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/Habana_accelerator.html>`_. + +⦾ **Why does the power stress test using** `Habana Labs Qualification Tool (hl_qual) `_ **fail for nodes with Intel Gaudi 3 accelerators?** + +**Resolution**: This is a known issue, and a fix is expected in the upcoming Intel firmware release. + +⦾ **Why are only 7 Intel accelerators displayed after a cluster reboot?** + +**Potential Cause**: This issue occurs when the initialization of High Bandwidth Memory (HBM) fails for Intel Gaudi accelerators, often due to factors like low voltage or memory limitations. + +**Resolution**: This is a known issue, and a fix is expected in the upcoming Intel firmware release. \ No newline at end of file diff --git a/docs/source/Troubleshooting/KnownIssues/index.rst b/docs/source/Troubleshooting/KnownIssues/index.rst new file mode 100644 index 000000000..2ad47acab --- /dev/null +++ b/docs/source/Troubleshooting/KnownIssues/index.rst @@ -0,0 +1,8 @@ +Known Issues +============== + +.. toctree:: + + RHEL/index + Ubuntu/index + Common/index \ No newline at end of file diff --git a/docs/source/Troubleshooting/index.rst b/docs/source/Troubleshooting/index.rst index 91c6081f5..f6bd1d4da 100644 --- a/docs/source/Troubleshooting/index.rst +++ b/docs/source/Troubleshooting/index.rst @@ -2,7 +2,9 @@ Troubleshooting ============== .. toctree:: - knownissues - FAQ + :maxdepth: 2 + + KnownIssues/index + FAQ/index troubleshootingguide diff --git a/docs/source/Troubleshooting/knownissues.rst b/docs/source/Troubleshooting/knownissues.rst deleted file mode 100644 index 87389b213..000000000 --- a/docs/source/Troubleshooting/knownissues.rst +++ /dev/null @@ -1,718 +0,0 @@ -Known issues -============== - -⦾ **Why doesn't my newly discovered server list a MAC ID in the cluster.nodeinfo table?** - -Due to internal MAC ID conflicts on the target nodes, the MAC address will be listed against the target node using this format ``MAC ADDRESS 1 | MAC ADDRESS 2! *NOIP*`` in the xCAT node object. - -.. image:: ../images/MACConflict.png - -⦾ **Why does the task Assign admin NIC IP fail during discovery_provision.yml with errors?** - -.. image:: ../images/AdminNICErrors.png - -**Potential Cause:** Omnia validates the admin NIC IP on the control plane. If the user has not assigned an admin NIC IP in case of dedicated network interface type, an error message is returned. 
There is a parsing logic that is being applied on the blank IP and hence, the error displays twice. - -**Resolution**: Ensure a control plane IP is assigned to the admin NIC. - - -⦾ **Why are some target servers not reachable after PXE booting them?** - - -**Potential Causes**: - -1. The server hardware does not allow for auto rebooting - -2. The process of PXE booting the node has stalled. - -**Resolution**: - -1. Login to the iDRAC console to check if the server is stuck in boot errors (F1 prompt message). If true, clear the hardware error or disable POST (PowerOn Self Test). - -2. Hard-reboot the server to bring up the server and verify that the boot process runs smoothly. (If it gets stuck again, disable PXE and try provisioning the server via iDRAC.) - - -⦾ **Why does the Task [infiniband_switch_config : Authentication failure response] fail with the message 'Status code was -1 and not [302]: Request failed: ' on Infiniband Switches when running infiniband_switch_config.yml?** - -**Potential Cause**: To configure a new Infiniband Switch, HTTP and JSON gateway must be enabled. To verify that they are enabled, run: - -To check if HTTP is enabled: ``show web`` - -To check if JSON Gateway is enabled: ``show json-gw`` - -**Resolution**: To correct the issue, run: - -To enable the HTTP gateway: ``web http enable`` - -To enable the JSON gateway: ``json-gw enable`` - - -⦾ **Why does PXE boot fail with tftp timeout or service timeout errors?** - - -**Potential Causes**: - -* RAID is configured on the server. - -* Two or more servers in the same network have xCAT services running. - -* The target cluster node does not have a configured PXE device with an active NIC. - - - -**Resolution**: - -* Create a Non-RAID or virtual disk on the server. - -* Check if other systems except for the control plane have ``xcatd`` running. If yes, then stop the xCAT service using the following commands: ``systemctl stop xcatd``. - -* On the server, go to **BIOS Setup -> Network Settings -> PXE Device**. For each listed device (typically 4), configure an active NIC under ``PXE device settings`` - - -⦾ **Why does running local_repo.yml fail with connectivity errors?** - -**Potential Cause**: The control plane was unable to reach a required online resource due to a network glitch. - -**Resolution**: Verify all connectivity and re-run the playbook. - -⦾ **Why does any script that installs software fail with "The checksum for did not match."**? - -**Potential Cause**: A local repository for the software was not configured by ``local_repo.yml``. - -**Resolution**: - - * Delete the tarball/image/deb of the software from ``/cluster/tarball``. - * Re-run ``local_repo.yml``. - * Re-run the script to install the software. - - -⦾ **Why do Kubernetes Pods show "ImagePullBack" or "ErrPullImage" errors in their status?** - -**Potential Cause**: The errors occur when the Docker pull limit is exceeded. -**Resolution**: - - * Ensure that the ``docker_username`` and ``docker_password`` are provided in ``input/provision_config_credentials.yml``. - - * For a HPC cluster, during ``omnia.yml`` execution, a kubernetes secret 'dockerregcred' will be created in default namespace and patched to service account. User needs to patch this secret in their respective namespace while deploying custom applications and use the secret as imagePullSecrets in yaml file to avoid ErrImagePull. `Click here for more info. `_ - -.. 
note:: If the playbook is already executed and the pods are in **ImagePullBack** state, then run ``kubeadm reset -f`` in all the nodes before re-executing the playbook with the docker credentials. - -⦾ **Why does the task 'Gather facts from all the nodes' get stuck when re-running omnia.yml?** - -**Potential Cause**: Corrupted entries in the ``/root/.ansible/cp/`` folder. For more information on this issue, `check this out `_! - -**Resolution**: Clear the directory ``/root/.ansible/cp/`` using the following commands: :: - - cd /root/.ansible/cp/ - - rm -rf * - -Alternatively, run the task manually: :: - - cd omnia/utils/cluster - ansible-playbook gather_facts_resolution.yml - -⦾ **What to do if the nodes in a Kubernetes cluster reboot?** - -**Resolution**: Wait for 15 minutes after the Kubernetes cluster reboots. Next, verify the status of the cluster using the following commands: - -* ``kubectl get nodes`` on the kube_control_plane to get the real-time kubernetes cluster status. - -* ``kubectl get pods all-namespaces`` on the kube_control_plane to check which the pods are in the **Running** state. - -* ``kubectl cluster-info`` on the kube_control_plane to verify that both the kubernetes master and kubeDNS are in the **Running** state. - - -⦾ **What to do when the Kubernetes services are not in "Running" state:** - -**Resolution**: - -1. Run ``kubectl get pods all-namespaces`` to verify that all pods are in the **Running** state. - -2. If the pods are not in the **Running** state, delete the pods using the command:``kubectl delete pods `` - -3. Run the corresponding playbook that was used to install Kubernetes: ``omnia.yml``, ``jupyterhub.yml``, or ``kubeflow.yml``. - - -⦾ **Why do Kubernetes Pods stop communicating with the servers when the DNS servers are not responding?** - - -**Potential Cause**: The host network is faulty causing DNS to be unresponsive - -**Resolution**: - -1. In your Kubernetes cluster, run ``kubeadm reset -f`` on all the nodes. - -2. On the management node, edit the ``omnia_config.yml`` file to change the Kubernetes Pod Network CIDR. The suggested IP range is 192.168.0.0/16. Ensure that the IP provided is not in use on your host network. - -3. List k8s in ``input/software_config.json`` and re-run ``omnia.yml``. - - -⦾ **What to do if pulling the Kserve inference model fail with "Unable to fetch image "kserve/sklearnserver:v0.11.2": failed to resolve image to digest: Get "https://index.docker.io/v2/": dial tcp 3.219.239.5:443: i/o timeout."?** - -**Resolution**: - -1. Edit the kubernetes configuration map: :: - - kubectl edit configmap -n knative-serving config-deployment - -2. Add docker.io and index.docker.io as part of the registries-skipping-tag-resolving. - -For more information, `click here. `_ - - -⦾ **Why does the 'Initialize Kubeadm' task fail with 'nnode.Registration.name: Invalid value: \"\"'?** - -**Potential Cause**: The control_plane playbook does not support hostnames with an underscore in it such as 'mgmt_station'. - -**Resolution**: As defined in RFC 822, the only legal characters are the following: - -1. Alphanumeric (a-z and 0-9): Both uppercase and lowercase letters are acceptable, and the hostname is not case-sensitive. In other words, omnia.test is identical to OMNIA.TEST and Omnia.test. - -2. Hyphen (-): Neither the first nor the last character in a hostname field should be a hyphen. - -3. 
Period (.): The period should be used only to delimit fields in a hostname (For example, dvader.empire.gov) - - -⦾ **What to do when Kubeflow pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing kubeflow.yml?** - - -**Potential Cause**: Your Docker pull limit has been exceeded. For more information, `click here. `_ - -**Resolution**: - -1. Delete Kubeflow deployment by executing the following command in kube_control_plane: ``kfctl delete -V -f /root/k8s/omnia-kubeflow/kfctl_k8s_istio.v1.0.2.yaml`` - -2. Re-execute ``kubeflow.yml`` after 8-9 hours - -⦾ **What to do when omnia.yml fails while completing the security role, and returns the following error message: 'Error: kinit: Connection refused while getting default cache'?** - -**Resolution**: - -1. Start the sssd-kcm.socket: ``systemctl start sssd-kcm.socket`` - -2. Re-run ``omnia.yml`` - -⦾ **What to do if slurmd services do not start after running omnia.yml playbook?** - -**Resolution**: Run the following command to manually restart slurmd services on the nodes :: - - systemctl restart slurmd - -⦾ **What to do when Slurm services do not start automatically after the cluster reboots:** - -**Resolution**: - -* Manually restart the slurmd services on the kube_control_plane by running the following commands: :: - - systemctl restart slurmdbd - systemctl restart slurmctld - systemctl restart prometheus-slurm-exporter - -* Run ``systemctl status slurmd`` to manually restart the following service on all the cluster nodes. - -⦾ **What to do if new slurm node is not added to sinfo output of slurm control node when restart_slurm_services in omnia_config.yml is set to "false"?** - -**Resolution**: - -* Run the following command on slurm control node: :: - - systemctl restart slurmctld - -* Verify if the slurm node was added, using: :: - - sinfo - -⦾ **Why do Slurm services fail?** - -**Potential Cause**: The ``slurm.conf`` is not configured properly. - -**Resolution**: - -1. Run the following commands: :: - - slurmdbd -Dvvv - slurmctld -Dvvv - -2. Refer the ``/var/lib/log/slurmctld.log`` file for more information. - -⦾ **What causes the "Ports are Unavailable" error?** - -**Potential Cause:** Slurm database connection fails. - -**Resolution:** - -1. Run the following commands::: - - - - slurmdbd -Dvvv - slurmctld -Dvvv - - - -2. Refer the ``/var/lib/log/slurmctld.log`` file. - -3. Check the output of ``netstat -antp | grep LISTEN`` for PIDs in the listening state. - -4. If PIDs are in the **Listening** state, kill the processes of that specific port. - -5. Restart all Slurm services: :: - - - - slurmctl restart slurmctld on slurm_control_node - - systemctl restart slurmdbd on slurm_control_node - - systemctl restart slurmd on slurm_node - - - -⦾ **Why does the task 'nfs_client: Mount NFS client' fail with ``Failed to mount NFS client. Make sure NFS Server is running on IP xx.xx.xx.xx``?** - -**Potential Cause**: The required services for NFS may not have been running: - - - nfs - - rpc-bind - - mountd - -**Resolution**: Enable the required services using ``firewall-cmd --permanent --add-service=`` and then reload the firewall using ``firewall-cmd --reload``. - -⦾ **What to do when omnia.yml execution fails with nfs-server.service might not be running on NFS Server. Please check or start services?** - -**Potential Cause**: nfs-server.service is not running on the target node. 
- -**Resolution**: Use the following commands to bring up the service: :: - - systemctl start nfs-server.service - - systemctl enable nfs-server.service - - -⦾ **Why does the task `configure registry: Start and enable nerdctl-registry service` fail with "Job for nerdctl-registry.service failed because the control process exited with error code"?** - -.. image:: ../images/nerdctlError.png - - -**Potential Causes**: - - * The subnet 10.4.0.0/24 has been assigned to the admin, bmc, or additional network. nerdctl uses this subnet by default and cannot be assigned to any other interface in the system. - * The docker pull limit has been breached. - -**Resolutions**: - - * Reassign the conflicting network to a different subnet. - * Update ``input/provision_config_credentials.yml`` with the ``docker_username`` and ``docker_password``. - -⦾ **Why does the task 'Install Packages' fail on the NFS node with the message: Failure in talking to yum: Cannot find a valid baseurl for repo: base/7/x86_64.** - - -**Potential Cause**: There are connections missing on the NFS node. - -**Resolution**: - - Ensure that there are 3 NICs being used on the NFS node: - - 1. For provisioning the OS - - 2. For connecting to the internet (Management purposes) - - 3. For connecting to PowerVault (Data Connection) - - -⦾ **What to do when the JupyterHub or Prometheus UI is not accessible?** - -**Resolution**: Run the command ``kubectl get pods namespace default`` to ensure **nfs-client** pod and all Prometheus server pods are in the **Running** state. - - -⦾ **What to do if PowerVault throws the error: The specified disk is not available. - Unavailable disk (0.x) in disk range '0.x-x':** - -**Resolution**: - -1. Verify that the disk in question is not part of any pool using: ``show disks`` - -2. If the disk is part of a pool, remove it and try again. - -⦾ **Why does PowerVault throw the error: You cannot create a linear disk group when a virtual disk group exists on the system.?** - -**Potential Cause**: At any given time only one type of disk group can be created on the system. That is, all disk groups on the system have to exclusively be linear or virtual. - -**Resolution**: To fix the issue, either delete the existing disk group or change the type of pool you are creating. - -⦾ **Why does the task 'nfs_client: Mount NFS client' fail with the message "No route to host"?** - -**Potential Cause**: There's a mismatch in the share path listed in ``/etc/exports`` and in ``omnia_config.yml`` under ``nfs_client_params``. - -**Resolution**: Ensure that the input paths are a perfect match to avoid any errors. - - -⦾ **Why is my NFS mount not visible on the client?** - - -**Potential Cause**: The directory being used by the client as a mount point is already in use by a different NFS export. - -**Resolution**: Verify that the directory being used as a mount point is empty by using ``cd | ls`` or ``mount | grep ``. If empty, re-run the playbook. - -.. image:: ../images/omnia_NFS_mount_fcfs.png - - - - -⦾ **Why does the "BeeGFS-client" service fail?** - -**Potential Causes**: - -1. SELINUX may be enabled. (use ``sestatus`` to diagnose the issue) - -2. Ports 8008, 8003, 8004, 8005 and 8006 may be closed. (use ``systemctl status beegfs-mgmtd, systemctl status beegfs-meta, systemctl status beegfs-storage`` to diagnose the issue) - -3. The BeeGFS set up may be incompatible with RHEL. - - - -**Resolutions**: - -1. If SELinux is enabled, update the file ``/etc/sysconfig/selinux`` and reboot the server. - -2. 
Open all ports required by BeeGFS: 8008, 8003, 8004, 8005 and 8006 - -3. Check the `support matrix for RHEL or Rocky Linux <../Overview/SupportMatrix/OperatingSystems/index.html>`_ to verify your set-up. - -4. For further insight into the issue, check out ``/var/log/beegfs-client.log`` on nodes where the BeeGFS client is running. - - - -⦾ **Why does the task 'security: Authenticate as admin' fail?** - -**Potential Cause**: -The required services are not running on the node. Verify the service status using: :: - - systemctl status sssd-kcm.socket - - systemctl status sssd.service - -**Resolution**: - -* Restart the services using: :: - - systemctl start sssd-kcm.socket - systemctl start sssd.service - -* Re-run ``omnia.yml`` using: :: - - ansible-playbook omnia.yml - - -⦾ **Why would FreeIPA server/client installation fail? (version 1.5 and below)** - - -**Potential Cause**: - -The hostnames of the auth server nodes are not configured in the correct format. - -**Resolution**: - -If you have enabled the option to install the login node in the cluster, set the hostnames of the nodes in the format: *hostname.domainname*. For example, *authserver_node.omnia.test* is a valid hostname for the auth server node. - -.. note:: To find the cause for the failure of the FreeIPA server and client installation, see *ipaserver-install.log* in the auth server. - - - -⦾ **What to do when JupyterHub pods are in 'ImagePullBackOff' or 'ErrImagePull' status after executing jupyterhub.yml:** - -**Potential Cause**: Your Docker pull limit has been exceeded. For more information, `click here `_. - -**Resolution**: - -1. Delete Jupyterhub deployment by executing the following command in kube_control_plane: ``helm delete jupyterhub -n jupyterhub`` - -2. Re-execute ``jupyterhub.yml`` after 8-9 hours. - -⦾ **What to do if NFS clients are unable to access the share after an NFS server reboot?** - -Reboot the NFS server (external to the cluster) to bring up the services again: :: - - systemctl disable nfs-server - systemctl enable nfs-server - systemctl restart nfs-server - - -⦾ **Why do Kuberneteschildnode & kubernetesnodes log as Pass in the database even if there are nodes in the Ready,Schedulingdisabled state?** - -**Potential Cause**: Omnia telemetry considers ``Ready,SchedulingDisabled`` as a Ready state of Kubernetes nodes . So, even if the ``kubectl get nodes`` command shows any node’s state as ``Ready,SchedulingDisabled``, the entry in DB for ``Kuberneteschildnode`` & ``kubernetesnodes`` will be logged as Pass instead of Fail. - -⦾ **What to do if omnia.yml playbook execution fails with MetalLB, a load-balancer for bare metal Kubernetes cluster?** - -**Resolution**: - -If your ``omnia.yml`` playbook execution fails while waiting for the MetalLB controller to be up and running, you need to wait for the MetalLB pods to come to running state and then re-run ``omnia.yml/scheduler.yml``. - -⦾ **What to do if omnia.yml playbook execution fails to execute "kubeadm join" or "kubeadm init" command?** - -**Potential Cause**: An additional active NIC other than the admin NIC is present on the ``kube_control_plane``, with an active internet connection and lower metric value. - -**Resolution**: Perform the following steps: - -1. If ``kubeadm join``/ ``kubeadm init`` command fails, either one of the following should be done: - - * Run ``kubeadm reset -f`` on the node where ``kubeadm join``/ ``kubeadm init`` command fails. - * Reset the cluster using ``utils/reset_cluster_configuration.yml``. - -2. 
After the cluster has been reset, inventory should be updated with argument ``ip``, and ``ip`` should have the value of required admin IP in case node has more than one network interface. If ``kube_control_plane`` has 2 interfaces ``eno1`` and ``eno2`` with IPs ``eno1=10.5.0.3`` and ``eno2=198.168.0.19``, inventory should have the following format: :: - - [kube_control_plane] - - 10.5.0.3 ip=10.5.0.3 - - [kube_node] - - 10.5.0.4 ip=10.5.0.4 - - [etcd] - - 10.5.0.3 ip=10.5.0.3 - -3. Re-run ``omnia.yml`` playbook with the updated inventory file. - -⦾ **What to do if local_repo.yml execution fails with the following error:** - -.. image:: ../images/local_repo_permissions_error.png - -**Potential Cause**: Executing ``local_repo.yml`` with ``repo_store_path`` set as an NFS share, but lacking the necessary permissions to access it from the control plane. - -**Resolution**: Provide the required (read, write, and execute) permissions for the NFS share. Verify the permissions of NFS share from the root user of the control plane. - -⦾ **What to do if omnia.yml execution fails with a "403: Forbidden" error when an NFS share is provided as the repo_store_path?** - -.. image:: ../images/omnia_NFS_403.png - -**Potential Cause**: For omnia.yml execution, the NFS share folder provided in repo_store_path must have 755 permissions. - -**Resolution**: Ensure that the NFS share folder provided as the repo_store_path has 755 permissions, and re-run ``omnia.yml``. - -⦾ **omnia.yml or scheduler.yml playbook execution fails with the following error:** - -.. image:: ../images/kubespray_error.png - -**Potential Cause**: This error occurs when the Kubespray collection is not installed during the execution of ``prepare_cp.yml``. - -**Resolution**: Re-run ``prepare_cp.yml``. - -⦾ **NFS-client provisioner is in "ContainerCreating" or "CrashLoopBackOff" state.** - -.. image:: ../images/NFS_container_creating_error.png - -.. image:: ../images/NFS_crash_loop_back_off_error.png - -**Potential Cause**: This issue usually occurs when ``server_share_path`` given in ``storage_config.yml`` for ``k8s_share`` does not have an NFS server running. - -**Resolution**: - - * Ensure that ``storage.yml`` is executed on the same inventory which is being used for ``scheduler.yml``. - * Ensure that ``server_share_path`` mentioned in ``storage_config.yml`` for ``k8s_share: true`` has an active nfs_server running on it. - -⦾ **Nfs-client provisioner is in "ContainerCreating" or "CrashLoopBackOff" state and "kubectl describe " shows the following output:** - -.. image:: ../images/NFS_helm_23743.png - -**Potential Cause**: This is a known issue. For more information, click `here. `_ - -**Resolution**: - - 1. Wait for some time for the pods to come up. **OR** - 2. Do the following: - - * Run the following command to delete the pod: :: - - kubectl delete pod -n - - * Post deletion, the pod will be restarted and it will come to running state. - -⦾ **What to do if slurmctld services fails when slurm_installaton_type is nfs_share during omnia.yml execution?** - -**Potential Cause**: This issue may arise due to internal network issues. - -**Resolution**: Re-run the playbook with same configuration and verify the status of slurmctld service in the slurm control node. - -⦾ **Why does the task ‘Parse and Download: Display Failed Packages’ fail while running prepare_upgrade.yml?** - -.. 
image:: ../images/upgrade_failed_packages.png - -**Potential Cause**: This issue may arise while setting up of local repo for Omnia v1.6 and can occur due to internet connection issues on control plane. - -**Resolution**: Verify that the internet connectivity on control plane is stable and re-run the ``prepare_upgrade.yml`` playbook. - -⦾ **Why does omnia.yml (or upgrade.yml, in case of upgrade) fail with an error “Unable to retrieve file contents. Could not find or access... kubernetes_sigs.kubespray.cluster on the Ansible Controller”?** - -.. image:: ../images/kubernetes_unable_to_retrieve1.png - -**Potential Cause**: This issue may arise when the task *‘prepare_cp/roles/omnia_appliance_cp: Install Kubespray ansible-collection’* in ``prepare_upgrade.yml`` silently passes (as shown in the following image), without installing the Kubespray ansible-collection. This can happen due to unstable internet connectivity on control plane during installation. - -.. image:: ../images/kubernetes_unable_to_retrieve2.png - -**Resolution**: Manually try to install the Kubespray ansible-collection as shown below and re-run the ``omnia.yml`` playbook (or ``upgrade.yml`` playbook in case of upgrade): - -.. image:: ../images/kubernetes_unable_to_retrieve3.png - -⦾ **Why does the task ‘loki: Start Docker Service’ fail at “Job for docker.service failed because the control process exited with error code” while running upgrade.yml?** - -.. image:: ../images/loki_docker.png - -**Potential Cause**: This issue may arise when the ‘docker0’ interface is already bound to a zone in the firewall settings and Docker tries to use this interface, resulting in a ‘Zone Conflict’. - -**Resolution**: Perform the following steps to adjust your firewall settings, allowing Docker to utilize the 'docker0' interface without encountering conflicts. - -1. Add the the docker0 interface to the docker zone using the following command: :: - - sudo firewall-cmd --zone=docker --add-interface=docker0 --permanent - -2. Reload the firewall to apply the changes, using the following command: :: - - sudo firewall-cmd --reload - -3. Restart docker service to ensure it picks up the changes, using the following command: :: - - sudo systemctl restart docker - -4. Finally, run the following command to ensure docker service is active and running: :: - - systemctl status docker - -After performing all the above steps, re-run ``upgrade.yml`` playbook. - -⦾ **Why does the nvidia-device-plugin pods in ContainerCreating status fails with "no runtime for "nvidia" in configured" error?** - -.. image:: ../images/nvidia_noruntime.png - -**Potential Cause**: nvidia-container-toolkit is not installed on GPU nodes. - -**Resolution**: Go to `Install Kubernetes <../InstallationGuides/BuildingClusters/install_kubernetes.html>`_ and follow the steps to download nvidia-container-toolkit and perform the necessary configurations based on the OS running on the cluster. - -⦾ **While provisioning a node in an Ubuntu cluster, "Installing" status is not displayed in cluster.nodeinfo table.** - -**Resolution**: User can track provisioning progress by checking the supported status types. If the status shows ``bmcready`` or ``powering-on``, user can infer that the node is being provisioned. Once the node has been provisioned successfully, it will reflect a ``booted`` status in the OmniaDB. 
- -⦾ **``discovery_provision.yml`` fails to check for duplicate disk_partition values in provision_config.yml** - -**Resolution**: User needs to ensure that there are no duplicate entries for the same partition in provision_config.yml. - -⦾ **After executing ``disocvery_provision.yml``, the node status in OmniaDB reflects as "standingby"?** - -**Resolution**: For any discovery mechanism other than switch-based, do the following: - - 1. Execute the following command: :: - - chdef status=”” - - 2. Then run: :: - - rinstall - - Where refers to the node column in the OmniaDB, which has a “standingby” status. - -⦾ **While executing local_repo.yml playbook, subgroup entries for applicable software is not validated during playbook execution.** - -**Resolution**: User must provide the software subgroup (if required) for the respective software in ``input/software_config.json``. For more information, `click here <../InstallationGuides/LocalRepo/InputParameters.html>`_. - -⦾ **The "TASK [configure_repos : Generate metadata for repositories]" fails during the execution of local_repo.yml on RHEL clusters if the Epel repository is unstable.** - -**Potential Cause**: If the external Epel repository link mentioned in ``omnia_repo_url_rhel`` is not stable, then it can cause failures in ``local_repo.yml`` playbook execution. - -**Resolution**: - -1. Check if the Epel repository link mentioned in ``omnia_repo_url_rhel`` is accessible. - -2. Verify the required software listed in ``software_config.json``, by examining the corresponding ``.json`` files located in the ``input/config/rhel/`` directory. User can do either of the following, based on the findings: - - - If none of the packages are dependent on the Epel repository, users can remove the Epel repository URL from ``omnia_repo_url_rhel``. - - - If any package required from the Epel repository is listed in the ``software_config.json`` file, it's advisable to either wait for the Epel repository to stabilize or host those Epel repository packages locally. Afterward, remove the Epel repository link from ``omnia_repo_url_rhel`` and provide the locally hosted URL for the Epel repository packages via the ``user_repo_url`` variable. - -⦾ **Why does the discovery_provision.yml playbook execution fail at task: "Prepare_cp needs to be executed"?** - -**Potential Cause**: Invalid input provided in ``network_spec.yml`` for ``admin_network`` or ``bmc_network`` fields. - -**Resolution**: Perform a cleanup using ``control_plane_cleanup.yml`` with ``--tags provision`` & then re-run the ``discovery_provision.yml`` playbook. Execute the following command: - - :: - - ansible-playbook utils/control_plane_cleanup.yml --tags provision - ansible-playbook discovery_provision.yml - -⦾ **After resetting an existing slurm cluster using reset_cluster_config.yml, issues are faced while changing the slurm_installation_type from nfs_share to configless in input/omnia.yml. Post update, possible scenarios observed while executing omnia.yml playbook are:** - - * **Playbook execution fails at TASK: [slurm_common : Add the user 'slurm' with uid 6001 and a primary group of 'slurm'].** - - .. image:: ../images/nfs_slurm_error.png - - * **Playbook execution is successful but the slurm services are inactive.** - -**Potential Causes**: - - 1. While updating the ``slurm_installation_type`` from ``nfs_share`` to ``configless`` in ``input/omnia.yml``, the previous 'slurm' user is active, which can cause the deletion and addition of the configurations to fail intermittently. - 2. 
NFS share path is not removed from ``LD_LIBRARY_PATH`` environment variable while resetting a slurm cluster in ``nfs_share`` mode. - -**Resolution**: Perform the following steps: - - 1. Remove the NFS share path from ``LD_LIBRARY_PATH``. - 2. Remove ``slurmd.service`` file from all the compute nodes in the slurm cluster. - 3. Re-run ``omnia.yml`` playbook. - -⦾ **While upgrading Omnia in an NFS-Bolt-On setup, the prepare_config.yml playbook fails to import the nfs_client_params mentioned in input/storage_config.yml for v1.5.1 to Omnia v1.6. Consequently, if the same NFS-Bolt-On share doubles as the Omnia share, the prepare_upgrade.yml playbook fails to unmount it on the head node.** - -**Potential Cause**: This issue occurs when ``client_share_path`` or ``client_mount_options`` in ``nfs_client_params`` is left empty. - -**Resolution**: Perform the following steps based on your cluster configuration: - - After executing the ``prepare_config.yml`` playbook, you need to manually update the ``nfs_client_params`` in ``input/storage_config.yml`` of Omnia v1.6, in the format `mentioned here <../InstallationGuides/BuildingClusters/NFS.html>`_. Ensure that the values for ``server_ip``, ``server_share_path``, ``client_share_path``, and ``client_mount_options`` are the same between Omnia v1.5.1 and v1.6. - - * When ``enable_omnia_nfs`` is set to ``true`` in Omnia v1.5.1, update the ``nfs_client_params`` in the format added below - - # For example, if ``nfs_client_params`` in Omnia v1.5.1 is: :: - - - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/users1", client_share_path: “/users1”, client_mount_options: } - - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/users1", client_share_path: “/users2”, client_mount_options: } - - # Then the ``nfs_client_params`` in Omnia v1.6 should be updated as: :: - - - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/users1", client_share_path: “/users1”, client_mount_options: , nfs_server: false, slurm_share: false, k8s_share: false } - - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/users1", client_share_path: “/users2”, client_mount_options: , nfs_server: false, slurm_share: false, k8s_share: false } - - .. note:: Do not remove the auto populated entries in ``nfs_client_params`` from ``input/storage_config.yml``. The default entry is similar to: - :: - { server_ip: localhost, server_share_path: /mnt/omnia_home_share, client_share_path: /home, client_mount_options: "nosuid,rw,sync,hard,intr", nfs_server: true, slurm_share: true, k8s_share: true } - - * When ``enable_omnia_nfs`` is set to ``false`` and ``omnia_usrhome_share`` is set to ``/mnt/nfs_shares/appshare`` in Omnia v1.5.1, update the ``nfs_client_params`` in the format added below - - # For example, if the ``nfs_client_params`` in Omnia v1.5.1 is: :: - - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/users", client_share_path: , client_mount_options: } - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/appshare", client_share_path: , client_mount_options: } - - # Then the ``nfs_client_params`` in Omnia v1.6 should be updated as: :: - - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/users", client_share_path: , client_mount_options: , nfs_server: false, slurm_share: false, k8s_share: false } - { server_ip: 10.6.0.4, server_share_path: "/mnt/nfs_shares/appshare", client_share_path: "/home", client_mount_options: , nfs_server: false, slurm_share: true, k8s_share: true } - - .. 
note:: When ``enable_omnia_nfs`` is set to ``false``, the ``prepare_upgrade.yml`` playbook execution fails while attempting to delete the nfs_share directory from the manager node. In such a scenario, the user needs to manually unmount the Omnia NFS share from the head node and re-run the ``prepare_upgrade.yml`` playbook. - -⦾ **While executing discovery_provision.yml playbook from the control plane, some of the cluster nodes fail to boot up and omniadb captures the node status as "failed".** - -.. image:: ../images/waco_node_boot_failure.png - -**Potential Cause**: This issue is encountered due to any configuration failure during node provisioning. - -**Resolution**: Perform the following steps: - - 1. Delete the failed node from the db using ``delete_node.yml`` playbook utility. For more information, `click here <../InstallationGuides/deletenode.html#delete-provisioned-node>`_. - 2. Re-provision the node by re-running the ``discovery_provision.yml`` playbook. - -⦾ **Kserve deployment occasionally fails on RHEL 8.8 clusters.** - -**Potential Cause**: This is a known issue. For more information, check the links attached below: - - 1. `Reference 1 `_ - 2. `Reference 2 `_ - -**Resolution**: Reprovision the cluster and re-deploy Kserve. The steps to deploy Kserve are located `here <../InstallationGuides/Platform/kserve.html>`_. \ No newline at end of file diff --git a/docs/source/Troubleshooting/troubleshootingguide.rst b/docs/source/Troubleshooting/troubleshootingguide.rst index 680014c28..69e628a20 100644 --- a/docs/source/Troubleshooting/troubleshootingguide.rst +++ b/docs/source/Troubleshooting/troubleshootingguide.rst @@ -10,12 +10,12 @@ Connecting to internal databases ------------------------------------ * TimescaleDB * Start a bash session within the timescaledb pod: ``kubectl exec -it pod/timescaledb-0 -n telemetry-and-visualizations -- /bin/bash`` - * Connect to psql: ``psql -U `` - * Connect to database: ``\c telemetry_metrics`` + * Connect to psql using the ``psql -U `` command. + * Connect to database using the ``\c telemetry_metrics`` command. * MySQL DB - * Start a bash session within the mysqldb pod: ``kubectl exec -it pod/mysqldb-0 -n telemetry-and-visualizations -- /bin/bash`` - * Connect to mysql: ``mysql -U -p `` - * Connect to database: ``USE idrac_telemetrysource_services_db`` + * Start a bash session within the mysqldb pod using the ``kubectl exec -it pod/mysqldb-0 -n telemetry-and-visualizations -- /bin/bash`` command. + * Connect to mysql using the ``mysql -u `` command and provide the password when prompted. + * Connect to database using the ``USE idrac_telemetrysource_services_db`` command. Checking and updating encrypted parameters ----------------------------------------------- @@ -34,7 +34,7 @@ Checking and updating encrypted parameters ansible-vault edit provision_config_credentials.yml --vault-password-file .provision_credential_vault_key -Checking pod status on the control plane +Checking pod status from the OIM -------------------------------------------- * Use this command to get a list of all available pods: ``kubectl get pods -A`` * Check the status of any specific pod by running: ``kubectl describe pod -n `` @@ -111,7 +111,7 @@ If you encounter image download failures while executing ``local_repo.yml``, do :: - curl -k https://:5001/v2/_catalog + curl -k https://:5001/v2/_catalog Expected outputs: Else, do the following: - a. 
Restart control-plane and check curl command output again. + a. Restart the OIM and check curl command output again. b. Re-run ``local_repo.yml``. 5. Run the following command: :: - openssl s_client -showcerts -connect :5001 + openssl s_client -showcerts -connect :5001 Expected output: .. image:: ../images/image_failure_output_s5.png * Verify that the certificate is valid and ``CN=private_registry``. - * Certificate shown by this command output should be the same as output present at ``/etc/containerd/certs.d/5001/ca.crt``. + * Certificate shown by this command output should be the same as output present at ``/etc/containerd/certs.d/5001/ca.crt``. If no certificate is visible on screen, run the following command: diff --git a/docs/source/Upgrade/1.5.1to1.6.1/index.rst b/docs/source/Upgrade/1.5.1to1.6.1/index.rst deleted file mode 100644 index 59d439243..000000000 --- a/docs/source/Upgrade/1.5.1to1.6.1/index.rst +++ /dev/null @@ -1,42 +0,0 @@ -Upgrade Omnia v1.5.1 to v1.6.1 -================================ - -The upgrade feature in v1.6.1 helps customers to upgrade their Omnia setup from v1.5.1 to v1.6.1. This includes upgrading the essential software requirements, configurations, and cluster software. - -**Prerequisites** - - 1. The control plane must have internet connectivity and run a full version of the operating system. - - 2. If Git is not installed on the control plane, install Git using the following command: :: - - dnf install git -y - - 3. Clone the Omnia v1.6.1 repository from GitHub and place it in a directory on the control plane. This directory must be different from the one containing the Omnia v1.5.1 repository. Execute the following command to perform the cloning operation: :: - - git clone https://github.com/dell/omnia.git - -Once the cloning process is done, follow the steps listed below to invoke the upgrade process: - -.. toctree:: - - prepare_config - prepare_upgrade - upgrade - -.. note:: - - * Upgrade flow tries best to map v1.5.1 input configurations to v1.6.1, but the user must review the same before running ``prepare_upgrade.yml``. - * Upgrade flow upgrades the existing Omnia v1.5.1 cluster and should not be combined with provisioning of new nodes for Omnia v1.6.1. - * Addition of new nodes can be performed after Omnia upgrade by providing suitable parameters in v1.6.1 input configurations files such as ``provision_config.yml``. - * Upgrade flow resets the existing kubernetes setup on the cluster and updates other relevant software as well. Hence, ensure there are no active jobs running on the cluster when the upgrade is planned. - * Omnia v1.6.1 upgrade feature disables the NFS server on the head node and configures it on the control plane. The NFS share directory mentioned in ``omnia_usrhome_share``, provided in v1.5.1 ``omnia_config.yml``, is unmounted from the cluster and deleted from the head node while executing the ``prepare_upgrade.yml`` playbook. Hence, ensure the cluster does not have any Kubernetes jobs or any other active jobs running when the upgrade is planned. - * As part of upgrade, existing v1.5.1 features are migrated. The new Omnia v1.6.1 functionalities can be restricted depending on the way Omnia v1.5.1 was setup. For example: - - - In Omnia v1.5.1 OpenLDAP client configuration was supported. If you had configured OpenLDAP client to external enterprise LDAP server in Omnia v1.5.1, then this configuration will not be restored during upgrade. 
In Omnia v1.6.1, Omnia installs OpenLDAP server and the user needs to reconfigure the OpenLDAP server to integrate it with an external LDAP server. - - The slurm setup in Omnia v1.5.1 cluster is upgraded to configless slurm in v1.6.1. - * While the Omnia upgrade process does attempt an automatic backup of the Telemetry database, it is recommended to manually create a backup before initiating the upgrade for added precaution. After the upgrade, the restoration of the telemetry database must be performed manually by the user. - - * Omnia recommends to stop the telemetry services in Omnia v1.5.1 by configuring ``idrac_telemetry_support`` and ``omnia_telemetry_support`` to ``false`` in ``input/telemetry_config.yml``, followed by the execution of the ``telemetry.yml`` playbook before proceeding with the upgrade flow. - * For a successful restoration of the telemetry database in Omnia v1.6.1, ensure ``input/telemetry_config.yml`` has ``idrac_telemetry_support`` set to ``false`` and ``omnia_telemetry_support`` set to ``true``, after executing ``prepare_config.yml``. - -.. caution:: The NFS share directory mentioned in ``omnia_usrhome_share``, provided in v1.5.1 ``omnia_config.yml``, is unmounted from the cluster and deleted from the head node, along with all the user data while executing the ``prepare_upgrade.yml`` playbook. Hence, it is recommended that you take a backup of the Omnia NFS share before executing the ``prepare_upgrade.yml`` playbook. \ No newline at end of file diff --git a/docs/source/Upgrade/1.5.1to1.6.1/prepare_config.rst b/docs/source/Upgrade/1.5.1to1.6.1/prepare_config.rst deleted file mode 100644 index afe05e1f5..000000000 --- a/docs/source/Upgrade/1.5.1to1.6.1/prepare_config.rst +++ /dev/null @@ -1,67 +0,0 @@ -Prepare Config -=============== - -This is the first step of upgrade process and uses the ``prepare_config.yml`` playbook. This playbook performs the following tasks: - - * Imports the input configuration parameters from Omnia v1.5.1 and generates the input configurations for v1.6.1. - * Generates the inventory for Omnia v1.6.1 from the v1.5.1 inventory. - * Sets the Omnia v1.6.1 execution environment by updating the ansible and python versions compatible to v1.6.1. - * Creates backup of the Omnia v1.5.1 database. - * Creates a backup of the Omnia v1.5.1 telemetry database if the ``timescaledb`` pod is in ``running`` state. - -.. note:: Post upgrade, restoring the Omnia telemetry database in Omnia v1.6.1 is completely manual and user-driven. - -Before executing ``prepare_config.yml``, user needs to update ``upgrade_config.yml`` with the following details: - -+-----------------------------+------------------------------------------------------------------------------------------+ -| Parameter | Description | -+=============================+==========================================================================================+ -| **old_input_location** | * This variable points to the input directory of the old Omnia 1.5.1 installation | -| (Required) | * **Example input location:** ``/root/omnia15/omnia/input`` | -+-----------------------------+------------------------------------------------------------------------------------------+ -| **backup_location** | * This variable points to the directory where OmniaDB backups should be stored. 
| -| (Required) | * This directory must exist prior to running ``prepare_config.yml`` | -| | * **Example:** ``/root/omnia-backups`` | -+-----------------------------+------------------------------------------------------------------------------------------+ - -To execute the ``prepare_config.yml`` playbook, run the following command: :: - - cd omnia/upgrade - ansible-playbook prepare_config.yml -i - -Expected output of this playbook execution: - - * Auto-populated Omnia v1.6.1 configuration files in the ``/omnia/input``. - * Auto-generated inventory file in Omnia v1.6.1 format. This is available in the ``/omnia/upgrade`` folder and will be used later during the execution of `upgrade.yml `_. - * Backup of the Omnia v1.5.1 database is created at the ``backup_location`` specified in the ``upgrade_config.yml``. The backup file is named as ``backup.sql``. - * Backup of the Omnia v1.5.1 telemetry database is created at the ``backup_location`` specified in the ``upgrade_config.yml``. The backup file is named as ``telemetry_tsdb_dump.sql``. - -**Review or Update the auto-generated config files** - -Post ``prepare_config.yml`` execution, user must review or update the auto-populated configuration files in ``/omnia/input`` as mentioned below. - -.. note:: To view/update the encrypted input files, user can use the 'ansible-vault view' or 'ansible-vault edit' command. For sample commands, `click here <../Troubleshooting/troubleshootingguide.html#checking-and-updating-encrypted-parameters>`_. - -* Review the ``software_config.json`` which contains a list of all softwares identified for the cluster. This is used to configure the Omnia v1.6.1 local repository. For more information about local repository, `click here <../InstallationGuides/LocalRepo/index.html>`_. - - - Ensure there is a software entry(s) corresponding to the ``scheduler_type`` configured in Omnia v1.5.1 input configuration. For example, if ``scheduler_type`` is ``k8s,slurm`` in Omnia v1.5.1, then there must be a corresponding software entry(s) in the v1.6.1 ``software_config.json``. - - - Similarly, if a security type (FreeIPA/OpenLDAP) is enabled in v1.5.1, then corresponding entry must be present in the ``software_config.json`` for Omnia v1.6.1. - - - If telemetry is enabled in Omnia v1.5.1, then the Omnia v1.6.1 ``software_config.json`` list should also contain the ``telemetry`` entry. - -* Add ``rhel_os_url`` in ``local repo_config.yml`` when the cluster OS type is RHEL. - -* Verify ``input/network_spec.yml`` for ``admin_network`` and ``bmc_network`` details. - -* If Omnia v1.5.1 installation had slurm set up, ensure that the v1.6.1 ``omnia_config.yml`` has ``slurm_installation_type`` updated as "configless". - -* The new inventory format for Omnia v1.6.1 lists all Omnia v1.5.1 manager nodes as ``kube_control_plane`` and/or ``slurm_control_node`` based on the ``scheduler_type``. All compute nodes will be listed as ``kube_node`` or ``slurm_node`` based on the ``scheduler_type``. - -* Verify ``nfs_client_params`` details in ``input/storage_config.yml`` file, as mentioned below: - - - Omnia v1.6.1 upgrade configures the NFS server on the control plane, when ``enable_omnia_nfs`` is set to true in v1.5.1 ``omnia_config.yml``. Verify that the ``server_ip`` corresponds to the IP address of the control plane. - - - Depending on the ``scheduler_type``, that is, Slurm or Kubernetes, either ``k8s_share`` or ``slurm_share`` will be set to ``true`` for Omnia NFS share. 
- -* Ensure that the Omnia database backup has been created in the ``backup_location`` provided in ``upgrade_config.yml``. \ No newline at end of file diff --git a/docs/source/Upgrade/1.5.1to1.6.1/prepare_upgrade.rst b/docs/source/Upgrade/1.5.1to1.6.1/prepare_upgrade.rst deleted file mode 100644 index 4c1d30974..000000000 --- a/docs/source/Upgrade/1.5.1to1.6.1/prepare_upgrade.rst +++ /dev/null @@ -1,21 +0,0 @@ -Prepare Upgrade -================ - -This is the second step of upgrade process and uses the ``prepare_upgrade.yml`` playbook. This playbook performs the following tasks: - - * Runs validations on the Omnia v1.6.1 input configurations. - * Cleanup of the Omnia v1.5.1 telemetry configuration while preserving the persistent data. - * Cleanup of the Omnia v1.5.1 OpenLDAP packages from the control plane. - * Cleanup of the Omnia v1.5.1 Docker installation from the control plane. - * Cleanup of the Omnia v1.5.1 Kubernetes setup from the cluster. - * Creates the v1.6.1 local repository based on the ``software_config.json``, generated after running ``prepare_config.yml``. - * Unmounts the NFS share directory, mentioned as ``omnia_usrhome_share`` in v1.5.1 ``omnia_config.yml``, from the cluster and then deletes it. - * In case where ``enable_omnia_nfs`` is set to true in v1.5.1 ``omnia_config.yml`` and the head node acts as the NFS server, the upgrade process disables the NFS server running on the head node and removes the NFS share mentioned in ``omnia_usrhome_share`` from the head node. The NFS server will be set up on the control plane as per Omnia v1.6.1. - * Prepares the control plane which includes upgrading xCAT, setting up Omnia telemetry binaries for cluster, restoring the OmniaDB backup to v1.6.1 format. - -.. caution:: The NFS share directory mentioned in ``omnia_usrhome_share``, provided in v1.5.1 ``omnia_config.yml``, is unmounted from the cluster and deleted from the head node, along with all the user data while executing the ``prepare_upgrade.yml`` playbook. Hence, it is recommended that you take a backup of the Omnia NFS share before executing the ``prepare_upgrade.yml`` playbook. - -To execute the ``prepare_upgrade.yml`` playbook, run the following command: :: - - cd omnia/upgrade - ansible-playbook prepare_upgrade.yml -i diff --git a/docs/source/Upgrade/1.5.1to1.6.1/restore_telemetryDB.rst b/docs/source/Upgrade/1.5.1to1.6.1/restore_telemetryDB.rst deleted file mode 100644 index ea00aa924..000000000 --- a/docs/source/Upgrade/1.5.1to1.6.1/restore_telemetryDB.rst +++ /dev/null @@ -1,65 +0,0 @@ -Restoring Telemetry database post Omnia upgrade -================================================ - -After upgrading Omnia, if you want to retain the telemetry data from Omnia v1.5.1, you need to manually restore the telemetry database from the ``telemetry_tsdb_dump.sql`` file. Perform the following steps to do so: - -1. Copy the backed up telemetry database file, that is ``telemetry_tsdb_dump.sql``, from the ``backup_location`` to ``/opt/omnia/telemetry/iDRAC-Referencing-Tools``. - -2. Stop the Omnia telemetry services on all the cluster nodes. Run the ``telemetry.yml`` playbook after setting the ``idrac_telemetry_support``, ``omnia_telemetry_support``, and ``visualization_support`` parameters to ``false`` in ``input/telemetry_config.yml``. Execute the following command: :: - - cd telemetry - ansible-playbook telemetry.yml -i ../upgrade/inventory - -3. Connect to the ``timescaledb`` pod and execute the psql commands. 
Perform the following steps: - - * Execute the following command: :: - - kubectl exec -it timescaledb-0 -n telemetry-and-visualizations -- /bin/bash - - * Verify that the dump file is present using the ``ls`` command. - - * Connect to the psql client using the following command: :: - - psql -U - - where "timescaledb_user" is the configured ``timescaledb`` username for telemetry. - - * Drop the current database using the command below: :: - - DROP DATABASE telemetry_metrics; - - .. note:: If there are processes which are preventing you to drop the database, then terminate those processes and try again. - - * Create an empty telemetry database for Omnia v1.6.1 using the command below: :: - - CREATE DATABASE telemetry_metrics; - - * Exit from the psql client using ``\q`` command. - - * Execute the following command to initiate the database restore operation: :: - - psql --dbname=telemetry_metrics --host= --port=5432 --username= -v ON_ERROR_STOP=1 --echo-errors -c "SELECT public.timescaledb_pre_restore();" -f telemetry_tsdb_dump.sql -c "SELECT public.timescaledb_post_restore();" - - .. note:: Execute the following command to obtain the ``pod_external_ip`` and ``port`` for the ``timescaledb`` pod: - :: - kubectl get svc -A output - - * Drop the ``insert_block_trigger`` if it exists using the following commands: :: - - psql -U omnia - \c telemetry_metrics - DROP TRIGGER IF EXISTS ts_insert_blocker ON public.timeseries_metrics; - DROP TRIGGER IF EXISTS ts_insert_blocker ON omnia_telemetry.metrics; - - -Next steps -============ - -1. Connect to the ``telemetry_metrics`` database and verify if the restored telemetry data is present in ``public.timeseries_metrics`` and ``omnia_telemetry.metrics`` tables. - -2. Post verification, you can choose to restart the Omnia telemetry services. Run the ``telemetry.yml`` playbook after modifying the ``input/telemetry_config.yml`` as per your requirements. For more information regarding the telemetry parameters, `click here <../../InstallationGuides/BuildingClusters/schedulerinputparams.html#id18>`_. Execute the following command: :: - - cd telemetry - ansible-playbook telemetry.yml -i ../upgrade/inventory - -3. After telemetry services are enabled, check ``omnia_telemetry.metrics`` and ``public.timeseries_metrics`` tables to see if the number of rows have increased. This signifies that the fresh telemetry data from Omnia v1.6.1 is getting updated in the database. \ No newline at end of file diff --git a/docs/source/Upgrade/1.5.1to1.6.1/upgrade.rst b/docs/source/Upgrade/1.5.1to1.6.1/upgrade.rst deleted file mode 100644 index bdaf1e1bf..000000000 --- a/docs/source/Upgrade/1.5.1to1.6.1/upgrade.rst +++ /dev/null @@ -1,17 +0,0 @@ -Upgrade Omnia -============== - -This is the third step of upgrade process and uses the ``upgrade.yml`` playbook. This playbook performs the following task: - - * Invokes the v1.6.1 ``omnia.yml`` tasks to setup the cluster in 1.6.1 format, that is, configuring scheduler, storage, security and telemetry. - -To execute the ``upgrade.yml`` playbook, run the following command: :: - - cd omnia/upgrade - ansible-playbook upgrade.yml -i inventory - -Where inventory refers to the auto-generated inventory file in Omnia v1.6.1 format. - -This is the final step, and once the upgrade.yml playbook is executed successfully, the upgrade process is complete! 
- -Optional - `Restore Telemetry database post upgrade `_ \ No newline at end of file diff --git a/docs/source/Upgrade/1.6to1.6.1/index.rst b/docs/source/Upgrade/1.6to1.6.1/index.rst deleted file mode 100644 index 469185e28..000000000 --- a/docs/source/Upgrade/1.6to1.6.1/index.rst +++ /dev/null @@ -1,27 +0,0 @@ -Upgrade Omnia v1.6 to v1.6.1 -============================== - -The upgrade feature in v1.6.1 helps customers to upgrade their Omnia setup from v1.6 to v1.6.1. - -.. note:: Omnia v1.6.1 addresses the issue of the unavailable dependent package 'libssl1.1_1.1.1f-1ubuntu2.22_amd64' required by Omnia 1.6 on Ubuntu 22.04 OS. For more information, see the `release notes <../../Overview/releasenotes.html#id1>`_. - -**Prerequisites** - - 1. The control plane must have internet connectivity and run a full version of the operating system. - - 2. If Git is not installed on the control plane, install Git using the following command: :: - - dnf install git -y - - 3. Clone the Omnia v1.6.1 repository from GitHub and place it in a directory on the control plane. This directory must be different from the one containing the Omnia v1.6 repository. Execute the following command to perform the cloning operation: :: - - git clone https://github.com/dell/omnia.git - -Once the cloning process is done, follow the steps listed below to invoke the upgrade process: - -.. toctree:: - - prepare_config - prepare_upgrade - upgrade - diff --git a/docs/source/Upgrade/1.6to1.6.1/prepare_config.rst b/docs/source/Upgrade/1.6to1.6.1/prepare_config.rst deleted file mode 100644 index 04d3b9694..000000000 --- a/docs/source/Upgrade/1.6to1.6.1/prepare_config.rst +++ /dev/null @@ -1,30 +0,0 @@ -Prepare Config -=============== - -This is the first step of upgrade process and uses the ``prepare_config.yml`` playbook. This playbook performs the following tasks: - - * Imports the input configuration parameters from Omnia v1.6 and generates the input configurations for v1.6.1. - -Before executing ``prepare_config.yml``, user needs to update ``upgrade_config.yml`` with the following details: - -+-----------------------------+------------------------------------------------------------------------------------------+ -| Parameter | Description | -+=============================+==========================================================================================+ -| **old_input_location** | * This variable points to the input directory of the old Omnia 1.6 installation | -| (Required) | * **Example input location:** ``/root/omnia15/omnia/input`` | -+-----------------------------+------------------------------------------------------------------------------------------+ -| **backup_location** | * This variable points to the directory where OmniaDB backups should be stored. | -| (Required) | * This directory must exist prior to running ``prepare_config.yml`` | -| | * **Example:** ``/root/omnia-backups`` | -+-----------------------------+------------------------------------------------------------------------------------------+ - -.. note:: During the upgrade from version 1.6 to 1.6.1, Omnia does not require the creation of a backup file. This is because none of the details from the Omnia database are deleted during the upgrade process. - -To execute the ``prepare_config.yml`` playbook, run the following command: :: - - cd omnia/upgrade - ansible-playbook prepare_config.yml -i - -Expected output of this playbook execution: - - * Auto-populated Omnia v1.6.1 configuration files in the ``/omnia/input``. 
\ No newline at end of file
diff --git a/docs/source/Upgrade/1.6to1.6.1/prepare_upgrade.rst b/docs/source/Upgrade/1.6to1.6.1/prepare_upgrade.rst
deleted file mode 100644
index 777977e36..000000000
--- a/docs/source/Upgrade/1.6to1.6.1/prepare_upgrade.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Prepare Upgrade
-================
-
-This is the second step of upgrade process and uses the ``prepare_upgrade.yml`` playbook.
-
-To execute the ``prepare_upgrade.yml`` playbook, run the following command: ::
-
-    cd omnia/upgrade
-    ansible-playbook prepare_upgrade.yml -i
diff --git a/docs/source/Upgrade/1.6to1.6.1/upgrade.rst b/docs/source/Upgrade/1.6to1.6.1/upgrade.rst
deleted file mode 100644
index 42bd25ac7..000000000
--- a/docs/source/Upgrade/1.6to1.6.1/upgrade.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Upgrade Omnia
-==============
-
-This is the third step of upgrade process and uses the ``upgrade.yml`` playbook.
-
-To execute the ``upgrade.yml`` playbook, run the following command: ::
-
-    cd omnia/upgrade
-    ansible-playbook upgrade.yml -i
-
-This is the final step, and once the ``upgrade.yml`` playbook is executed successfully, the upgrade process is complete!
\ No newline at end of file
diff --git a/docs/source/Upgrade/index.rst b/docs/source/Upgrade/index.rst
index a7532cf40..1eca06451 100644
--- a/docs/source/Upgrade/index.rst
+++ b/docs/source/Upgrade/index.rst
@@ -1,9 +1,105 @@
-Upgrade Omnia
-==============
+Upgrade Omnia OIM
+==============================
-You can upgrade to Omnia v1.6.1 from v1.5.1 or v1.6. To upgrade your Omnia version, do the following:
+To upgrade Omnia from version 1.6.1 to version 1.7 on your OIM, use the ``upgrade_oim.yml`` playbook shipped with Omnia 1.7. This ensures that your OIM is running the latest version and includes any new features and improvements that are available.
-.. toctree::
+
+.. caution:: Do not reboot the OIM before initiating the upgrade process, as doing so leads to the loss of telemetry data.
+
+.. note::
+
+    * Before initiating the upgrade, ensure that the OIM has a stable internet connection to avoid intermittent issues caused by poor network connectivity.
+    * After upgrading the Omnia OIM running on a `supported OS <../Overview/SupportMatrix/OperatingSystems/index.html>`_ (except RHEL/Rocky Linux 8.6 and 8.8), the ``input/software_config.json`` file remains in its default state. This enables users to install the default software versions on a new cluster.
+    * After upgrading your OIM from Omnia 1.6.1 to version 1.7, ensure that the cryptography version on the login nodes is also updated to 44.0.0. This is necessary to address a security vulnerability reported with lower versions of the cryptography software. To update the cryptography software version, run the following command: ::
+
+        pip install cryptography==44.0.0
+
+**Tasks performed by the** ``upgrade_oim.yml`` **playbook**
+
+The ``upgrade_oim.yml`` playbook performs the following tasks:
+
+* Validates whether the upgrade can be performed on the Omnia OIM.
+* Takes a backup of the Kubernetes etcd database, TimescaleDB, and MySQLDB at the backup location specified by the user.
+* Regenerates the inventory files with hostname values.
+* Imports input parameters from the provided source code path of the already installed Omnia version.
+* Upgrades the software versions of nerdctl and Kubernetes on the OIM.
+* Upgrades the ``omnia_telemetry`` binaries on nodes where the telemetry service is running.
+* Upgrades the iDRAC telemetry services on the OIM.
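+
+For reference, a filled-in ``upgrade/upgrade_config.yml`` might look like the following minimal sketch; the two parameters are described under the upgrade steps below, and the paths shown here are illustrative examples, not defaults: ::
+
+    # Source code directory of the currently installed Omnia 1.6.1 (illustrative path)
+    installed_omnia_path: "/root/omnia161/omnia"
+    # Pre-created directory where the OIM backup is stored during the upgrade (illustrative path)
+    backup_location: "/opt/omnia/upgrade_backup"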
+ +**Pre-check before Upgrade** + +If you have deployed a telemetry service in your Kubernetes cluster, it is important to ensure that the cluster is running properly before you initiate the upgrade process. As part of the upgrade pre-check, Omnia verifies if there are any issues with the cluster, such as non-running pods, LoadBalancer services without external IPs, or unbounded PVCs. If any of these issues are detected, you will need to address them before you can proceed with the upgrade. + +**Steps to be performed for Upgrade** + +To upgrade the Omnia OIM, do the following: + +1. Clone the Omnia 1.7 source code to your OIM using the following command: :: + + git clone https://github.com/dell/omnia.git + +2. Execute the ``prereq.sh`` script using the following command: :: + + cd omnia + ./prereq.sh + +3. Use any one of the following commands to activate the Omnia virtual environment, based on the operating system running on the OIM: + + * For RHEL or Rocky Linux 8.8, and Ubuntu 20.04 or 22.04, use: :: + + source /opt/omnia/omnia17_venv/bin/activate + + * On RHEL/Rocky Linux 8.6 or 8.7, use: :: + + source /opt/omnia/omnia161_venv/bin/activate + +4. Update the ``omnia/upgrade/upgrade_config.yml`` file with the following details: + + +-----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ + | ``installed_omnia_path`` | * This variable points to the currently installed Omnia 1.6.1 source code directory. | + | Required | * **Example**: ``/root/omnia161/omnia`` | + | | .. note:: Verify that the directory has not been altered since the last execution of ``discovery_provision.yml`` and ``omnia.yml`` playbooks. | + +-----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ + | ``backup_location`` | * This variable points to the directory where the Omnia OIM backup is stored during the upgrade process. | + | Optional | * User must create this directory before running ``upgrade_oim.yml`` playbook and provide the complete path of that directory. | + | | * If the specified directory doesn't exist, backups will be taken at ``/opt/omnia/backup_before_upgrade`` | + | | * **Example**: ``/opt/omnia/upgrade_backup`` | + +-----------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ + +5. Finally, execute the ``upgrade_oim.yml`` playbook using the following command: :: + + cd upgrade + ansible-playbook upgrade_oim.yml + +.. caution:: + + If ``upgrade_oim.yml`` execution fails, you can rollback to Kubernetes version 1.26.12 and restore the old backed-up data using the ``restore_oim.yml`` playbook. To restore, do the following: + + 1. Activate the Omnia virtual environment using the ``source /opt/omnia/omnia161_venv/bin/activate`` command. + + 2. Execute the ``restore_oim.yml`` playbook using the following command: :: + + cd upgrade + ansible-playbook restore_oim.yml + +**Post Upgrade** + +Things to keep in mind after the OIM has been upgraded successfully: + +* To use Omnia 1.7 features, ensure to execute all the playbooks from within the Omnia 1.7 virtual environment. 
To activate the 1.7 virtual environment, use the following command: ::
+
+    source /opt/omnia/omnia17_venv/bin/activate
+
+* After upgrading your Omnia OIM to version 1.7, the new cluster configuration features added in this version won’t work with any of your existing clusters. These new features will only be available when you create new clusters on RHEL/Rocky Linux 8.8 or Ubuntu 22.04 platforms, using Omnia 1.7 source code.
+* The new cluster configuration features in Omnia 1.7 are not supported on RHEL/Rocky Linux 8.6 or 8.7. This means that even if you upgrade your Omnia OIM to version 1.7, these features won’t function on those platforms.
+* Post-upgrade to Omnia 1.7, if you want to use the old 1.6.1 software versions of Kubernetes (1.26.12), KServe (0.11.2), Kubeflow (1.8.0), and MPI operator (0.4.0), then perform the following steps:
+
+    * Activate the Omnia 1.6.1 virtual environment using the following command: ::
+
+        source /opt/omnia/omnia161_venv/bin/activate
+
+    * Update the ``input/software_config.json`` file of Omnia 1.7 with the required software versions.
+
+    * [Optional] Omnia recommends taking a backup of the ``input/software_config.json`` and all other configuration files in case you want to switch to Omnia 1.7 at a later point in time.
+
+    * Copy the ``.json`` files from the ``input/config//`` folder in Omnia 1.6.1 and overwrite the existing files in the same directory of Omnia 1.7.
-
-    1.5.1to1.6.1/index
-    1.6to1.6.1/index
\ No newline at end of file
diff --git a/docs/source/Utils/AdditionalNIC.rst b/docs/source/Utils/AdditionalNIC.rst
new file mode 100644
index 000000000..a51592721
--- /dev/null
+++ b/docs/source/Utils/AdditionalNIC.rst
@@ -0,0 +1,177 @@
+Configuring additional NICs and Kernel Parameters on the nodes
+----------------------------------------------------------------
+After the ``discovery_provision.yml`` playbook has been executed and the nodes have booted up, additional NICs or OS Kernel command-line parameters can be configured on the cluster nodes using the ``server_spec_update.yml`` playbook. For more information about the supported kernel command-line parameters, `click here `_.
+The ``server_spec_update.yml`` playbook can be used to perform the following tasks:
+
+    * Configure additional NICs on the nodes.
+    * Configure OS Kernel command-line parameters on the nodes.
+    * Configure both additional NICs and OS Kernel command-line parameters on the nodes.
+
+**Prerequisites**
+
+* All target nodes are provisioned and booted. `Click here <../OmniaInstallGuide/Ubuntu/Provision/ViewingDB.html>`_ to learn how to verify the status of the nodes.
+
+* Ensure that the ``input/network_spec.yml`` file has been updated with all network information in addition to the ``admin_network`` and ``bmc_network`` information. Below are all applicable properties of an additional network:
+
+    * ``nic_name``: The name of the NIC on which the administrative network is accessible to the OIM.
+    * ``netmask_bits``: The 32-bit "mask" used to divide an IP address into subnets and specify the network's available hosts.
+    * ``static_range``: The static range of IPs to be provisioned on target nodes. This indicates that only a certain static range is available to Omnia.
+
+* In addition to the above-mentioned properties, the following properties are also applicable and can be added in ``input/network_spec.yml`` for additional NICs:
+
+    * ``CIDR``: Classless or Classless Inter-Domain Routing (CIDR) addresses use variable length subnet masking (VLSM) to alter the ratio between the network and host address bits in an IP address.
+
+    .. note:: You can either use ``CIDR`` or ``static_range``. Simultaneous use of both parameters will result in an error message being displayed.
+
+    * ``MTU``: Maximum transmission unit (MTU) is a measurement in bytes of the largest data packets that an Internet-connected device can accept. The default value of ``MTU`` is 1500. You can enter your desired value.
+    * ``VLAN``: A 12-bit field that identifies a virtual LAN (VLAN) and specifies the VLAN that an ethernet frame belongs to. This property is not supported on clusters running Ubuntu.
+
+* Modify the ``input/server_spec.yml`` file with the additional NIC information and/or OS command-line kernel parameters that you want to add or alter for the target nodes. Ensure the following:
+
+    * All NICs listed in the ``server_spec.yml`` file are grouped into categories (groups for servers). The field ``Categories:`` should not be edited out of the ``input/server_spec.yml`` file.
+    * The name of the NIC specified in the file (in this sample, ``ensp0``, ``ensp0.5``, and ``eno1``) is the unique identifier of NICs in the file.
+    * The property ``nictype`` indicates what kind of NIC is in use (ethernet, infiniband, or vlan). If the ``nictype`` is set to ``vlan``, ensure that you specify a primary NIC for the VLAN using the property ``nicdevices``.
+    * The OS Kernel command-line parameters should be provided under the ``cmdline`` field. If you want to provide multiple kernel parameters, ensure that they are separated by a space.
+
+.. note::
+
+    * If a ``static_range`` value is provided in ``input/network_spec.yml``, additional networks are not correlated.
+    * If a ``CIDR`` value is provided in ``input/network_spec.yml``, the complete subnet is used for Omnia to assign IPs and, where possible, the IPs will be correlated with the assignment on the admin network. Omnia performs correlation for additional networks if the subnet prefix for the admin network is a superset, and the additional network is a subset. For example, if the subnet prefix for the admin network is */16* and for the additional network it's */24*, Omnia attempts to correlate the IPs if the value for the ``correlate_to_admin`` field is set to true in ``input/network_spec.yml``.
+    * If a VLAN is required, ensure that a VLAN ID is provided in the ``vlan`` field in ``input/server_spec.yml`` and ensure that it's provided in the ``NIC.vlan_id`` format. For example, consider "eth1.101", where ``eth1`` is the name of the NIC configured with a VLAN and ``101`` is the ``vlan_id``. This field is not supported on admin or bmc networks.
+    * While new networks can be added to the ``network_spec.yml`` file on subsequent runs of the ``server_spec_update.yml`` playbook, existing networks cannot be edited or deleted. If the user modifies or removes existing networks from ``input/network_spec.yml``, the playbook execution might fail. In that case, the user needs to `reprovision the node <../OmniaInstallGuide/Maintenance/reprovision.html>`_.
+
+**Usage Instructions**
+
+* *Configure additional NICs on the nodes.*
+
+    * Fill in all the necessary details for the additional NICs in the ``input/network_spec.yml`` file.
You can refer to the following sample: ::
+
+        - nic_network1:
+            netmask_bits: "24"
+            CIDR: "10.23.1.0"
+            network_gateway: ""
+            MTU: "1500"
+            VLAN: ""
+        - nic_network2:
+            netmask_bits: "24"
+            static_range: "10.23.2.1-10.23.2.254"
+            network_gateway: ""
+            MTU: "1500"
+            VLAN: "1"
+
+    * Add the additional NIC information to the ``input/server_spec.yml`` file. You can refer to the following sample: ::
+
+        Categories:
+            - group-1:
+                - network:
+                    - ensp0:
+                        nicnetwork: "nic_network1"
+                        nictypes: "ethernet"
+                    - ensp0.5:
+                        nicnetwork: "nic_network2"
+                        nictypes: "vlan"
+                        nicdevices: "ensp0"
+
+
+* *Configure OS Kernel command-line parameters on the nodes.*
+
+    * Do not change anything in the ``input/network_spec.yml`` file.
+
+    * Add the OS Kernel command-line parameters to the ``cmdline`` field in the ``input/server_spec.yml`` file. You can refer to the following sample: ::
+
+        Categories:
+            - group-1:
+                - os:
+                    - kernel:
+                        - cmdline: "iommu=pt intel_iommu=off pci=realloc=off processor.max_cstate=0 intel_idle.max_cstate=0 intel_pstate=disable"
+
+
+* *Configure both additional NICs and OS Kernel command-line parameters on the nodes.*
+
+    * Fill in all the necessary details for the additional NICs in the ``input/network_spec.yml`` file. You can refer to the following sample: ::
+
+        - nic_network1:
+            netmask_bits: "24"
+            CIDR: "10.23.1.0"
+            network_gateway: ""
+            MTU: "1500"
+            VLAN: ""
+        - nic_network2:
+            netmask_bits: "24"
+            static_range: "10.23.2.1-10.23.2.254"
+            network_gateway: ""
+            MTU: "1500"
+            VLAN: "1"
+
+    * Add the OS Kernel command-line parameters to the ``cmdline`` field in the ``input/server_spec.yml`` file. You can refer to the following sample: ::
+
+        Categories:
+            - group-1:
+                - network:
+                    - ensp0:
+                        nicnetwork: "nic_network1"
+                        nictypes: "ethernet"
+                    - ensp0.5:
+                        nicnetwork: "nic_network2"
+                        nictypes: "vlan"
+                        nicdevices: "ensp0"
+                - os:
+                    - kernel:
+                        - cmdline: "iommu=pt intel_iommu=off pci=realloc=off processor.max_cstate=0 intel_idle.max_cstate=0 intel_pstate=disable"
+
+.. note::
+
+    * If OS Kernel command-line parameter configuration is not required on the nodes, the user can leave the ``cmdline`` entry empty in ``input/server_spec.yml`` or remove the ``os`` section.
+    * The ``nicnetwork`` details must be consistent with the network names specified in the ``input/network_spec.yml`` file.
+    * While new groups can be added to the ``input/server_spec.yml`` file on subsequent runs of the ``server_spec_update.yml`` playbook, existing groups cannot be edited or deleted. If the user modifies or removes existing groups from ``input/server_spec.yml``, the playbook execution might fail. In that case, the user needs to `reprovision the node <../../Maintenance/reprovision.html>`_.
+    * This playbook has been validated with the following Kernel parameters:
+
+        * iommu=pt
+        * intel_iommu=off
+        * pci=realloc=off
+        * processor.max_cstate=0
+        * intel_idle.max_cstate=0
+        * intel_pstate=disable
+
+.. caution::
+
+    * If duplicate entries of the same command-line parameter are provided with different values, the playbook picks the last provided value, overwriting any previous entries. For example, if the user provides ``"intel_iommu=on intel_iommu=off"`` as the parameters, the configuration will ultimately be set to ``"intel_iommu=off"``, as this is the last value provided.
+    * Similarly, if the ``server_spec_update.yml`` playbook is executed with a command-line parameter, such as ``"intel_iommu=off"``, and is later rerun with the same parameter but an updated value, such as ``"intel_iommu=on"``, the playbook will assign the latest value for that parameter. As a result, it will ultimately set ``"intel_iommu=on"`` for the configuration. This behavior ensures that the most recent configuration is applied during execution.
+
+**Executing the playbook**
+
+After you have filled in ``input/network_spec.yml`` and ``input/server_spec.yml`` with all the necessary details based on the configuration(s) required, do the following to execute the playbook:
+
+* First, create an inventory, referencing the sample inventory format present in ``examples/server_spec_inv`` and also attached below: ::
+
+    [node-group1]
+    10.5.0.3
+
+    [node-group1:vars]
+    Categories=group-1
+
+    [node-group2]
+    10.5.0.4
+    10.5.0.5
+
+    [node-group2:vars]
+    Categories=group-2
+
+In the above sample inventory file, ``[node-group1]`` and ``[node-group2]`` are user-defined groups with servers associated with them. The groups are mapped and categorised in ``input/server_spec.yml`` under ``[:vars]``.
+
+.. note:: While creating the inventory file, ensure that each group has unique nodes. One node should not be part of two groups.
+
+* Finally, use the following command to execute the playbook: ::
+
+    cd utils/server_spec_update
+    ansible-playbook server_spec_update.yml -i
+
+.. note:: In Omnia v1.7, while executing ``server_spec_update.yml``, the user needs to ensure that only admin IP addresses are used in the inventory file, not service tags or node names.
+
+Based on the provided sample files, server 10.5.0.3 has been mapped to node-group1, which corresponds to group-1. Therefore, the NICs ensp0 and ensp0.5 will be configured in an ethernet VLAN group with ensp0 as the primary device.
+
+
+
diff --git a/docs/source/Roles/Utils/KernelUpdate.rst b/docs/source/Utils/KernelUpdate.rst
similarity index 69%
rename from docs/source/Roles/Utils/KernelUpdate.rst
rename to docs/source/Utils/KernelUpdate.rst
index 32279fc46..31282a55a 100644
--- a/docs/source/Roles/Utils/KernelUpdate.rst
+++ b/docs/source/Utils/KernelUpdate.rst
@@ -1,11 +1,11 @@
-Updating kernels
+Update Kernels
 =================
 **Pre requisites**:
 * All target nodes should be running RHEL, Rocky Linux, or Ubuntu OS.
-* Download the kernel packages using ``local_repo.yml``. For more information, `click here <../../InstallationGuides/LocalRepo/index.html>`_.
-* Verify that the cluster nodes are in the ``booted`` state. For more information, `click here <../../InstallationGuides/InstallingProvisionTool/ViewingDB.html>`_.
+* Download the kernel packages using ``local_repo.yml``.
+* Verify that the cluster nodes are in the ``booted`` state.
 **Install kernel updates to cluster nodes**
 1. Go to ``utils/software_update`` and edit ``software_update_config.yml``, as per the parameters table below.
 .. 
csv-table:: Parameters for Kernel Update - :file: ../../Tables/kernel_update.csv + :file: ../Tables/kernel_update.csv :header-rows: 1 :keepspace: diff --git a/docs/source/Roles/Utils/configuringPXE.rst b/docs/source/Utils/configuringPXE.rst similarity index 100% rename from docs/source/Roles/Utils/configuringPXE.rst rename to docs/source/Utils/configuringPXE.rst diff --git a/docs/source/Roles/Utils/epel.rst b/docs/source/Utils/epel.rst similarity index 86% rename from docs/source/Roles/Utils/epel.rst rename to docs/source/Utils/epel.rst index a0ddb831c..19ae125e7 100644 --- a/docs/source/Roles/Utils/epel.rst +++ b/docs/source/Utils/epel.rst @@ -12,5 +12,5 @@ To run the script: :: cd omnia/utils ansible-playbook install_hpc_thirdparty_packages.yml -i inventory -Where the inventory refers to a file listing all nodes per the format provided in `inventory file <../samplefiles.html>`_. +Where the inventory refers to a file listing all nodes per the format provided in `inventory file <../OmniaInstallGuide/samplefiles.html>`_. diff --git a/docs/source/Roles/Utils/freeipa_installation.rst b/docs/source/Utils/freeipa_installation.rst similarity index 82% rename from docs/source/Roles/Utils/freeipa_installation.rst rename to docs/source/Utils/freeipa_installation.rst index be5acef72..ec854c58c 100644 --- a/docs/source/Roles/Utils/freeipa_installation.rst +++ b/docs/source/Utils/freeipa_installation.rst @@ -1,5 +1,5 @@ FreeIPA installation on the NFS node -------------------------------------- +===================================== IPA services are used to provide account management and centralized authentication. If admin user intends to install the FreeIPA authentication on the NFS node (server connected to the storage devices), then the following playbook can be utilized. @@ -17,26 +17,20 @@ To install FreeIPA on NFS node, get the values of ``kerberos_admin_password`` an | ipa_server_ipadress | The IP address of the IPA server | The IP address can be found on the IPA server on the ``auth_server`` using the ``ip a`` command. This IP address should be accessible from the NFS node. | +-------------------------+-----------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------+ -To set up IPA services for the NFS node in the target cluster, run the following command from the ``utils/cluster`` folder on the control plane: :: +To set up IPA services for the NFS node in the target cluster, run the following command from the ``utils/cluster`` folder on the OIM: :: cd utils/cluster ansible-playbook install_ipa_client.yml -i inventory -e kerberos_admin_password="" -e ipa_server_hostname="" -e domain_name="" -e ipa_server_ipadress="" +**Hostname requirements** -.. include:: ../../Appendices/hostnamereqs.rst +* The hostname should not contain the following characters: , (comma), \. (period) or _ (underscore). However, the **domain name** is allowed with commas and periods. +* The hostname cannot start or end with a hyphen (-). +* No upper case characters are allowed in the hostname. +* The hostname cannot start with a number. +* The hostname and the domain name (that is: ``hostname00000x.domain.xxx``) cumulatively cannot exceed 64 characters. For example, if the ``node_name`` provided in ``input/provision_config.yml`` .. 
note:: - * Use the format specified under `NFS inventory in the Sample Files <../../samplefiles.html#nfs-server-inventory-file>`_ for inventory. - - * Omnia only supports ``/home`` as the ``homeDirectory``. - -To set up IPA services for the NFS node in the target cluster, run the following command from the ``utils/cluster`` folder on the control plane: :: - - cd utils/cluster - ansible-playbook install_ipa_client.yml -i inventory -e kerberos_admin_password="" -e ipa_server_hostname="" -e domain_name="" -e ipa_server_ipadress="" - - -.. include:: ../../Appendices/hostnamereqs.rst - -.. note:: Use the format specified under `NFS inventory in the Sample Files <../../samplefiles.html#nfs-server-inventory-file>`_ for inventory. \ No newline at end of file + * Use the format specified under `NFS inventory in the Sample Files <../OmniaInstallGuide/samplefiles.html#nfs-server-inventory-file>`_ for inventory. + * Omnia only supports ``/home`` as the ``homeDirectory``. \ No newline at end of file diff --git a/docs/source/Utils/index.rst b/docs/source/Utils/index.rst new file mode 100644 index 000000000..3337495e0 --- /dev/null +++ b/docs/source/Utils/index.rst @@ -0,0 +1,15 @@ +Utilities provided by Omnia +============================== + +Omnia provides an array of utilities as separate playbooks. Using these playbooks, you can do things like: + +.. toctree:: + :maxdepth: 2 + + software_update + KernelUpdate + AdditionalNIC + portcleanup + timescaledb_utility + freeipa_installation + tuneD \ No newline at end of file diff --git a/docs/source/Roles/Utils/kernel_param_update.rst b/docs/source/Utils/kernel_param_update.rst similarity index 70% rename from docs/source/Roles/Utils/kernel_param_update.rst rename to docs/source/Utils/kernel_param_update.rst index 7f3ed1604..0c621d9ce 100644 --- a/docs/source/Roles/Utils/kernel_param_update.rst +++ b/docs/source/Utils/kernel_param_update.rst @@ -8,16 +8,25 @@ For the supported kernel command-line parameters, `click here `_. The inventory file is case-sensitive. Follow the format provided in the sample file link. +Where the inventory refers to a file listing all nodes per the format provided in `inventory file <../OmniaInstallGuide/samplefiles.html>`_. The inventory file is case-sensitive. Follow the format provided in the sample file link. diff --git a/docs/source/Roles/Utils/portcleanup.rst b/docs/source/Utils/portcleanup.rst similarity index 100% rename from docs/source/Roles/Utils/portcleanup.rst rename to docs/source/Utils/portcleanup.rst diff --git a/docs/source/Roles/Utils/rhsm_subscription.rst b/docs/source/Utils/rhsm_subscription.rst similarity index 100% rename from docs/source/Roles/Utils/rhsm_subscription.rst rename to docs/source/Utils/rhsm_subscription.rst diff --git a/docs/source/Roles/Utils/software_update.rst b/docs/source/Utils/software_update.rst similarity index 69% rename from docs/source/Roles/Utils/software_update.rst rename to docs/source/Utils/software_update.rst index 7b6069492..fb2147d8b 100644 --- a/docs/source/Roles/Utils/software_update.rst +++ b/docs/source/Utils/software_update.rst @@ -6,16 +6,16 @@ To install multiple packages on cluster nodes in a bulk operation, the ``softwar **Prerequisites** * All cluster nodes should be running RHEL, Rocky Linux, or Ubuntu OS. - * Download the packages using ``local_repo.yml``. For more information, `click here <../../InstallationGuides/LocalRepo/index.html>`_. - * Verify that the cluster nodes are in the ``booted`` state. 
For more information, `click here <../../InstallationGuides/InstallingProvisionTool/ViewingDB.html>`_. + * Download the packages using ``local_repo.yml``. + * Verify that the cluster nodes are in the ``booted`` state. To customize the software update, enter the following parameters in ``utils/software_update/software_update_config.yml``: .. csv-table:: Parameters for software_update_config.yml - :file: ../../Tables/software_update_config.csv - :header-rows: 1 - :keepspace: + :file: ../Tables/software_update_config.csv + :header-rows: 1 + :keepspace: To run the playbook, run the following commands: :: diff --git a/docs/source/Roles/Utils/timescaledb_utility.rst b/docs/source/Utils/timescaledb_utility.rst similarity index 96% rename from docs/source/Roles/Utils/timescaledb_utility.rst rename to docs/source/Utils/timescaledb_utility.rst index 89a915e57..23f3363c4 100644 --- a/docs/source/Roles/Utils/timescaledb_utility.rst +++ b/docs/source/Utils/timescaledb_utility.rst @@ -1,7 +1,7 @@ TimescaleDB utility -------------------- +--------------------- -Telemetry metrics stored in a timescaleDB can be copied locally in a csv format. This file can be used to generate insights into key statistics in your cluster. +Telemetry metrics stored in a timescaleDB can be copied locally in a ``.csv`` format. This file can be used to generate insights into key statistics in your cluster. To customize the local copy of the timescale DB, fill out the below parameters in ``utils/timescaledb_utility/timescaledb_utility_config.yml``. diff --git a/docs/source/Utils/tuneD.rst b/docs/source/Utils/tuneD.rst new file mode 100644 index 000000000..934c63726 --- /dev/null +++ b/docs/source/Utils/tuneD.rst @@ -0,0 +1,56 @@ +Performance profile configuration +================================== + +.. caution:: Performance profile installation and accelerator configuration is supported exclusively for Intel Gaudi accelerators present in Ubuntu clusters. + +Performance profiles enable you to optimize system performance for specific workloads. Omnia supports the configuration of performance profiles for Ubuntu clusters that have Intel Gaudi accelerators. These profiles come with predefined settings tailored to different use cases. +For more information, `click here `_. + +**Prerequisites** + +* **Create an Inventory file** + +To configure performance profiles, list all the nodes for which you want to apply the profiles in an inventory file. A sample inventory looks like: :: + + node3 + node1 + +* **Configure Performance profiles** + +In the ``utils/performance_profile/performance_profile_config.yml`` file, under ``intel_gpu``, add or alter the values based on the following list of parameters: + +.. csv-table:: Parameters for performance profile configuration + :file: ../Tables/performance_config.csv + :header-rows: 1 + :keepspace: + +Here's a sample of the default ``performance_profile_config.yml`` file: :: + + intel_gpu: + performance_profile: "accelerator-performance" + performance_profile_plugin: + sysctl: + - vm.nr_hugepages: 156300 + reboot_required: "no" + +Here's an example for adding/modifying multiple plugins in the ``performance_profile_config.yml`` file: :: + + intel_gpu: + performance_profile: "accelerator-performance" + performance_profile_plugin: + sysctl: + - vm.nr_hugepages: 156300 + cpu: + - force_latency: 99 + disk: + - read_ahead_kb: 4096 + reboot_required: "no" + +.. 
note:: For Intel Gaudi accelerators, Omnia recommends adding ``vm.nr_hugepages`` as a profile parameter under the ``sysctl`` plugin and setting its value to 156300.
+
+* **Execute the playbook**
+
+Run the playbook using the following commands: ::
+
+    cd utils/performance_profile
+    ansible-playbook performance_profile.yml -i inventory
\ No newline at end of file
diff --git a/docs/source/bestpractices.rst b/docs/source/bestpractices.rst
index bd3e36daa..47cc4d964 100644
--- a/docs/source/bestpractices.rst
+++ b/docs/source/bestpractices.rst
@@ -1,13 +1,13 @@
 Best Practices
 ==============
-* Ensure that PowerCap policy is disabled and the BIOS system profile is set to 'Performance' on the Control Plane.
+* Ensure that PowerCap policy is disabled and the BIOS system profile is set to 'Performance' on the OIM.
 * Always execute playbooks within the directory they reside in. That is, always change directories (``cd``) to the path where the playbook resides before running the playbook.
-* Ensure that there is at least 50% (~50GB) free space on the Control Plane root partition before running Omnia. To maintain the free space required, place any ISO files used in the ``/home`` directory.
-* Use a `PXE mapping file `_ even when using DHCP configuration to ensure that IP assignments remain persistent across Control Plane reboots.
-* Avoid rebooting the Control Plane as much as possible to ensure that all network configuration does not get disturbed.
-* Review the prerequisites before running Omnia Scripts.
-* Ensure that the firefox version being used on the control plane is the latest available. This can be achieved using ``dnf update firefox -y``
+* Ensure that there is at least 50% (~50GB) free space on the OIM root partition before running Omnia. To maintain the free space required, place the required ISO files in the ``/home`` directory.
+* Use a `PXE mapping file `_ even when using DHCP configuration to ensure that IP assignments remain persistent across OIM reboots.
+* Avoid rebooting the OIM as much as possible, so that the network configuration does not get disturbed.
+* Review the prerequisites before running Omnia scripts.
+* Ensure that the Firefox version being used on the RHEL/Rocky Linux OIM is the latest available. This can be achieved using ``dnf update firefox -y``
 * It is recommended to configure devices using Omnia playbooks for better interoperability and ease of access.
-* Ensure that the ``/var`` partition has adequate space to complete commands and store images.
-* Run ``yum update --security`` routinely on the control plane for the latest security updates.
\ No newline at end of file
+* Ensure that the ``/var`` partition has adequate space to execute commands and store images.
+* Run ``yum update --security`` routinely on the RHEL/Rocky Linux OIM for the latest security updates.
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 17f9f78b7..80a1e6c29 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -8,9 +8,9 @@
 project = 'Omnia'
-copyright = '2024, Dell Technologies'
+copyright = '2025, Dell Technologies'
 author = 'dell/omnia'
-release = '1.6'
+release = '1.7'
 rst_epilog = "If you have any feedback about Omnia documentation, please reach out at `omnia.readme@dell.com `_."
import sys diff --git a/docs/source/images/CSI_1.png b/docs/source/images/CSI_1.png new file mode 100644 index 000000000..04698a86f Binary files /dev/null and b/docs/source/images/CSI_1.png differ diff --git a/docs/source/images/CSI_OneFS.png b/docs/source/images/CSI_OneFS.png new file mode 100644 index 000000000..c2cde8163 Binary files /dev/null and b/docs/source/images/CSI_OneFS.png differ diff --git a/docs/source/images/CSI_delete_deployment.png b/docs/source/images/CSI_delete_deployment.png new file mode 100644 index 000000000..552837691 Binary files /dev/null and b/docs/source/images/CSI_delete_deployment.png differ diff --git a/docs/source/images/CSI_delete_storageclass.png b/docs/source/images/CSI_delete_storageclass.png new file mode 100644 index 000000000..29684908e Binary files /dev/null and b/docs/source/images/CSI_delete_storageclass.png differ diff --git a/docs/source/images/CSI_get_deployment.png b/docs/source/images/CSI_get_deployment.png new file mode 100644 index 000000000..9c7092e9f Binary files /dev/null and b/docs/source/images/CSI_get_deployment.png differ diff --git a/docs/source/images/CSI_get_storageclass.png b/docs/source/images/CSI_get_storageclass.png new file mode 100644 index 000000000..22cd2c810 Binary files /dev/null and b/docs/source/images/CSI_get_storageclass.png differ diff --git a/docs/source/images/Dedicated_NT.png b/docs/source/images/Dedicated_NT.png new file mode 100644 index 000000000..0a2a87a72 Binary files /dev/null and b/docs/source/images/Dedicated_NT.png differ diff --git a/docs/source/images/DeepSpeed.png b/docs/source/images/DeepSpeed.png new file mode 100644 index 000000000..e2d8779ee Binary files /dev/null and b/docs/source/images/DeepSpeed.png differ diff --git a/docs/source/images/Hybrid_NT.png b/docs/source/images/Hybrid_NT.png new file mode 100644 index 000000000..2755b4a75 Binary files /dev/null and b/docs/source/images/Hybrid_NT.png differ diff --git a/docs/source/images/LOM_NT.png b/docs/source/images/LOM_NT.png new file mode 100644 index 000000000..6ec3fb574 Binary files /dev/null and b/docs/source/images/LOM_NT.png differ diff --git a/docs/source/images/Omnia_Architecture.png b/docs/source/images/Omnia_Architecture.png index 7fb0f5a4c..4c63d9f34 100644 Binary files a/docs/source/images/Omnia_Architecture.png and b/docs/source/images/Omnia_Architecture.png differ diff --git a/docs/source/images/Prometheus_datasource.png b/docs/source/images/Prometheus_datasource.png new file mode 100644 index 000000000..ad1178451 Binary files /dev/null and b/docs/source/images/Prometheus_datasource.png differ diff --git a/docs/source/images/Prometheus_datasource2.png b/docs/source/images/Prometheus_datasource2.png new file mode 100644 index 000000000..e5cc2e848 Binary files /dev/null and b/docs/source/images/Prometheus_datasource2.png differ diff --git a/docs/source/images/Prometheus_ui.png b/docs/source/images/Prometheus_ui.png new file mode 100644 index 000000000..34f97173e Binary files /dev/null and b/docs/source/images/Prometheus_ui.png differ diff --git a/docs/source/images/Prometheus_ui_2.png b/docs/source/images/Prometheus_ui_2.png new file mode 100644 index 000000000..0c532bff3 Binary files /dev/null and b/docs/source/images/Prometheus_ui_2.png differ diff --git a/docs/source/images/Prometheus_ui_3.png b/docs/source/images/Prometheus_ui_3.png new file mode 100644 index 000000000..570b04ab2 Binary files /dev/null and b/docs/source/images/Prometheus_ui_3.png differ diff --git a/docs/source/images/SecurityControlsMap.png 
b/docs/source/images/SecurityControlsMap.png new file mode 100644 index 000000000..6b5698a35 Binary files /dev/null and b/docs/source/images/SecurityControlsMap.png differ diff --git a/docs/source/images/TimescaleDB_Ports.png b/docs/source/images/TimescaleDB_Ports.png index 7b60c62a5..a3cba25fe 100644 Binary files a/docs/source/images/TimescaleDB_Ports.png and b/docs/source/images/TimescaleDB_Ports.png differ diff --git a/docs/source/images/installation_flow.png b/docs/source/images/installation_flow.png new file mode 100644 index 000000000..8c843e981 Binary files /dev/null and b/docs/source/images/installation_flow.png differ diff --git a/docs/source/images/intel_known_issue.png b/docs/source/images/intel_known_issue.png new file mode 100644 index 000000000..2c0ce2213 Binary files /dev/null and b/docs/source/images/intel_known_issue.png differ diff --git a/docs/source/images/roce_pod_failure.png b/docs/source/images/roce_pod_failure.png deleted file mode 100644 index 0907acb41..000000000 Binary files a/docs/source/images/roce_pod_failure.png and /dev/null differ diff --git a/docs/source/images/securityControlsMap.jpg b/docs/source/images/securityControlsMap.jpg deleted file mode 100644 index 5df334cfa..000000000 Binary files a/docs/source/images/securityControlsMap.jpg and /dev/null differ diff --git a/docs/source/images/slurm_epel.png b/docs/source/images/slurm_epel.png new file mode 100644 index 000000000..d5862d985 Binary files /dev/null and b/docs/source/images/slurm_epel.png differ diff --git a/docs/source/images/telemetry_mi300.png b/docs/source/images/telemetry_mi300.png new file mode 100644 index 000000000..e14a554b8 Binary files /dev/null and b/docs/source/images/telemetry_mi300.png differ diff --git a/docs/source/images/tuneD_intel_habana.png b/docs/source/images/tuneD_intel_habana.png new file mode 100644 index 000000000..ddf633bf1 Binary files /dev/null and b/docs/source/images/tuneD_intel_habana.png differ diff --git a/docs/source/images/virtual_env_1.png b/docs/source/images/virtual_env_1.png new file mode 100644 index 000000000..1cd6a5aa6 Binary files /dev/null and b/docs/source/images/virtual_env_1.png differ diff --git a/docs/source/images/virtual_env_2.png b/docs/source/images/virtual_env_2.png new file mode 100644 index 000000000..a7117fbd1 Binary files /dev/null and b/docs/source/images/virtual_env_2.png differ diff --git a/docs/source/images/virtual_env_deactivate.png b/docs/source/images/virtual_env_deactivate.png new file mode 100644 index 000000000..61665621f Binary files /dev/null and b/docs/source/images/virtual_env_deactivate.png differ diff --git a/docs/source/images/virtual_env_error_1.png b/docs/source/images/virtual_env_error_1.png new file mode 100644 index 000000000..545481d88 Binary files /dev/null and b/docs/source/images/virtual_env_error_1.png differ diff --git a/docs/source/images/virtual_env_error_2.png b/docs/source/images/virtual_env_error_2.png new file mode 100644 index 000000000..e2a6c21c5 Binary files /dev/null and b/docs/source/images/virtual_env_error_2.png differ diff --git a/docs/source/images/vllm_intel.png b/docs/source/images/vllm_intel.png new file mode 100644 index 000000000..e33f370de Binary files /dev/null and b/docs/source/images/vllm_intel.png differ diff --git a/docs/source/index.rst b/docs/source/index.rst index 6a32d0ff6..ee6bc4478 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -18,7 +18,7 @@ Ansible playbook-based deployment of Slurm and Kubernetes on servers running an Omnia is made available under the 
`Apache 2.0 license. `_ -.. note:: Omnia playbooks are licensed under the Apache 2.0 license. Once an end-user initiates Omnia, that end-user will deploy other open-source and/or third-party software that is licensed separately by their respective developer communities and/or third parties. For a comprehensive list of software and their licenses, `click here `_. Dell (or any other contributors) shall have no liability regarding (and no responsibility to provide support for) an end-users use of any open- source and/or third-party software and OMNIA users are solely responsible for ensuring that they are complying with all such licenses. Omnia is provided “as is” without any warranty, express or implied. Dell (or any other contributors) shall have no liability for any direct, indirect, incidental, punitive, special, or consequential damages for an end-user's use of Omnia. +.. note:: Omnia playbooks are licensed under the Apache 2.0 license. Once an end-user initiates Omnia, that end-user will deploy other open-source and/or third-party software that is licensed separately by their respective developer communities and/or third parties. For a comprehensive list of software and their licenses, `click here `_. Dell (or any other contributors) shall have no liability regarding (and no responsibility to provide support for) an end-users use of any open-source and/or third-party software and OMNIA users are solely responsible for ensuring that they are complying with all such licenses. Omnia is provided “as is” without any warranty, express or implied. Dell (or any other contributors) shall have no liability for any direct, indirect, incidental, punitive, special, or consequential damages for an end-user's use of Omnia. For a better understanding of what Omnia does, check out our `docs `_! @@ -50,19 +50,20 @@ For a better understanding of what Omnia does, check out our `docs `_. +- Currently, Omnia only supports the splitting of switch ports. Switch ports cannot be un-split using the switch configuration script. - The IP subnet 10.4.0.0 cannot be used for any networks on the Omnia cluster as it is reserved for Nerdctl. - Installation of vLLM and racadam via Omnia is not supported on Ubuntu 20.04. -- Omnia v1.6 does not support configuration of a DNS server on the control plane; that is, the ``DNS`` parameter in `input/network_spec.yml `_ is not supported. -- Minimal OS version of RHEL/Rocky Linux and "desktop image" version of Ubuntu is not supported on the control plane. +- The "desktop image" version of Ubuntu is not supported on the OIM. 
diff --git a/examples/ai_examples/intel/deepSpeed/ds_configuration.yml b/examples/ai_examples/intel/deepSpeed/ds_configuration.yml new file mode 100644 index 000000000..402dcae6b --- /dev/null +++ b/examples/ai_examples/intel/deepSpeed/ds_configuration.yml @@ -0,0 +1,148 @@ +--- +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: gaudi-llm-ds-ft + namespace: workloads +spec: + slotsPerWorker: 8 + runPolicy: + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + name: gaudi-llm-ds-ft-launcher + env: + - name: HF_HOME + value: /storage/huggingface + - name: http_proxy + value: "" + - name: https_proxy + value: "" + - name: no_proxy + value: "127.0.0.1,localhost" + - name: LLM_MODEL + value: "meta-llama/Meta-Llama-3.1-8B-Instruct" + - name: NUM_HPU + value: "8" + - name: NUM_EPOCHS + value: "3" + - name: HUGGING_FACE_HUB_TOKEN + value: "" + command: ["/bin/bash", "-c"] + args: + - >- + HOSTSFILE=$OMPI_MCA_orte_default_hostfile; + MASTER_ADDR="$(head -n 1 $HOSTSFILE | sed -n s/[[:space:]]slots.*//p)"; + SETUP_CMD="git clone https://github.com/huggingface/optimum-habana /optimum-habana"; + export no_proxy=$no_proxy,$KUBERNETES_SERVICE_HOST; + NUM_NODES=$(wc -l < $HOSTSFILE); + N_CARDS=$((NUM_NODES*NUM_HPU)); + + git clone https://github.com/huggingface/optimum-habana /optimum-habana; + cd /optimum-habana; + git checkout v1.14.1; + sed -i '194s|deepspeed|deepspeed --force_multi|' optimum/habana/distributed/distributed_runner.py; + pip install .; + pip install -r examples/language-modeling/requirements.txt; + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0; + + mpirun --npernode 1 \ + --tag-output \ + --allow-run-as-root \ + --prefix $MPI_ROOT \ + -x http_proxy=$http_proxy \ + -x https_proxy=$https_proxy \ + -x no_proxy=$no_proxy \ + -x LLM_MODEL=$LLM_MODEL \ + -x HF_HOME=$HF_HOME \ + -x NUM_HPU=$NUM_HPU \ + -x HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ + bash -i -c ' + git clone https://github.com/huggingface/optimum-habana /optimum-habana + cd /optimum-habana + git checkout v1.14.1 + hf_home_var="os.environ[\"HF_HOME\"] = \"${HF_HOME}\"" + token_var="os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = \"${HUGGING_FACE_HUB_TOKEN}\"" + https_var="os.environ[\"https_proxy\"] = \"${https_proxy}\"" + http_var="os.environ[\"http_proxy\"] = \"${http_proxy}\"" + no_proxy_var="os.environ[\"no_proxy\"] = \"${no_proxy}\"" + sed -i "56i\\${https_var}" examples/language-modeling/run_lora_clm.py + sed -i "57i\\${http_var}" examples/language-modeling/run_lora_clm.py + sed -i "58i\\${hf_home_var}" examples/language-modeling/run_lora_clm.py + sed -i "59i\\${token_var}" examples/language-modeling/run_lora_clm.py + sed -i "60i\\${no_proxy_var}" examples/language-modeling/run_lora_clm.py + + pip install . 
+ pip install -r examples/language-modeling/requirements.txt + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0 + '; + + cd /optimum-habana/examples/language-modeling/; + python ../gaudi_spawn.py --hostfile=$HOSTSFILE --use_deepspeed --world_size $N_CARDS run_lora_clm.py --model_name_or_path $LLM_MODEL \ + --dataset_name tatsu-lab/alpaca \ + --bf16 True \ + --output_dir ./model_lora_llama_ddp \ + --num_train_epochs $NUM_EPOCHS \ + --per_device_train_batch_size 8 \ + --gradient_accumulation_steps 2 \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --learning_rate 3e-4 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "constant" \ + --max_grad_norm 0.3 \ + --logging_steps 1 \ + --do_train \ + --do_eval \ + --use_habana \ + --use_lazy_mode \ + --throughput_warmup_steps 3 \ + --lora_rank=8 \ + --lora_alpha=16 \ + --lora_dropout=0.05 \ + --lora_target_modules "q_proj" "v_proj" \ + --dataset_concatenation \ + --max_seq_length 512 \ + --ddp_bucket_cap_mb 50 \ + --adam_epsilon 1e-08 \ + --validation_split_percentage 4 \ + --low_cpu_mem_usage True; + + volumeMounts: + - name: datasets + mountPath: /storage + volumes: + - name: datasets + persistentVolumeClaim: + claimName: shared-model + readOnly: false + Worker: + replicas: 1 + template: + spec: + hostIPC: true + containers: + - image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + name: gaudi-llm-ds-ft-worker + resources: + limits: + habana.ai/gaudi: 8 + memory: 250Gi + hugepages-2Mi: 312600Mi + requests: + habana.ai/gaudi: 8 + memory: 250Gi + hugepages-2Mi: 312600Mi + volumeMounts: + - name: datasets + mountPath: /storage + volumes: + - name: datasets + persistentVolumeClaim: + claimName: shared-model + readOnly: false diff --git a/examples/ai_examples/intel/vllm/vllm_configuration.yml b/examples/ai_examples/intel/vllm/vllm_configuration.yml new file mode 100644 index 000000000..6d0d3ce5f --- /dev/null +++ b/examples/ai_examples/intel/vllm/vllm_configuration.yml @@ -0,0 +1,89 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: vllm-llama-app + name: vllm-llama-svc + namespace: workloads +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 8000 + selector: + app: vllm-llama-app + type: NodePort + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: vllm-llama-app + name: vllm-llama + namespace: workloads +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-llama-app + template: + metadata: + labels: + app: vllm-llama-app + spec: + containers: + - image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + name: vllm-llama-openai + imagePullPolicy: Always + workingDir: /root + env: + - name: HF_HOME + value: /storage/huggingface + - name: http_proxy + value: "" + - name: https_proxy + value: "" + - name: no_proxy + value: "127.0.0.1,localhost" + - name: LLM_MODEL + value: "meta-llama/Meta-Llama-3.1-8B-Instruct" + - name: HUGGING_FACE_HUB_TOKEN + value: "" + - name: HABANA_VISIBLE_DEVICES + value: all + - name: NUM_HPU + value: "8" + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: "true" + command: + - "/bin/sh" + - "-c" + - | + git clone -b v0.5.3.post1+Gaudi-1.18.0 https://github.com/HabanaAI/vllm-fork.git + cd vllm-fork + pip install -e . 
diff --git a/examples/ai_examples/intel/vllm/vllm_configuration.yml b/examples/ai_examples/intel/vllm/vllm_configuration.yml new file mode 100644 index 000000000..6d0d3ce5f --- /dev/null +++ b/examples/ai_examples/intel/vllm/vllm_configuration.yml @@ -0,0 +1,89 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: vllm-llama-app + name: vllm-llama-svc + namespace: workloads +spec: + ports: + - port: 8000 + protocol: TCP + targetPort: 8000 + selector: + app: vllm-llama-app + type: NodePort + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: vllm-llama-app + name: vllm-llama + namespace: workloads +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-llama-app + template: + metadata: + labels: + app: vllm-llama-app + spec: + containers: + - image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + name: vllm-llama-openai + imagePullPolicy: Always + workingDir: /root + env: + - name: HF_HOME + value: /storage/huggingface + - name: http_proxy + value: "" + - name: https_proxy + value: "" + - name: no_proxy + value: "127.0.0.1,localhost" + - name: LLM_MODEL + value: "meta-llama/Meta-Llama-3.1-8B-Instruct" + - name: HUGGING_FACE_HUB_TOKEN + value: "" + - name: HABANA_VISIBLE_DEVICES + value: all + - name: NUM_HPU + value: "8" + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: "true" + command: + - "/bin/sh" + - "-c" + - | + git clone -b v0.5.3.post1+Gaudi-1.18.0 https://github.com/HabanaAI/vllm-fork.git + cd vllm-fork + pip install -e . + python -m vllm.entrypoints.openai.api_server --model $LLM_MODEL --dtype auto --block-size 128 --max-num-seqs 128 --gpu-memory-utilization 0.5 --tensor-parallel-size $NUM_HPU + ports: + - containerPort: 8000 + protocol: TCP + resources: + limits: + habana.ai/gaudi: 8 + memory: 400Gi + hugepages-2Mi: 312600Mi + requests: + habana.ai/gaudi: 8 + memory: 400Gi + hugepages-2Mi: 312600Mi + volumeMounts: + - name: datasets + mountPath: /storage + volumes: + - name: datasets + persistentVolumeClaim: + claimName: shared-model + readOnly: false
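The Service above exposes vLLM's OpenAI-compatible API on port 8000 through a NodePort. A minimal client sketch, assuming the endpoint is reachable at a placeholder node IP and port (look them up with `kubectl get svc -n workloads vllm-llama-svc`) and that the model name matches LLM_MODEL in the Deployment:

```python
# Query the OpenAI-compatible endpoint served by vllm-llama-svc.
# Assumptions: <node_ip>:<node_port> is the NodePort mapping of port 8000;
# the model name matches the LLM_MODEL env var above.
import requests

def chat(base_url: str, prompt: str) -> str:
    resp = requests.post(
        f"{base_url}/v1/chat/completions",
        json={
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": prompt}],
        },
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

if __name__ == "__main__":
    print(chat("http://<node_ip>:<node_port>", "What is Intel Gaudi?"))
```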
diff --git a/examples/ai_examples/nvidia/dell_pretrained_model/dell_pretrained_model_nvidia.py b/examples/ai_examples/nvidia/dell_pretrained_model/dell_pretrained_model_nvidia.py new file mode 100644 index 000000000..7c6b4f63c --- /dev/null +++ b/examples/ai_examples/nvidia/dell_pretrained_model/dell_pretrained_model_nvidia.py @@ -0,0 +1,422 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script allows you to deploy, infer, or delete a Dell authentic +pretrained model in a Kubernetes cluster for Nvidia Platforms. + +Prerequisites: +- Kubernetes (k8s) must be installed and configured on your cluster. +- An Nvidia GPU must be present in the kube node, and CUDA must be installed. +- Verify the PRETRAINED_MODEL_CONFIG section for any changes and + update user_HF_token if required by the model. +- The Kube control plane needs an active Internet connection. If there is no active Internet + connection, you need to set the proxy environment variables to have access to the Internet. + The proxy environment variables should be set to the IP address of the Omnia Infrastructure Manager. + For example: + export http_proxy=http://<oim_ip>:3128 + export https_proxy=http://<oim_ip>:3128 + +Usage: + 1. Deploy the model and service: + python3 dell_pretrained_model_nvidia.py --deploy + + 2. Run an inference using a query, from within the compute cluster: + python3 dell_pretrained_model_nvidia.py --infer "<query>" + + - If you omit the query string, a default query will be used: + python3 dell_pretrained_model_nvidia.py --infer + + 3. Run an inference using a specific service IP, from outside the compute cluster: + python3 dell_pretrained_model_nvidia.py --infer "<query>" --service-ip <service_ip> + + Note: check the service IP on the kube control plane using kubectl get svc pretrained-model-service + + - If you omit the query string, a default query will be used with the provided service IP: + python3 dell_pretrained_model_nvidia.py --infer --service-ip <service_ip> + + 4. Delete the deployed model service: + python3 dell_pretrained_model_nvidia.py --delete +""" + +import subprocess +import time +import argparse +import logging +import sys +import ipaddress +import requests + +SERVICE_NAME = "pretrained-model-service" # must be the same as defined in PRETRAINED_MODEL_CONFIG +# Define Dell Pretrained Model Deployment config YAML +PRETRAINED_MODEL_CONFIG = """ +apiVersion: v1 +kind: Service +metadata: + name: pretrained-model-service +spec: + type: LoadBalancer + ports: + - protocol: TCP + port: 80 + targetPort: 80 + selector: + app: pretrained-model-app +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pretrained-model-app +spec: + replicas: 1 + selector: + matchLabels: + app: pretrained-model-app + template: + metadata: + labels: + app: pretrained-model-app + hf.co/model: meta-llama--Meta-Llama-3.1-8b-Instruct + hf.co/task: text-generation + spec: + runtimeClassName: nvidia + containers: + - name: pretrained-model-container + image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct + resources: + limits: + nvidia.com/gpu: 1 + env: + - name: NUM_SHARD + value: "1" + - name: MAX_BATCH_PREFILL_TOKENS + value: "32768" + - name: MAX_INPUT_TOKENS + value: "8000" + - name: MAX_TOTAL_TOKENS + value: "8192" + - name: HF_TOKEN + value: "user_HF_token" + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi +""" + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +K8S_EXCEPTION = "not found" + +def run_command(command, input_data=None): + """ + Runs a shell command and returns the output. + Args: + command (str): The shell command to be executed. + input_data (str, optional): Data to be passed to the command via stdin. + Returns: + str: The output of the command. + Raises: + RuntimeError: If the command fails. + """ + try: + result = subprocess.run( + command, + shell=True, + check=True, + capture_output=True, + text=True, + input=input_data + ) + return result.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Run Command '{command}' failed with error: {e.stderr.strip()}") from e + +def check_kubectl_availability(): + """ + Checks if Kubernetes kubectl is installed and available. + This function runs `kubectl get nodes` to ensure that kubectl is + installed and the cluster is reachable. + """ + try: + run_command("kubectl get nodes") + logging.info("Prerequisites- k8s present.") + except FileNotFoundError as e: + logging.error( + "k8s, kubectl is not available or not configured properly: %s", + e + ) + sys.exit(1) + except RuntimeError as e: + logging.warning( + "k8s is not installed and configured. Run omnia.yml to set up K8s in the cluster: %s", + e + ) + sys.exit(1)
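The run_command helper above wraps subprocess.run with shell=True, captured output, and an optional stdin, which is how the script later feeds its manifest to kubectl without writing a temporary file. A short usage sketch (commands illustrative, assuming a reachable cluster):

```python
# Illustrative uses of the run_command helper defined above.
nodes = run_command("kubectl get nodes -o name")  # capture stdout as text
run_command("kubectl apply -f -", input_data=PRETRAINED_MODEL_CONFIG)  # manifest via stdin
```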
+def check_nvidia_device_plugin_availability(): + """ + Check if the Nvidia device plugin is present in the output of the 'kubectl get pods' command. + Returns: + None + """ + try: + result = run_command("""kubectl get pods -n nvidia-device-plugin | + grep -v -e 'gpu-feature-discovery' -e 'node-feature-discovery'""") + if "nvidia-device-plugin" not in result: + logging.warning( + "Nvidia device plugin is not present. Run omnia.yml to set up the Nvidia device plugin." + ) + sys.exit(1) + logging.info("Prerequisites- Nvidia device plugin present") + except RuntimeError as e: + logging.warning("An error occurred while checking the Nvidia device plugin: %s", + e + ) + sys.exit(1) + +def is_valid_ip(ip): + """ + Validates if the provided string is a valid IP address. + Args: + ip (str): The IP address to validate. + Returns: + bool: True if the IP address is valid, False otherwise. + """ + try: + ipaddress.ip_address(ip) + return True + except ValueError: + return False + +def check_existing_deployment(service_name): + """ + Checks if the pretrained model service is already deployed. + Args: + service_name (str): The name of the pretrained model service to check. + Returns: + bool: True if the service already exists, False otherwise. + """ + try: + existing_service = run_command(f"kubectl get svc {service_name}") + if existing_service: + logging.warning( + "Service '%s' already exists. Skipping deployment.", + service_name + ) + return True + except RuntimeError as e: + if "NotFound" in str(e): + logging.info( + "Service '%s' is not present. Proceeding with deployment.", + service_name + ) + else: + logging.error( + "Failed to check service '%s': %s", + service_name, e + ) + raise + return False + return False + +def deploy_pretrained_model(service_name): + """ + Deploys the pretrained model using the defined YAML config. + Args: + service_name (str): The name of the service to deploy. + """ + if not check_existing_deployment(service_name): + logging.info("Creating pretrained model deployment...") + run_command("kubectl apply -f -", input_data=PRETRAINED_MODEL_CONFIG) + logging.info("""Deployment initiated. Check deployment status using kubectl get pods + and kubectl get svc pretrained-model-service for the external IP""") + +def delete_pretrained_model_resources(): + """ + Deletes the pretrained model resources defined in the YAML config. + This function deletes both the deployment and service for the pretrained model. + If the resources are not found, it informs the user and does not raise an error. + """ + logging.info("Deleting pretrained model deployment and service...") + try: + run_command("kubectl delete -f -", input_data=PRETRAINED_MODEL_CONFIG) + logging.info("Pretrained model deployment and service deleted.") + except RuntimeError as e: + if "NotFound" in str(e): + logging.warning( + "Pretrained model deployment or service not found. " + "They may have already been deleted." + ) + elif K8S_EXCEPTION in str(e): + logging.warning( + "k8s, kubectl is not available or not configured properly" + ) + sys.exit(1) + else: + logging.error("Failed to delete pretrained model resources: %s", e) + raise + +def get_pretrained_model_service_ip(svc_name): + """ + Waits for the pretrained service to get an external IP and returns it. + Args: + svc_name (str): The name of the pretrained model service to check. + Returns: + str: The external IP of the service, or None if the service does not exist. + """ + max_retries = 5 + retry_count = 0 + while retry_count < max_retries: + try: + svc_ip = run_command( + f"kubectl get svc {svc_name} -o jsonpath='{{.status.loadBalancer.ingress[0].ip}}'" + ) + if svc_ip: + return svc_ip + except RuntimeError as e: + if "NotFound" in str(e): + logging.warning( + "Service '%s' not found. It may not be deployed yet. " + "To deploy it, use the --deploy flag.", + svc_name + ) + elif K8S_EXCEPTION in str(e): + logging.warning( + "kubectl was not found, so the service could not be located. "
+ "Try inferencing by giving --service-ip" + ) + return None + retry_count += 1 + logging.info("Pretrained Model Service is not yet available. Retrying in 10 seconds...") + time.sleep(10) + logging.error( + "Failed to get the external IP of the service '%s' after %d retries.", + svc_name, + max_retries + ) + return None + +def run_inferencing(service_ip, query): + """ + Runs the inferencing process against the model using the provided query. + + Args: + service_ip (str): The external IP of the pretrained model service. + query (str): The query string to be sent to the model for inferencing. + """ + url = f"http://{service_ip}:80/v1/chat/completions" + headers = {"Content-Type": "application/json"} + data = {"model": "pretrained_model", "messages": [{"role": "user", "content": query}]} + try: + response = requests.post(url, json=data, headers=headers, timeout=10) + response.raise_for_status() # Raise an exception for HTTP errors + try: + response_data = response.json() + logging.info("Inference response received.") + except ValueError: + logging.error("Failed to decode JSON response from the service.") + sys.exit(1) + except requests.exceptions.ConnectionError: + logging.error( + "Failed to connect to service at IP: %s. The service may be starting or not running." + "Try again after verifying service is running", + service_ip + ) + sys.exit(1) + except requests.exceptions.Timeout: + logging.error("The request to the service timed out.") + sys.exit(1) + except requests.exceptions.HTTPError as http_err: + logging.error("HTTP error occurred: %s", http_err) + sys.exit(1) + except requests.exceptions.RequestException as req_err: + logging.error("Inferencing failed due to an unexpected error: %s", req_err) + sys.exit(1) + + if response_data.get('choices'): + message = response_data['choices'][0]['message']['content'] + logging.info(message) + else: + logging.error("No valid response received from the pretrained model.") + sys.exit(1) + +def main(): + """ + Main function to handle argument parsing and orchestrate the deployment, + inferencing, and deletion. This function processes command-line arguments + for deploying the model, running inferencing, or deleting the Kubernetes resources. + """ + parser = argparse.ArgumentParser( + description="""Automated Dell Enterprise Pretrained Model Deployment and Inferencing. + Prerequisites: + - Kubernetes (k8s) must be installed and configured on your cluster. + - Nvidia GPU should present in kube node and cuda must be installed. + - Verify PRETRAINED_MODEL_CONFIG section for any changes and update user_HF_token if required by model. + - The Kube control plane needs an active Internet connection. If there is no active Internet connection, + you need to set the proxy environment variables to have access to the Internet. The proxy environment variables + should be set to the IP address of the Omnia Infrastructure Manager. For example: + export http_proxy=http://:3128 + export https_proxy=http://:3128 + """, + formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument( + '--deploy', + action='store_true', + help="Deploy the Dell Enterprise Pretrained Model for Nvidia Platforms." + ) + parser.add_argument( + '--infer', + nargs='?', + const="What is Dell Technologies World?", + help="Run the inferencing. Optionally pass query string." 
+def main(): + """ + Main function to handle argument parsing and orchestrate the deployment, + inferencing, and deletion. This function processes command-line arguments + for deploying the model, running inferencing, or deleting the Kubernetes resources. + """ + parser = argparse.ArgumentParser( + description="""Automated Dell Enterprise Pretrained Model Deployment and Inferencing. + Prerequisites: + - Kubernetes (k8s) must be installed and configured on your cluster. + - An Nvidia GPU must be present in the kube node, and CUDA must be installed. + - Verify the PRETRAINED_MODEL_CONFIG section for any changes and update user_HF_token if required by the model. + - The Kube control plane needs an active Internet connection. If there is no active Internet connection, + you need to set the proxy environment variables to have access to the Internet. The proxy environment variables + should be set to the IP address of the Omnia Infrastructure Manager. For example: + export http_proxy=http://<oim_ip>:3128 + export https_proxy=http://<oim_ip>:3128 + """, + formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument( + '--deploy', + action='store_true', + help="Deploy the Dell Enterprise Pretrained Model for Nvidia Platforms." + ) + parser.add_argument( + '--infer', + nargs='?', + const="What is Dell Technologies World?", + help="Run the inferencing. Optionally pass a query string." + ) + parser.add_argument( + '--service-ip', + help="""Optional: Specify the External Load Balancer IP assigned to the pretrained service + for inferencing.""" + ) + parser.add_argument( + '--delete', + action='store_true', + help="Delete the pretrained model Kubernetes resources." + ) + args = parser.parse_args() + if args.deploy: + check_kubectl_availability() # Pre-requisite check + check_nvidia_device_plugin_availability() # Pre-requisite check + deploy_pretrained_model(SERVICE_NAME) + elif args.infer: + # Check if service IP is provided as an argument + if args.service_ip: + if is_valid_ip(args.service_ip): + service_ip = args.service_ip + else: + logging.error("Invalid IP address format: %s", args.service_ip) + sys.exit(1) + else: + # If no service IP is provided, retrieve it using the service name + service_ip = get_pretrained_model_service_ip(SERVICE_NAME) + if service_ip: + run_inferencing(service_ip, args.infer) + elif args.delete: + delete_pretrained_model_resources() + +if __name__ == "__main__": + main() diff --git a/examples/host_inventory_file b/examples/inventory/host_inventory_file similarity index 94% rename from examples/host_inventory_file rename to examples/inventory/host_inventory_file index 0202d5ae2..8a1ea3be2 100644 --- a/examples/host_inventory_file +++ b/examples/inventory/host_inventory_file @@ -3,9 +3,6 @@ [slurm_control_node] node1 -[slurmdbd] -node2 - [slurm_node] node3 node4 diff --git a/examples/ip_rule_inv_template b/examples/inventory/ip_rule_inv_template similarity index 100% rename from examples/ip_rule_inv_template rename to examples/inventory/ip_rule_inv_template diff --git a/examples/nfs_server_inventory_file b/examples/inventory/nfs_server_inventory_file similarity index 100% rename from examples/nfs_server_inventory_file rename to examples/inventory/nfs_server_inventory_file diff --git a/examples/server_spec_inv b/examples/inventory/server_spec_inv similarity index 50% rename from examples/server_spec_inv rename to examples/inventory/server_spec_inv index 56e8a4e69..fe56681c7 100644 --- a/examples/server_spec_inv +++ b/examples/inventory/server_spec_inv @@ -1,12 +1,12 @@ -[node-group1] +[cluster1] 10.5.0.3 -[node-group1:vars] +[cluster1:vars] Categories=group-1 -[node-group2] +[cluster2] 10.5.0.4 10.5.0.5 -[node-group2:vars] +[cluster2:vars] Categories=group-2 diff --git a/examples/TensorRT-InferenceServer/README.md b/examples/obsolete/TensorRT-InferenceServer/README.md similarity index 100% rename from examples/TensorRT-InferenceServer/README.md rename to examples/obsolete/TensorRT-InferenceServer/README.md diff --git a/examples/TensorRT-InferenceServer/trt-client.yaml b/examples/obsolete/TensorRT-InferenceServer/trt-client.yaml similarity index 100% rename from examples/TensorRT-InferenceServer/trt-client.yaml rename to examples/obsolete/TensorRT-InferenceServer/trt-client.yaml diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index 77153661f..d5f6c2dad 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -1,3 +1,3 @@ SERVICE_TAG,HOSTNAME,ADMIN_MAC,ADMIN_IP,BMC_IP -6XCVT4,n1,xx:yy:zz:aa:bb:cc,10.5.0.101,10.3.0.101 -V345H5,n2,aa:bb:cc:dd:ee:ff,10.5.0.102,10.3.0.102 \ No newline at end of file +ABCD12,n1,xx:yy:zz:aa:bb:cc,10.5.0.101,10.3.0.101 +ABCD34,n2,aa:bb:cc:dd:ee:ff,10.5.0.102,10.3.0.102
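The software_config.json examples that follow pin per-OS software stacks, and this changeset bumps several of them (amdgpu/rocm 6.0 to 6.2.2, k8s 1.26.12 to 1.29.5). A hedged sketch of a consistency check one might run over such a file before invoking Omnia (the check itself is illustrative, not part of the repo):

```python
# Illustrative sanity check for a software_config.json-style file.
# Assumptions: the schema shown below ("softwares" list plus optional
# per-software sections such as "amdgpu"); the expected versions mirror
# the values in this changeset.
import json

EXPECTED = {"k8s": "1.29.5", "amdgpu": "6.2.2"}

def check(path: str) -> None:
    with open(path) as f:
        cfg = json.load(f)
    versions = {s["name"]: s.get("version") for s in cfg["softwares"]}
    for name, want in EXPECTED.items():
        got = versions.get(name)
        status = "ok" if got == want else f"expected {want}"
        print(f"{name}: {got} ({status})")

check("examples/rhel_software_config.json")
```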
diff --git a/examples/rhel_software_config.json b/examples/rhel_software_config.json index 4831fd714..85c0e9ef0 100644 --- a/examples/rhel_software_config.json +++ b/examples/rhel_software_config.json @@ -3,20 +3,20 @@ "cluster_os_version": "8.8", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, + {"name": "amdgpu", "version": "6.2.2"}, {"name": "cuda", "version": "12.3.2"}, {"name": "ofed", "version": "24.01-0.3.3.1"}, {"name": "openldap"}, {"name": "nfs"}, {"name": "slurm"}, - {"name": "k8s", "version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "jupyter"}, {"name": "pytorch"}, {"name": "tensorflow"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } ], "pytorch": [ {"name": "pytorch_cpu"}, diff --git a/examples/rocky_software_config.json b/examples/rocky_software_config.json index 145d058b1..f81b5661d 100644 --- a/examples/rocky_software_config.json +++ b/examples/rocky_software_config.json @@ -3,20 +3,20 @@ "cluster_os_version": "8.8", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, + {"name": "amdgpu", "version": "6.2.2"}, {"name": "cuda", "version": "12.3.2"}, {"name": "ofed", "version": "24.01-0.3.3.1"}, {"name": "openldap"}, {"name": "nfs"}, {"name": "slurm"}, - {"name": "k8s", "version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "jupyter"}, {"name": "pytorch"}, {"name": "tensorflow"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } ], "pytorch": [ {"name": "pytorch_cpu"}, diff --git a/examples/template_rhel_software_config.json b/examples/software_config_template/template_rhel_software_config.json similarity index 82% rename from examples/template_rhel_software_config.json rename to examples/software_config_template/template_rhel_software_config.json index 464f89fdf..0a4073ee4 100644 --- a/examples/template_rhel_software_config.json +++ b/examples/software_config_template/template_rhel_software_config.json @@ -3,7 +3,7 @@ "cluster_os_version": "8.8", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, + {"name": "amdgpu", "version": "6.2.2"}, {"name": "cuda", "version": "12.3.2"}, {"name": "ofed", "version": "24.01-0.3.3.1"}, {"name": "freeipa"}, @@ -12,7 +12,7 @@ {"name": "nfs"}, {"name": "beegfs", "version": "7.4.2"}, {"name": "slurm"}, - {"name": "k8s", "version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "jupyter"}, {"name": "kubeflow"}, {"name": "kserve"}, @@ -24,11 +24,12 @@ {"name": "amd_benchmarks"}, {"name": "utils"}, {"name": "ucx", "version": "1.15.0"}, - {"name": "openmpi", "version": "4.1.6"} + {"name": "openmpi", "version": "4.1.6"}, + {"name": "csi_driver_powerscale", "version":"v2.11.0"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } ], "vllm": [ {"name": "vllm_amd"}, diff --git a/examples/template_rocky_software_config.json b/examples/software_config_template/template_rocky_software_config.json similarity index 82% rename from examples/template_rocky_software_config.json rename to examples/software_config_template/template_rocky_software_config.json index 4946a75e1..a3e20ffee 100644 --- a/examples/template_rocky_software_config.json +++ b/examples/software_config_template/template_rocky_software_config.json @@ -3,7 +3,7 @@ "cluster_os_version": "8.8", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, + {"name": "amdgpu", "version": "6.2.2"}, {"name": "cuda", "version": "12.3.2"}, {"name": "ofed", "version": "24.01-0.3.3.1"}, {"name": "freeipa"}, @@ -12,7 +12,7 @@ {"name": "nfs"}, {"name": "beegfs", "version": "7.4.2"}, {"name": "slurm"}, - {"name": "k8s",
"version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "jupyter"}, {"name": "kubeflow"}, {"name": "kserve"}, @@ -24,11 +24,12 @@ {"name": "amd_benchmarks"}, {"name": "utils"}, {"name": "ucx", "version": "1.15.0"}, - {"name": "openmpi", "version": "4.1.6"} + {"name": "openmpi", "version": "4.1.6"}, + {"name": "csi_driver_powerscale", "version":"v2.11.0"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } ], "vllm": [ {"name": "vllm_amd"}, diff --git a/examples/template_ubuntu_software_config.json b/examples/software_config_template/template_ubuntu_software_config.json similarity index 64% rename from examples/template_ubuntu_software_config.json rename to examples/software_config_template/template_ubuntu_software_config.json index 86c357d56..3bb1bbc01 100644 --- a/examples/template_ubuntu_software_config.json +++ b/examples/software_config_template/template_ubuntu_software_config.json @@ -3,15 +3,16 @@ "cluster_os_version": "22.04", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, - {"name": "bcm_roce", "version": "229.2.61.0"}, + {"name": "amdgpu", "version": "6.2.2"}, + {"name": "bcm_roce", "version": "230.2.54.0"}, + {"name": "intelgaudi", "version": "1.18.0-524"}, {"name": "cuda", "version": "12.3.2"}, {"name": "ofed", "version": "24.01-0.3.3.1"}, {"name": "openldap"}, {"name": "secure_login_node"}, {"name": "nfs"}, {"name": "beegfs", "version": "7.4.2"}, - {"name": "k8s", "version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "roce_plugin"}, {"name": "jupyter"}, {"name": "kubeflow"}, @@ -21,14 +22,18 @@ {"name": "vllm"}, {"name": "telemetry"}, {"name": "ucx", "version": "1.15.0"}, - {"name": "openmpi", "version": "4.1.6"} + {"name": "openmpi", "version": "4.1.6"}, + {"name": "csi_driver_powerscale", "version":"v2.11.0"} ], "bcm_roce": [ - {"name": "bcm_roce_libraries", "version": "229.2.61.0"} + {"name": "bcm_roce_libraries", "version": "230.2.54.0"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } + ], + "intelgaudi": [ + {"name": "habana"} ], "vllm": [ {"name": "vllm_amd"}, @@ -37,12 +42,13 @@ "pytorch": [ {"name": "pytorch_cpu"}, {"name": "pytorch_amd"}, - {"name": "pytorch_nvidia"} + {"name": "pytorch_nvidia"}, + {"name": "pytorch_gaudi"} ], "tensorflow": [ {"name": "tensorflow_cpu"}, {"name": "tensorflow_amd"}, {"name": "tensorflow_nvidia"} ] - -} \ No newline at end of file + +} diff --git a/examples/ubuntu_software_config.json b/examples/ubuntu_software_config.json index 76b294a5c..383c18401 100644 --- a/examples/ubuntu_software_config.json +++ b/examples/ubuntu_software_config.json @@ -3,27 +3,32 @@ "cluster_os_version": "22.04", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, - {"name": "bcm_roce", "version": "229.2.61.0"}, + {"name": "amdgpu", "version": "6.2.2"}, + {"name": "bcm_roce", "version": "230.2.54.0"}, {"name": "openldap"}, {"name": "nfs"}, - {"name": "k8s", "version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "roce_plugin"}, {"name": "jupyter"}, {"name": "pytorch"}, - {"name": "tensorflow"} + {"name": "tensorflow"}, + {"name": "intelgaudi", "version": "1.18.0-524"} ], "bcm_roce": [ - {"name": "bcm_roce_libraries", "version": "229.2.61.0"} + {"name": "bcm_roce_libraries", "version": "230.2.54.0"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } + ], + "intelgaudi": [ + {"name": "intel"} ], "pytorch": [ {"name": "pytorch_cpu"}, 
{"name": "pytorch_amd"}, - {"name": "pytorch_nvidia"} + {"name": "pytorch_nvidia"}, + {"name": "pytorch_gaudi"} ], "tensorflow": [ {"name": "tensorflow_cpu"}, @@ -31,4 +36,4 @@ {"name": "tensorflow_nvidia"} ] -} \ No newline at end of file +} diff --git a/input/accelerator_config.yml b/input/accelerator_config.yml index 2b1cafdb5..9cd7d80a9 100644 --- a/input/accelerator_config.yml +++ b/input/accelerator_config.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # This variable is used to download cuda toolkit file. # By default latest cuda is installed unless cuda_toolkit_path is specified. diff --git a/input/config/rhel/8.6/k8s.json b/input/config/rhel/8.6/k8s.json index 035b62418..94e23d59f 100644 --- a/input/config/rhel/8.6/k8s.json +++ b/input/config/rhel/8.6/k8s.json @@ -21,6 +21,11 @@ "type": "rpm", "repo_name": "docker-ce-repo" }, + { + "package": "nvidia-container-toolkit", + "type": "rpm", + "repo_name": "nvidia-repo" + }, { "package": "kubectl", "type": "tarball", diff --git a/input/config/rhel/8.6/secure_login_node.json b/input/config/rhel/8.6/secure_login_node.json index 15a02fc86..9c00d3249 100644 --- a/input/config/rhel/8.6/secure_login_node.json +++ b/input/config/rhel/8.6/secure_login_node.json @@ -28,7 +28,7 @@ "type": "pip_module" }, { - "package": "cryptography==41.0.7", + "package": "cryptography==44.0.0", "type": "pip_module" }, { diff --git a/input/config/rhel/8.6/vllm.json b/input/config/rhel/8.6/vllm.json index 966d478bc..d8c60c818 100644 --- a/input/config/rhel/8.6/vllm.json +++ b/input/config/rhel/8.6/vllm.json @@ -37,4 +37,4 @@ } -} \ No newline at end of file +} diff --git a/input/config/rhel/8.7/k8s.json b/input/config/rhel/8.7/k8s.json index 5f38bbef9..44c89c617 100644 --- a/input/config/rhel/8.7/k8s.json +++ b/input/config/rhel/8.7/k8s.json @@ -21,6 +21,11 @@ "type": "rpm", "repo_name": "docker-ce-repo" }, + { + "package": "nvidia-container-toolkit", + "type": "rpm", + "repo_name": "nvidia-repo" + }, { "package": "kubectl", "type": "tarball", diff --git a/input/config/rhel/8.7/secure_login_node.json b/input/config/rhel/8.7/secure_login_node.json index 9c5807013..85105ef7f 100644 --- a/input/config/rhel/8.7/secure_login_node.json +++ b/input/config/rhel/8.7/secure_login_node.json @@ -28,7 +28,7 @@ "type": "pip_module" }, { - "package": "cryptography==41.0.7", + "package": "cryptography==44.0.0", "type": "pip_module" }, { diff --git a/input/config/rhel/8.8/amdgpu.json b/input/config/rhel/8.8/amdgpu.json index 4a24aed95..1cf28d26d 100644 --- a/input/config/rhel/8.8/amdgpu.json +++ b/input/config/rhel/8.8/amdgpu.json @@ -8,7 +8,7 @@ }, "rocm": { "cluster": [ - {"package": "rocm-hip-sdk{{ rocm_version }}*", "type": "rpm", "repo_name": "rocm"} + {"package": "rocm", "type": "rpm", "repo_name": 
"rocm"} ] } } \ No newline at end of file diff --git a/input/config/rhel/8.8/csi_driver_powerscale.json b/input/config/rhel/8.8/csi_driver_powerscale.json new file mode 100644 index 000000000..afc8fe181 --- /dev/null +++ b/input/config/rhel/8.8/csi_driver_powerscale.json @@ -0,0 +1,84 @@ +{ + "csi_driver_powerscale": { + "cluster": [ + { + "package": "csi-powerscale", + "url": "https://github.com/dell/csi-powerscale.git", + "type": "git", + "version": "v2.11.0" + }, + { + "package": "external-snapshotter", + "url": "https://github.com/kubernetes-csi/external-snapshotter.git", + "type": "git", + "version": "v8.0.1" + }, + { + "package": "helm-charts", + "url": "https://github.com/dell/helm-charts.git", + "type": "git", + "version": "csi-isilon-2.11.0" + }, + { + "package": "docker.io/dellemc/csi-isilon", + "tag": "v2.11.0", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-attacher", + "tag": "v4.6.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-provisioner", + "tag": "v5.0.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-snapshotter", + "tag": "v8.0.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-resizer", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-node-driver-registrar", + "tag": "v2.10.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-external-health-monitor-controller", + "tag": "v0.12.1", + "type": "image" + }, + { + "package": "docker.io/dellemc/dell-csi-replicator", + "tag": "v1.9.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/podmon", + "tag": "v1.10.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csm-authorization-sidecar", + "tag": "v1.11.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csi-metadata-retriever", + "tag": "v1.8.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csm-encryption", + "tag": "v0.6.0", + "type": "image" + } + ] + } +} diff --git a/input/config/rhel/8.8/k8s.json b/input/config/rhel/8.8/k8s.json index f38c5c46b..d47adec70 100644 --- a/input/config/rhel/8.8/k8s.json +++ b/input/config/rhel/8.8/k8s.json @@ -2,59 +2,44 @@ "k8s": { "cluster": [ { - "package": "containerd.io-1.6.16-3.1.el8", + "package": "nvidia-container-toolkit", "type": "rpm", - "repo_name": "docker-ce-repo" + "repo_name": "nvidia-repo" }, { - "package": "docker-ce-cli-1:20.10.20-3.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-3:20.10.20-3.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-rootless-extras", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "kubectl", + "package": "kubectl-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubectl" }, { - "package": "kubelet", + "package": "kubelet-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubelet" }, { - "package": "kubeadm", + "package": "kubeadm-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubeadm" }, { - "package": "calicoctl-v3.25.2", + "package": "calicoctl-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" }, { - "package": "calicocrds-v3.25.2", 
+ "package": "calicocrds-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" }, { - "package": "cri-tools-v1.26.1", + "package": "cri-tools-v1.29.0", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" }, { - "package": "etcd-v3.5.10", + "package": "etcd-v3.5.12", "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" }, { "package": "cni-plugins-v1.3.0", @@ -62,24 +47,24 @@ "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" }, { - "package": "runc.amd64", + "package": "runc.amd64.v1.1.12", "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" }, { - "package": "nerdctl-v1.5.0", + "package": "nerdctl-v1.7.4", "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" }, { - "package": "containerd-1.7.5", + "package": "containerd-1.7.16", "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" }, { - "package": "helm-v3.12.3", + "package": "helm-v3.14.2", "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" }, { "package": "nvidia-device-plugin", @@ -92,7 +77,7 @@ "url": "https://raw.githubusercontent.com/ROCm/k8s-device-plugin/r1.16/k8s-ds-amdgpu-dp.yaml" }, { - "package": "mpi-operator", + "package": "mpi-operator-v0.4.0", "type": "manifest", "url": "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.4.0/deploy/v2beta1/mpi-operator.yaml" }, @@ -118,7 +103,7 @@ }, { "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", + "tag": "v1.11.1", "type": "image" }, { @@ -158,27 +143,27 @@ }, { "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", + "tag": "v3.5.12", "type": "image" }, { "package": "quay.io/calico/cni", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/node", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { @@ -241,11 +226,6 @@ "tag": "v0.8.2", "type": "image" }, - { - "package": "registry.k8s.io/nfd/node-feature-discovery", - "tag": "v0.12.1", - "type": "image" - }, { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", @@ -253,4 +233,4 @@ } ] } -} \ No newline at end of file +} diff --git a/input/config/rhel/8.8/kserve.json b/input/config/rhel/8.8/kserve.json index c2d2869dc..8632d3869 100644 --- a/input/config/rhel/8.8/kserve.json +++ 
b/input/config/rhel/8.8/kserve.json @@ -4,111 +4,111 @@ { "package": "istio", "type": "tarball", - "url": "https://github.com/istio/istio/releases/download/1.17.0/istio-1.17.0-linux-amd64.tar.gz" + "url": "https://github.com/istio/istio/releases/download/1.20.4/istio-1.20.4-linux-amd64.tar.gz" }, { "package": "docker.io/istio/proxyv2", - "tag": "1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "docker.io/istio/pilot", - "tag": "1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "knative_serving_crds_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-crds.yaml" + "url": "https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-crds.yaml" }, { "package": "knative_serving_core_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-core.yaml" + "url": "https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-core.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "987f53e3ead58627e3022c8ccbb199ed71b965f10c59485bab8015ecf18b44af", + "digest": "e52286fc4843470383e917abc9c1b0c8d10f585c4274c57b612279869bc86f0d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "6b98eed95dd6dcc3d957e673aea3d271b768225442504316d713c08524f44ebe", + "digest": "21f8e11a44bf1e260602d30e6762a3dc433c608d1dd0e309c0ff89728e71901d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "5b52cc9aa521ee236645db57f19b70f2a0e8f6ef27dfa9181409a0f96406e2ad", + "digest": "34796e9f760bb67065c6f101296513b38d04d39d11888e919692ac46fa6dc7c2", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "b11dbcba050eac9084edd021b7e0eee16b39c9e397b245bc4227266af1893404", + "digest": "53d9aa4d2c7a82f5a01202e386f7503b21839cbe2e5e62f1e9bda2aa5f11b518", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "7b138c73fcaaf0b9bb2d414b8a89a780f8c09371d24c6f57969be1694acf4aaa", + "digest": "700c69915dc7cd86dffb61c26b0ba34427fab809de1e3344589dd955b6440882", "type": "image" }, { "package": "knative_net_istio_manifest", "type": "manifest", - "url": "https://github.com/knative/net-istio/releases/download/knative-v1.11.0/net-istio.yaml" + "url": "https://github.com/knative/net-istio/releases/download/knative-v1.13.1/net-istio.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "27e7beb7c62036216fc464fb2181e56b030158ad4ceb57a7de172f54b4fe43db", + "digest": "a5b041ba3c9ea40198b2331617bd1571942961c1416ef683b4de8ef162755a88", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "0cdef272e39c57971ce9977765f164dd8e3abb9395a4f60e7a4160d57dcc09f2", + "digest": "f066376eee17505d14881b7635a7ca7531fce0f30cf968232fc0a93adc952ed5", "type": "image" }, { "package": "cert_manager_manifest", "type": "manifest", - "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml" + "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.14.5/cert-manager.yaml" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { 
"package": "quay.io/jetstack/cert-manager-webhook", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "kserve_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve.yaml" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/storage-initializer", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/router", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/agent", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { @@ -119,7 +119,7 @@ { "package": "kserve_runtimes_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve-runtimes.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve-cluster-resources.yaml" }, { "package": "docker.io/seldonio/mlserver", @@ -128,7 +128,7 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" } ] diff --git a/input/config/rhel/8.8/kubeflow.json b/input/config/rhel/8.8/kubeflow.json index e8e47c143..d040667bf 100644 --- a/input/config/rhel/8.8/kubeflow.json +++ b/input/config/rhel/8.8/kubeflow.json @@ -4,43 +4,47 @@ { "package": "kubeflow", "type": "git", - "url": "https://github.com/kubeflow/manifests.git" - , - "version": "v1.8.0" + "url": "https://github.com/kubeflow/manifests.git", + "version": "v1.9.1" }, { "package": "kustomize", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.0.3/kustomize_v5.0.3_linux_amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.4.3/kustomize_v5.4.3_linux_amd64.tar.gz" }, { "package": "ghcr.io/dexidp/dex", - "tag": "v2.36.0", + "tag": "v2.39.1", + "type": "image" + }, + { + "package": "ghcr.io/dexidp/dex", + "tag": "v2.35.0", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/apiserver_receive_adapter", - "digest": "828db8155996e40c13b77c1d039dba98153dcfcbe272248e92866bd7b6d6a17d", + "digest": "4ed3e39a11f4fc3358787433beaea4a9e72773ea7710bf4beb95aa8770515c9e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "c2994c2b6c2c7f38ad1b85c71789bf1753cc8979926423c83231e62258837cb9", + "digest": "ad42ddc9bc4e25fdc88c240d7cbfad4b2708eb7d26e07ae904d258011141116e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "8319aa662b4912e8175018bd7cc90c63838562a27515197b803bdcd5634c7007", + "digest": "66aa0dbceee62691d5327e423bbd7cbd411903747adeab61fdc81b14590793d4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "98a2cc7fd62ee95e137116504e7166c32c65efef42c3d1454630780410abf943", + "digest": "e5b7b6edd265b66d32f424bd245c06455154462ade6ce05698472212248d5657", "type": "image" }, { @@ -55,97 +59,97 @@ }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": 
"421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "eeff0ad31550f3ff519d988bb36bfe214e5b60c1ec4349c1f9bb2b2d8cad9479", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "4305209ce498caf783f39c8f3e85dfa635ece6947033bf50b0b627983fd65953", + "digest": "48aee2733721ecc77956abc5a2ca072853a669ebc97519beb48f7b3da8455e67", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "bc91e1fdaf3b67876ca33de1ce15b1268ed0ca8da203102b7699286fae97cf58", + "digest": "232d6ffd88dfc0d0ec02c6f3a95520283d076c16b77543cee04f4ef276e0b7ae", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "dabaecec38860ca4c972e6821d5dc825549faf50c6feb8feb4c04802f2338b8a", - "type": "image" - }, - { - "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "89e6f90141f1b63405883fbb4de0d3b6d80f8b77e530904c4d29bdcd1dc5a167", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "56780f69e6496bb4790b0c147deb652a2b020ff81e08d58cc58a61cd649b1121", + "digest": "d438c3ad2fcef3c7ea1b3abb910f5fa911c8a1466d6460ac0b11bf034797d6f6", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/mtchannel_broker", - "digest": "4040ffc2d34e950b7969b4ba90cec29e65e506126ddb195faf3a56cb2fa653e8", + "digest": "9dc9e0b00325f1ec994ef6f48761ba7d9217333fa0c2cbfccfa9b204e3f616a9", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/ingress", - "digest": "7f3b05f6e0abae19e9438fac44dd9938ddd2293014ef0fb8d388450c9ff63000", + "digest": "65412cf797d0bb7c7e22454431f57f8d9dcedf93620769f4c1206947acf05abb", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/filter", - "digest": "29bd9f43359153c0ea39cf382d5f25ca43f55abbbce3d802ca37cc4d5c4a6942", + "digest": "4e3cf0703024129c60b66529f41a1d29310f61f6aced24d25fd241e43b1a2e8e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_dispatcher", - "digest": "521234b4cff9d3cd32f8264cd7c830caa06f9982637b4866e983591fa1abc418", + "digest": "fa64db1ad126874f4e5ce1c17c2414b0fc3dde2a7e0db6fde939cafdbd4d96cd", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_controller", - "digest": "e004174a896811aec46520b1f2857f1973762389426bb0e0fc5d2332d5e36c7a", + "digest": "5386029f1fdcce1398dcca436864051a2f7eb5abed176453104f41b7b9b587f9", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": 
"gcr.io/knative-releases/knative.dev/eventing/cmd/mtping", - "digest": "6d35cc98baa098fc0c5b4290859e363a8350a9dadc31d1191b0b5c9796958223", + "digest": "9d74e8c69d671ad10fdfd84d33569fde5c16c9f95824ea288d2cb6fd69e32f4d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-webhook", - "tag": "v1.12.2", + "tag": "v1.14.5", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", "type": "image" }, { @@ -155,7 +159,7 @@ }, { "package": "docker.io/istio/pilot", - "tag": "1.17.5", + "tag": "1.22.1", "type": "image" }, { @@ -165,7 +169,7 @@ }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" }, @@ -179,9 +183,14 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/kubeflowkatib/pytorch-mnist", + "tag": "v1beta1-45c5727", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/jupyter-scipy", - "tag": "v1.6.1", + "tag": "v1.9.2", "type": "image" }, { @@ -189,6 +198,36 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/busybox", + "tag": "1.28", + "type": "image" + }, + { + "package": "docker.io/busybox", + "tag": "1.34.1", + "type": "image" + }, + { + "package": "docker.io/bentoml/fraud_detection", + "tag": "o5smnagbncigycvj", + "type": "image" + }, + { + "package": "docker.io/istio/install-cni", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/pilot", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", + "type": "image" + }, { "package": "gcr.io/kubeflow-images-public/profile-controller", "tag": "v20190228-v0.4.0-rc.1-192-g1a802656-dirty-f95773", @@ -196,7 +235,7 @@ }, { "package": "docker.io/seldonio/seldon-core-operator", - "tag": "1.17.1", + "tag": "1.18.1", "type": "image" }, { @@ -206,7 +245,7 @@ }, { "package": "docker.io/rayproject/ray", - "tag": "2.2.0-py38-cpu", + "tag": "2.23.0-py311-cpu", "type": "image" }, { @@ -216,12 +255,22 @@ }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, + { + "package": "docker.io/kserve/huggingfaceserver", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/storage-initializer", + "tag": "v0.13.1", + "type": "image" + }, { "package": "docker.io/kserve/xgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -231,7 +280,7 @@ }, { "package": "docker.io/pytorch/torchserve-kfs", - "tag": "0.8.2", + "tag": "0.9.0", "type": "image" }, { @@ -241,17 +290,17 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/pmmlserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/paddleserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -261,7 +310,7 @@ }, { "package": 
"docker.io/kserve/lgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -271,14 +320,24 @@ }, { "package": "quay.io/bentoml/yatai-image-builder", - "tag": "1.1.3", + "tag": "1.2.28", "type": "image" }, + { + "package": "quay.io/bentoml/yatai-deployment", + "tag": "1.1.21", + "type": "image" + }, { "package": "quay.io/oauth2-proxy/oauth2-proxy", "tag": "latest", "type": "image" }, + { + "package": "quay.io/oauth2-proxy/oauth2-proxy", + "tag": "v7.6.0", + "type": "image" + }, { "package": "docker.io/prom/prometheus", "tag": "latest", @@ -306,9 +365,24 @@ }, { "package": "docker.io/postgres", - "tag": "12-alpine", + "tag": "14.5-alpine", "type": "image" }, + { + "package": "docker.io/postgres", + "tag": "12-alpine", + "type": "image" + }, + { + "package": "docker.io/postgres", + "tag": "14.7-alpine", + "type": "image" + }, + { + "package": "docker.io/postgres", + "tag": "latest", + "type": "image" + }, { "package": "quay.io/argoproj/argocli", "tag": "latest", @@ -336,7 +410,7 @@ }, { "package": "gcr.io/tfx-oss-public/ml_metadata_store_server", - "tag": "1.5.0", + "tag": "1.14.0", "type": "image" }, { @@ -349,9 +423,14 @@ "tag": "nightly", "type": "image" }, + { + "package": "quay.io/aipipeline/pipelineloop-controller", + "tag": "nightly", + "type": "image" + }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.4.0", + "tag": "v0.11.0", "type": "image" }, { @@ -361,87 +440,87 @@ }, { "package": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/pytorch-mnist-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/file-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/tfevent-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-hyperopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-hyperband", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-skopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-goptuna", - "tag": "v0.16.0-rc.1", - "type": "image" - }, - { - "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-enas", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-darts", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-pbt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/earlystopping-medianstop", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", "type": "image" }, { "package": "gcr.io/ml-pipeline/frontend", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/visualization-server", - "tag": "2.0.3", + "tag": "2.3.0", + "type": "image" + }, + { + "package": "docker.io/kubeflownotebookswg/poddefaults-webhook", + 
"tag": "v1.9.2", "type": "image" }, { @@ -451,17 +530,17 @@ }, { "package": "gcr.io/ml-pipeline/cache-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/centraldashboard", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/jupyter-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -469,6 +548,11 @@ "tag": "8.0.26", "type": "image" }, + { + "package": "docker.io/kubeflowkatib/katib-controller", + "tag": "v0.17.0", + "type": "image" + }, { "package": "docker.io/kubeflowkatib/katib-controller", "tag": "v0.16.0", @@ -476,7 +560,7 @@ }, { "package": "docker.io/kubeflowkatib/katib-db-manager", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { @@ -486,17 +570,22 @@ }, { "package": "docker.io/kubeflowkatib/katib-ui", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/models-web-app", + "tag": "v0.13.0-rc.0", "type": "image" }, { "package": "docker.io/kserve/models-web-app", - "tag": "v0.10.0", + "tag": "latest", "type": "image" }, { @@ -511,7 +600,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-envoy", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -521,7 +610,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-writer", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -531,52 +620,52 @@ }, { "package": "gcr.io/ml-pipeline/api-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/persistenceagent", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/scheduledworkflow", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboard-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflow/training-operator", - "tag": "v1-855e096", + "tag": "v1-04f9f13", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/notebook-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/kfam", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/profile-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.13.1", + "tag": "v0.4.0", "type": "image" }, { @@ -586,17 +675,12 @@ }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", - "type": "image" - }, - { - "package": "docker.io/mariadb", - "tag": "latest", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/pvcviewer-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -604,16 +688,26 @@ "tag": "v3.3.10-license-compliance", "type": "image" }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.3.8-license-compliance", + "type": "image" + }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.4.17-license-compliance", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/volumes-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/ml-pipeline/viewer-crd-controller", - "tag": "2.0.3", + "tag": "2.3.0", 
"type": "image" } ] } - } + } \ No newline at end of file diff --git a/input/config/rhel/8.8/openldap.json b/input/config/rhel/8.8/openldap.json index 053f90f51..592788650 100644 --- a/input/config/rhel/8.8/openldap.json +++ b/input/config/rhel/8.8/openldap.json @@ -8,7 +8,8 @@ { "package": "ansible-role-ldaptoolbox-openldap", "type": "git", "url": "https://github.com/ltb-project/ansible-role-ldaptoolbox-openldap.git", - "version": "main" + "version": "main", + "commit": "695a689ff91a83b47fbc6f575be37e1f811bd719" } ] } diff --git a/input/config/rhel/8.8/secure_login_node.json b/input/config/rhel/8.8/secure_login_node.json index 9c5807013..38cd25bfe 100644 --- a/input/config/rhel/8.8/secure_login_node.json +++ b/input/config/rhel/8.8/secure_login_node.json @@ -22,17 +22,17 @@ {"package": "wget", "type": "rpm", "repo_name": "appstream"}, {"package": "psacct", "type": "rpm", "repo_name": "baseos"}, {"package": "psacct", "type": "rpm", "repo_name": "baseos"}, - {"package": "python3.9", "type": "rpm", "repo_name": "appstream"}, - { - "package": "ansible==7.7.0", + {"package": "python3.11", "type": "rpm", "repo_name": "appstream"}, + { + "package": "ansible==9.5.1", "type": "pip_module" }, - { - "package": "cryptography==41.0.7", + { + "package": "cryptography==44.0.0", "type": "pip_module" }, - { - "package": "jinja2==3.1.2", + { + "package": "jinja2==3.1.2", "type": "pip_module" } diff --git a/input/config/rhel/8.8/telemetry.json b/input/config/rhel/8.8/telemetry.json index 60a108fcd..70eac1bb4 100644 --- a/input/config/rhel/8.8/telemetry.json +++ b/input/config/rhel/8.8/telemetry.json @@ -1,211 +1,248 @@ { - "telemetry": { - "cluster": [ - { + "telemetry": { + "cluster": [ + { "package": "buildkit", "type": "git", "url": "https://github.com/moby/buildkit.git", "version": "v0.13.1" - }, - { "package": "smartmontools", - "type": "rpm", - "repo_name": "baseos"}, - { - "package": "containerd.io-1.6.16-3.1.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-cli-1:20.10.20-3.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-3:20.10.20-3.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-rootless-extras", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "kubespray", - "type": "git", - "url": "https://github.com/kubernetes-sigs/kubespray.git", - "version": "release-2.23" - }, - { - "package": "kubectl", - "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubectl" - }, - { - "package": "kubelet", - "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubelet" - }, - { - "package": "kubeadm", - "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubeadm" - }, - { - "package": "calicoctl-v3.25.2", - "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" - }, - { - "package": "calicocrds-v3.25.2", - "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" - }, - { - "package": "cri-tools-v1.26.1", - "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" - }, - { - "package": "etcd-v3.5.10", - "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" - }, - { - "package": "cni-plugins-v1.3.0", - "type": "tarball", - "url": 
"https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" - }, - { - "package": "runc.amd64", - "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" - }, - { - "package": "nerdctl-v1.5.0", - "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" - }, - { - "package": "containerd-1.7.5", - "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" - }, - { - "package": "helm-v3.12.3", - "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" - }, - { - "package": "nfs-subdir-external-provisioner-4.0.18", - "type": "tarball", - "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" - }, - { - "package": "docker.io/library/nginx", - "tag": "1.25.2-alpine", - "type": "image" - }, - { - "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", - "type": "image" - }, - { - "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", - "tag": "v1.8.8", - "type": "image" - }, - { - "package": "registry.k8s.io/dns/k8s-dns-node-cache", - "tag": "1.22.28", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-apiserver", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-controller-manager", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-proxy", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-scheduler", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/pause", - "tag": "3.9", - "type": "image" - }, - { - "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", - "type": "image" - }, - { - "package": "quay.io/calico/cni", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/node", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "docker.io/flannel/flannel-cni-plugin", - "tag": "v1.1.2", - "type": "image" - }, - { - "package": "docker.io/flannel/flannel", - "tag": "v0.22.0", - "type": "image" - }, - { - "package": "quay.io/metallb/speaker", - "tag": "v0.13.9", - "type": "image" - }, - { - "package": "quay.io/metallb/controller", - "tag": "v0.13.9", - "type": "image" - }, - { - "package": "docker.io/kubernetesui/dashboard", - "tag": "v2.2.0", - "type": "image" - }, - { - "package": "docker.io/kubernetesui/metrics-scraper", - "tag": "v1.0.6", - "type": "image" - }, - { - "package": "docker.io/grafana/grafana-enterprise", - "tag": "8.3.2", - "type": "image" - } - ] - } + }, + { + "package": "smartmontools", + "type": "rpm", + "repo_name": "baseos" + }, + { + "package": "docker-ce-cli-1:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-3:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-rootless-extras", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "kubespray-v2.25.0", + "type": "git", + "url": "https://github.com/kubernetes-sigs/kubespray.git", + "version": "v2.25.0" + }, + { + "package": "kubectl-1.29.5", + "type": "tarball", + 
"url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" + }, + { + "package": "kubelet-1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" + }, + { + "package": "kubeadm-1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" + }, + { + "package": "calicoctl-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" + }, + { + "package": "calicocrds-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" + }, + { + "package": "cri-tools-v1.29.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" + }, + { + "package": "etcd-v3.5.12", + "type": "tarball", + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" + }, + { + "package": "cni-plugins-v1.3.0", + "type": "tarball", + "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" + }, + { + "package": "runc.amd64-v1.1.12", + "type": "tarball", + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" + }, + { + "package": "nerdctl-v1.7.4", + "type": "tarball", + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + }, + { + "package": "containerd-1.7.16", + "type": "tarball", + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" + }, + { + "package": "helm-v3.14.2", + "type": "tarball", + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, + { + "package": "docker.io/library/nginx", + "tag": "1.25.2-alpine", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", + "tag": "v1.8.8", + "type": "image" + }, + { + "package": "registry.k8s.io/dns/k8s-dns-node-cache", + "tag": "1.22.28", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/pause", + "tag": "3.9", + "type": "image" + }, + { + "package": "quay.io/coreos/etcd", + "tag": "v3.5.12", + "type": "image" + }, + { + "package": "quay.io/calico/cni", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/kube-controllers", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/pod2daemon-flexvol", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/node", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel-cni-plugin", + "tag": "v1.1.2", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel", + "tag": "v0.22.0", + "type": "image" + }, + { + "package": 
"quay.io/metallb/speaker", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "quay.io/metallb/controller", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/dashboard", + "tag": "v2.7.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/metrics-scraper", + "tag": "v1.0.8", + "type": "image" + }, + { + "package": "docker.io/grafana/grafana-enterprise", + "tag": "8.3.2", + "type": "image" + }, + { + "package": "kube-prometheus-stack-62.3.0", + "type": "tarball", + "url": "https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-62.3.0/kube-prometheus-stack-62.3.0.tgz" + }, + { + "package": "quay.io/prometheus-operator/prometheus-operator", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-state-metrics/kube-state-metrics", + "tag": "v2.13.0", + "type": "image" + }, + { + "package": "quay.io/prometheus-operator/prometheus-config-reloader", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/alertmanager", + "tag": "v0.27.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/node-exporter", + "tag": "v1.8.2", + "type": "image" + }, + { + "package": "quay.io/prometheus/prometheus", + "tag": "v2.54.0", + "type": "image" + }, + { + "package": "registry.k8s.io/ingress-nginx/kube-webhook-certgen", + "tag": "v20221220-controller-v1.5.1-58-g787ea74b6", + "type": "image" + } + ] + } } diff --git a/input/config/rhel/8.8/vllm.json b/input/config/rhel/8.8/vllm.json index 966d478bc..f8d95a830 100644 --- a/input/config/rhel/8.8/vllm.json +++ b/input/config/rhel/8.8/vllm.json @@ -1,40 +1,44 @@ -{ +{ - "vllm": { + "vllm": { "cluster": [] }, - "vllm_amd": { + "vllm_amd": { - "cluster": [ - { - "package": "docker.io/embeddedllminfo/vllm-rocm", - "tag": "vllm-v0.2.4", - "type": "image" + "cluster": [ + { + "package": "docker.io/embeddedllminfo/vllm-rocm", + "tag": "vllm-v0.2.4", + "type": "image" } ] }, - "vllm_nvidia": { + "vllm_nvidia": { - "cluster": [ - { - "package": "python3.9", + "cluster": [ + { + "package": "python3.11", "type": "rpm", "repo_name": "appstream" - - }, - { - "package": "https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp39-cp39-linux_x86_64.whl", + + }, + { + "package": "https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp311-cp311-linux_x86_64.whl", + "type": "pip_module" + }, + { + "package": "https://github.com/vllm-project/vllm/releases/download/v0.4.0/vllm-0.4.0-cp311-cp311-manylinux1_x86_64.whl", "type": "pip_module" }, - { - "package": "https://github.com/vllm-project/vllm/releases/download/v0.4.0/vllm-0.4.0-cp39-cp39-manylinux1_x86_64.whl", + { + "package": "numpy<2", "type": "pip_module" } ] } -} \ No newline at end of file +} diff --git a/input/config/rocky/8.6/k8s.json b/input/config/rocky/8.6/k8s.json index 035b62418..94e23d59f 100644 --- a/input/config/rocky/8.6/k8s.json +++ b/input/config/rocky/8.6/k8s.json @@ -21,6 +21,11 @@ "type": "rpm", "repo_name": "docker-ce-repo" }, + { + "package": "nvidia-container-toolkit", + "type": "rpm", + "repo_name": "nvidia-repo" + }, { "package": "kubectl", "type": "tarball", diff --git a/input/config/rocky/8.6/secure_login_node.json b/input/config/rocky/8.6/secure_login_node.json index 15a02fc86..9c00d3249 100644 --- a/input/config/rocky/8.6/secure_login_node.json +++ b/input/config/rocky/8.6/secure_login_node.json @@ -28,7 +28,7 @@ "type": "pip_module" }, { - "package": "cryptography==41.0.7", + "package": "cryptography==44.0.0", "type": "pip_module" }, { 
diff --git a/input/config/rocky/8.6/vllm.json b/input/config/rocky/8.6/vllm.json index fe9d6a598..2559dece9 100644 --- a/input/config/rocky/8.6/vllm.json +++ b/input/config/rocky/8.6/vllm.json @@ -37,4 +37,4 @@ } -} \ No newline at end of file +} diff --git a/input/config/rocky/8.7/k8s.json b/input/config/rocky/8.7/k8s.json index 5f38bbef9..44c89c617 100644 --- a/input/config/rocky/8.7/k8s.json +++ b/input/config/rocky/8.7/k8s.json @@ -21,6 +21,11 @@ "type": "rpm", "repo_name": "docker-ce-repo" }, + { + "package": "nvidia-container-toolkit", + "type": "rpm", + "repo_name": "nvidia-repo" + }, { "package": "kubectl", "type": "tarball", diff --git a/input/config/rocky/8.7/secure_login_node.json b/input/config/rocky/8.7/secure_login_node.json index e20d0a2dd..e7465e93c 100644 --- a/input/config/rocky/8.7/secure_login_node.json +++ b/input/config/rocky/8.7/secure_login_node.json @@ -28,7 +28,7 @@ "type": "pip_module" }, { - "package": "cryptography==41.0.7", + "package": "cryptography==44.0.0", "type": "pip_module" }, { diff --git a/input/config/rocky/8.8/amdgpu.json b/input/config/rocky/8.8/amdgpu.json index 4a24aed95..1cf28d26d 100644 --- a/input/config/rocky/8.8/amdgpu.json +++ b/input/config/rocky/8.8/amdgpu.json @@ -8,7 +8,7 @@ }, "rocm": { "cluster": [ - {"package": "rocm-hip-sdk{{ rocm_version }}*", "type": "rpm", "repo_name": "rocm"} + {"package": "rocm", "type": "rpm", "repo_name": "rocm"} ] } } \ No newline at end of file diff --git a/input/config/rocky/8.8/csi_driver_powerscale.json b/input/config/rocky/8.8/csi_driver_powerscale.json new file mode 100644 index 000000000..afc8fe181 --- /dev/null +++ b/input/config/rocky/8.8/csi_driver_powerscale.json @@ -0,0 +1,84 @@ +{ + "csi_driver_powerscale": { + "cluster": [ + { + "package": "csi-powerscale", + "url": "https://github.com/dell/csi-powerscale.git", + "type": "git", + "version": "v2.11.0" + }, + { + "package": "external-snapshotter", + "url": "https://github.com/kubernetes-csi/external-snapshotter.git", + "type": "git", + "version": "v8.0.1" + }, + { + "package": "helm-charts", + "url": "https://github.com/dell/helm-charts.git", + "type": "git", + "version": "csi-isilon-2.11.0" + }, + { + "package": "docker.io/dellemc/csi-isilon", + "tag": "v2.11.0", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-attacher", + "tag": "v4.6.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-provisioner", + "tag": "v5.0.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-snapshotter", + "tag": "v8.0.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-resizer", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-node-driver-registrar", + "tag": "v2.10.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-external-health-monitor-controller", + "tag": "v0.12.1", + "type": "image" + }, + { + "package": "docker.io/dellemc/dell-csi-replicator", + "tag": "v1.9.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/podmon", + "tag": "v1.10.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csm-authorization-sidecar", + "tag": "v1.11.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csi-metadata-retriever", + "tag": "v1.8.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csm-encryption", + "tag": "v0.6.0", + "type": "image" + } + ] + } +} diff --git a/input/config/rocky/8.8/k8s.json b/input/config/rocky/8.8/k8s.json index 5f38bbef9..d47adec70 100644 --- 
a/input/config/rocky/8.8/k8s.json +++ b/input/config/rocky/8.8/k8s.json @@ -2,59 +2,44 @@ "k8s": { "cluster": [ { - "package": "containerd.io-1.6.16-3.1.el8", + "package": "nvidia-container-toolkit", "type": "rpm", - "repo_name": "docker-ce-repo" + "repo_name": "nvidia-repo" }, { - "package": "docker-ce-cli-1:20.10.20-3.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-3:20.10.20-3.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-rootless-extras", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "kubectl", + "package": "kubectl-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubectl" }, { - "package": "kubelet", + "package": "kubelet-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubelet" }, { - "package": "kubeadm", + "package": "kubeadm-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubeadm" }, { - "package": "calicoctl-v3.25.2", + "package": "calicoctl-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" }, { - "package": "calicocrds-v3.25.2", + "package": "calicocrds-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" }, { - "package": "cri-tools-v1.26.1", + "package": "cri-tools-v1.29.0", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" }, { - "package": "etcd-v3.5.10", + "package": "etcd-v3.5.12", "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" }, { "package": "cni-plugins-v1.3.0", @@ -62,24 +47,24 @@ "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" }, { - "package": "runc.amd64", + "package": "runc.amd64.v1.1.12", "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" }, { - "package": "nerdctl-v1.5.0", + "package": "nerdctl-v1.7.4", "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" }, { - "package": "containerd-1.7.5", + "package": "containerd-1.7.16", "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" }, { - "package": "helm-v3.12.3", + "package": "helm-v3.14.2", "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" }, { "package": "nvidia-device-plugin", @@ -92,7 +77,7 @@ "url": 
"https://raw.githubusercontent.com/ROCm/k8s-device-plugin/r1.16/k8s-ds-amdgpu-dp.yaml" }, { - "package": "mpi-operator", + "package": "mpi-operator-v0.4.0", "type": "manifest", "url": "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.4.0/deploy/v2beta1/mpi-operator.yaml" }, @@ -118,7 +103,7 @@ }, { "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", + "tag": "v1.11.1", "type": "image" }, { @@ -158,27 +143,27 @@ }, { "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", + "tag": "v3.5.12", "type": "image" }, { "package": "quay.io/calico/cni", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/node", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { @@ -241,11 +226,6 @@ "tag": "v0.8.2", "type": "image" }, - { - "package": "registry.k8s.io/nfd/node-feature-discovery", - "tag": "v0.12.1", - "type": "image" - }, { "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", "tag": "v4.0.2", diff --git a/input/config/rocky/8.8/kserve.json b/input/config/rocky/8.8/kserve.json index c2d2869dc..8632d3869 100644 --- a/input/config/rocky/8.8/kserve.json +++ b/input/config/rocky/8.8/kserve.json @@ -4,111 +4,111 @@ { "package": "istio", "type": "tarball", - "url": "https://github.com/istio/istio/releases/download/1.17.0/istio-1.17.0-linux-amd64.tar.gz" + "url": "https://github.com/istio/istio/releases/download/1.20.4/istio-1.20.4-linux-amd64.tar.gz" }, { "package": "docker.io/istio/proxyv2", - "tag": "1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "docker.io/istio/pilot", - "tag": "1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "knative_serving_crds_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-crds.yaml" + "url": "https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-crds.yaml" }, { "package": "knative_serving_core_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-core.yaml" + "url": "https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-core.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "987f53e3ead58627e3022c8ccbb199ed71b965f10c59485bab8015ecf18b44af", + "digest": "e52286fc4843470383e917abc9c1b0c8d10f585c4274c57b612279869bc86f0d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "6b98eed95dd6dcc3d957e673aea3d271b768225442504316d713c08524f44ebe", + "digest": "21f8e11a44bf1e260602d30e6762a3dc433c608d1dd0e309c0ff89728e71901d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "5b52cc9aa521ee236645db57f19b70f2a0e8f6ef27dfa9181409a0f96406e2ad", + "digest": "34796e9f760bb67065c6f101296513b38d04d39d11888e919692ac46fa6dc7c2", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "b11dbcba050eac9084edd021b7e0eee16b39c9e397b245bc4227266af1893404", + "digest": "53d9aa4d2c7a82f5a01202e386f7503b21839cbe2e5e62f1e9bda2aa5f11b518", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "7b138c73fcaaf0b9bb2d414b8a89a780f8c09371d24c6f57969be1694acf4aaa", + 
"digest": "700c69915dc7cd86dffb61c26b0ba34427fab809de1e3344589dd955b6440882", "type": "image" }, { "package": "knative_net_istio_manifest", "type": "manifest", - "url": "https://github.com/knative/net-istio/releases/download/knative-v1.11.0/net-istio.yaml" + "url": "https://github.com/knative/net-istio/releases/download/knative-v1.13.1/net-istio.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "27e7beb7c62036216fc464fb2181e56b030158ad4ceb57a7de172f54b4fe43db", + "digest": "a5b041ba3c9ea40198b2331617bd1571942961c1416ef683b4de8ef162755a88", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "0cdef272e39c57971ce9977765f164dd8e3abb9395a4f60e7a4160d57dcc09f2", + "digest": "f066376eee17505d14881b7635a7ca7531fce0f30cf968232fc0a93adc952ed5", "type": "image" }, { "package": "cert_manager_manifest", "type": "manifest", - "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml" + "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.14.5/cert-manager.yaml" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-webhook", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "kserve_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve.yaml" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/storage-initializer", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/router", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/agent", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { @@ -119,7 +119,7 @@ { "package": "kserve_runtimes_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve-runtimes.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve-cluster-resources.yaml" }, { "package": "docker.io/seldonio/mlserver", @@ -128,7 +128,7 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" } ] diff --git a/input/config/rocky/8.8/kubeflow.json b/input/config/rocky/8.8/kubeflow.json index e8e47c143..d040667bf 100644 --- a/input/config/rocky/8.8/kubeflow.json +++ b/input/config/rocky/8.8/kubeflow.json @@ -4,43 +4,47 @@ { "package": "kubeflow", "type": "git", - "url": "https://github.com/kubeflow/manifests.git" - , - "version": "v1.8.0" + "url": "https://github.com/kubeflow/manifests.git", + "version": "v1.9.1" }, { "package": "kustomize", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.0.3/kustomize_v5.0.3_linux_amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.4.3/kustomize_v5.4.3_linux_amd64.tar.gz" }, { "package": "ghcr.io/dexidp/dex", - "tag": "v2.36.0", + "tag": "v2.39.1", + "type": "image" + }, + { + "package": "ghcr.io/dexidp/dex", + "tag": "v2.35.0", "type": "image" }, { "package": 
"gcr.io/knative-releases/knative.dev/eventing/cmd/apiserver_receive_adapter", - "digest": "828db8155996e40c13b77c1d039dba98153dcfcbe272248e92866bd7b6d6a17d", + "digest": "4ed3e39a11f4fc3358787433beaea4a9e72773ea7710bf4beb95aa8770515c9e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "c2994c2b6c2c7f38ad1b85c71789bf1753cc8979926423c83231e62258837cb9", + "digest": "ad42ddc9bc4e25fdc88c240d7cbfad4b2708eb7d26e07ae904d258011141116e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "8319aa662b4912e8175018bd7cc90c63838562a27515197b803bdcd5634c7007", + "digest": "66aa0dbceee62691d5327e423bbd7cbd411903747adeab61fdc81b14590793d4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "98a2cc7fd62ee95e137116504e7166c32c65efef42c3d1454630780410abf943", + "digest": "e5b7b6edd265b66d32f424bd245c06455154462ade6ce05698472212248d5657", "type": "image" }, { @@ -55,97 +59,97 @@ }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "eeff0ad31550f3ff519d988bb36bfe214e5b60c1ec4349c1f9bb2b2d8cad9479", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "4305209ce498caf783f39c8f3e85dfa635ece6947033bf50b0b627983fd65953", + "digest": "48aee2733721ecc77956abc5a2ca072853a669ebc97519beb48f7b3da8455e67", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "bc91e1fdaf3b67876ca33de1ce15b1268ed0ca8da203102b7699286fae97cf58", + "digest": "232d6ffd88dfc0d0ec02c6f3a95520283d076c16b77543cee04f4ef276e0b7ae", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "dabaecec38860ca4c972e6821d5dc825549faf50c6feb8feb4c04802f2338b8a", - "type": "image" - }, - { - "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "89e6f90141f1b63405883fbb4de0d3b6d80f8b77e530904c4d29bdcd1dc5a167", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "56780f69e6496bb4790b0c147deb652a2b020ff81e08d58cc58a61cd649b1121", + "digest": "d438c3ad2fcef3c7ea1b3abb910f5fa911c8a1466d6460ac0b11bf034797d6f6", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/mtchannel_broker", - "digest": "4040ffc2d34e950b7969b4ba90cec29e65e506126ddb195faf3a56cb2fa653e8", + "digest": "9dc9e0b00325f1ec994ef6f48761ba7d9217333fa0c2cbfccfa9b204e3f616a9", "type": "image" }, 
{ "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/ingress", - "digest": "7f3b05f6e0abae19e9438fac44dd9938ddd2293014ef0fb8d388450c9ff63000", + "digest": "65412cf797d0bb7c7e22454431f57f8d9dcedf93620769f4c1206947acf05abb", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/filter", - "digest": "29bd9f43359153c0ea39cf382d5f25ca43f55abbbce3d802ca37cc4d5c4a6942", + "digest": "4e3cf0703024129c60b66529f41a1d29310f61f6aced24d25fd241e43b1a2e8e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_dispatcher", - "digest": "521234b4cff9d3cd32f8264cd7c830caa06f9982637b4866e983591fa1abc418", + "digest": "fa64db1ad126874f4e5ce1c17c2414b0fc3dde2a7e0db6fde939cafdbd4d96cd", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_controller", - "digest": "e004174a896811aec46520b1f2857f1973762389426bb0e0fc5d2332d5e36c7a", + "digest": "5386029f1fdcce1398dcca436864051a2f7eb5abed176453104f41b7b9b587f9", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/mtping", - "digest": "6d35cc98baa098fc0c5b4290859e363a8350a9dadc31d1191b0b5c9796958223", + "digest": "9d74e8c69d671ad10fdfd84d33569fde5c16c9f95824ea288d2cb6fd69e32f4d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-webhook", - "tag": "v1.12.2", + "tag": "v1.14.5", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", "type": "image" }, { @@ -155,7 +159,7 @@ }, { "package": "docker.io/istio/pilot", - "tag": "1.17.5", + "tag": "1.22.1", "type": "image" }, { @@ -165,7 +169,7 @@ }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" }, @@ -179,9 +183,14 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/kubeflowkatib/pytorch-mnist", + "tag": "v1beta1-45c5727", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/jupyter-scipy", - "tag": "v1.6.1", + "tag": "v1.9.2", "type": "image" }, { @@ -189,6 +198,36 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/busybox", + "tag": "1.28", + "type": "image" + }, + { + "package": "docker.io/busybox", + "tag": "1.34.1", + "type": "image" + }, + { + "package": "docker.io/bentoml/fraud_detection", + "tag": "o5smnagbncigycvj", + "type": "image" + }, + { + "package": "docker.io/istio/install-cni", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/pilot", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", + "type": "image" + }, { "package": 
"gcr.io/kubeflow-images-public/profile-controller", "tag": "v20190228-v0.4.0-rc.1-192-g1a802656-dirty-f95773", @@ -196,7 +235,7 @@ }, { "package": "docker.io/seldonio/seldon-core-operator", - "tag": "1.17.1", + "tag": "1.18.1", "type": "image" }, { @@ -206,7 +245,7 @@ }, { "package": "docker.io/rayproject/ray", - "tag": "2.2.0-py38-cpu", + "tag": "2.23.0-py311-cpu", "type": "image" }, { @@ -216,12 +255,22 @@ }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, + { + "package": "docker.io/kserve/huggingfaceserver", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/storage-initializer", + "tag": "v0.13.1", + "type": "image" + }, { "package": "docker.io/kserve/xgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -231,7 +280,7 @@ }, { "package": "docker.io/pytorch/torchserve-kfs", - "tag": "0.8.2", + "tag": "0.9.0", "type": "image" }, { @@ -241,17 +290,17 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/pmmlserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/paddleserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -261,7 +310,7 @@ }, { "package": "docker.io/kserve/lgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -271,14 +320,24 @@ }, { "package": "quay.io/bentoml/yatai-image-builder", - "tag": "1.1.3", + "tag": "1.2.28", "type": "image" }, + { + "package": "quay.io/bentoml/yatai-deployment", + "tag": "1.1.21", + "type": "image" + }, { "package": "quay.io/oauth2-proxy/oauth2-proxy", "tag": "latest", "type": "image" }, + { + "package": "quay.io/oauth2-proxy/oauth2-proxy", + "tag": "v7.6.0", + "type": "image" + }, { "package": "docker.io/prom/prometheus", "tag": "latest", @@ -306,9 +365,24 @@ }, { "package": "docker.io/postgres", - "tag": "12-alpine", + "tag": "14.5-alpine", "type": "image" }, + { + "package": "docker.io/postgres", + "tag": "12-alpine", + "type": "image" + }, + { + "package": "docker.io/postgres", + "tag": "14.7-alpine", + "type": "image" + }, + { + "package": "docker.io/postgres", + "tag": "latest", + "type": "image" + }, { "package": "quay.io/argoproj/argocli", "tag": "latest", @@ -336,7 +410,7 @@ }, { "package": "gcr.io/tfx-oss-public/ml_metadata_store_server", - "tag": "1.5.0", + "tag": "1.14.0", "type": "image" }, { @@ -349,9 +423,14 @@ "tag": "nightly", "type": "image" }, + { + "package": "quay.io/aipipeline/pipelineloop-controller", + "tag": "nightly", + "type": "image" + }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.4.0", + "tag": "v0.11.0", "type": "image" }, { @@ -361,87 +440,87 @@ }, { "package": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/pytorch-mnist-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/file-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/tfevent-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-hyperopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": 
"docker.io/kubeflowkatib/suggestion-hyperband", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-skopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-goptuna", - "tag": "v0.16.0-rc.1", - "type": "image" - }, - { - "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-enas", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-darts", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-pbt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/earlystopping-medianstop", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", "type": "image" }, { "package": "gcr.io/ml-pipeline/frontend", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/visualization-server", - "tag": "2.0.3", + "tag": "2.3.0", + "type": "image" + }, + { + "package": "docker.io/kubeflownotebookswg/poddefaults-webhook", + "tag": "v1.9.2", "type": "image" }, { @@ -451,17 +530,17 @@ }, { "package": "gcr.io/ml-pipeline/cache-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/centraldashboard", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/jupyter-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -469,6 +548,11 @@ "tag": "8.0.26", "type": "image" }, + { + "package": "docker.io/kubeflowkatib/katib-controller", + "tag": "v0.17.0", + "type": "image" + }, { "package": "docker.io/kubeflowkatib/katib-controller", "tag": "v0.16.0", @@ -476,7 +560,7 @@ }, { "package": "docker.io/kubeflowkatib/katib-db-manager", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { @@ -486,17 +570,22 @@ }, { "package": "docker.io/kubeflowkatib/katib-ui", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/models-web-app", + "tag": "v0.13.0-rc.0", "type": "image" }, { "package": "docker.io/kserve/models-web-app", - "tag": "v0.10.0", + "tag": "latest", "type": "image" }, { @@ -511,7 +600,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-envoy", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -521,7 +610,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-writer", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -531,52 +620,52 @@ }, { "package": "gcr.io/ml-pipeline/api-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/persistenceagent", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/scheduledworkflow", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboard-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflow/training-operator", - "tag": "v1-855e096", + "tag": "v1-04f9f13", "type": "image" 
}, { "package": "docker.io/kubeflownotebookswg/notebook-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/kfam", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/profile-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.13.1", + "tag": "v0.4.0", "type": "image" }, { @@ -586,17 +675,12 @@ }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", - "type": "image" - }, - { - "package": "docker.io/mariadb", - "tag": "latest", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/pvcviewer-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -604,16 +688,26 @@ "tag": "v3.3.10-license-compliance", "type": "image" }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.3.8-license-compliance", + "type": "image" + }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.4.17-license-compliance", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/volumes-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/ml-pipeline/viewer-crd-controller", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" } ] } - } + } \ No newline at end of file diff --git a/input/config/rocky/8.8/openldap.json b/input/config/rocky/8.8/openldap.json index 053f90f51..592788650 100644 --- a/input/config/rocky/8.8/openldap.json +++ b/input/config/rocky/8.8/openldap.json @@ -8,7 +8,8 @@ { "package": "ansible-role-ldaptoolbox-openldap", "type": "git", "url": "https://github.com/ltb-project/ansible-role-ldaptoolbox-openldap.git", - "version": "main" + "version": "main", + "commit": "695a689ff91a83b47fbc6f575be37e1f811bd719" } ] } diff --git a/input/config/rocky/8.8/secure_login_node.json b/input/config/rocky/8.8/secure_login_node.json index e20d0a2dd..9d190dffb 100644 --- a/input/config/rocky/8.8/secure_login_node.json +++ b/input/config/rocky/8.8/secure_login_node.json @@ -22,17 +22,17 @@ {"package": "wget", "type": "rpm", "repo_name": "appstream"}, {"package": "psacct", "type": "rpm", "repo_name": "baseos"}, {"package": "psacct", "type": "rpm", "repo_name": "baseos"}, - {"package": "python3.9", "type": "rpm", "repo_name": "appstream"}, - { - "package": "ansible==7.7.0", + {"package": "python3.11", "type": "rpm", "repo_name": "appstream"}, + { + "package": "ansible==9.5.1", "type": "pip_module" }, - { - "package": "cryptography==41.0.7", + { + "package": "cryptography==44.0.0", "type": "pip_module" }, - { - "package": "jinja2==3.1.2", + { + "package": "jinja2==3.1.2", "type": "pip_module" } diff --git a/input/config/rocky/8.8/telemetry.json b/input/config/rocky/8.8/telemetry.json index 60a108fcd..70eac1bb4 100644 --- a/input/config/rocky/8.8/telemetry.json +++ b/input/config/rocky/8.8/telemetry.json @@ -1,211 +1,248 @@ { - "telemetry": { - "cluster": [ - { + "telemetry": { + "cluster": [ + { "package": "buildkit", "type": "git", "url": "https://github.com/moby/buildkit.git", "version": "v0.13.1" - }, - { "package": "smartmontools", - "type": "rpm", - "repo_name": "baseos"}, - { - "package": "containerd.io-1.6.16-3.1.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-cli-1:20.10.20-3.el8", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-3:20.10.20-3.el8", - "type": "rpm", - "repo_name": 
"docker-ce-repo" - }, - { - "package": "docker-ce-rootless-extras", - "type": "rpm", - "repo_name": "docker-ce-repo" - }, - { - "package": "kubespray", - "type": "git", - "url": "https://github.com/kubernetes-sigs/kubespray.git", - "version": "release-2.23" - }, - { - "package": "kubectl", - "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubectl" - }, - { - "package": "kubelet", - "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubelet" - }, - { - "package": "kubeadm", - "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubeadm" - }, - { - "package": "calicoctl-v3.25.2", - "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" - }, - { - "package": "calicocrds-v3.25.2", - "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" - }, - { - "package": "cri-tools-v1.26.1", - "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" - }, - { - "package": "etcd-v3.5.10", - "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" - }, - { - "package": "cni-plugins-v1.3.0", - "type": "tarball", - "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" - }, - { - "package": "runc.amd64", - "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" - }, - { - "package": "nerdctl-v1.5.0", - "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" - }, - { - "package": "containerd-1.7.5", - "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" - }, - { - "package": "helm-v3.12.3", - "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" - }, - { - "package": "nfs-subdir-external-provisioner-4.0.18", - "type": "tarball", - "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" - }, - { - "package": "docker.io/library/nginx", - "tag": "1.25.2-alpine", - "type": "image" - }, - { - "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", - "type": "image" - }, - { - "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", - "tag": "v1.8.8", - "type": "image" - }, - { - "package": "registry.k8s.io/dns/k8s-dns-node-cache", - "tag": "1.22.28", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-apiserver", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-controller-manager", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-proxy", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-scheduler", - "tag": "v1.26.12", - "type": "image" - }, - { - "package": "registry.k8s.io/pause", - "tag": "3.9", - "type": "image" - }, - { - "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", - "type": "image" - }, - { - "package": "quay.io/calico/cni", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", - "type": "image" - 
}, - { - "package": "quay.io/calico/node", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "docker.io/flannel/flannel-cni-plugin", - "tag": "v1.1.2", - "type": "image" - }, - { - "package": "docker.io/flannel/flannel", - "tag": "v0.22.0", - "type": "image" - }, - { - "package": "quay.io/metallb/speaker", - "tag": "v0.13.9", - "type": "image" - }, - { - "package": "quay.io/metallb/controller", - "tag": "v0.13.9", - "type": "image" - }, - { - "package": "docker.io/kubernetesui/dashboard", - "tag": "v2.2.0", - "type": "image" - }, - { - "package": "docker.io/kubernetesui/metrics-scraper", - "tag": "v1.0.6", - "type": "image" - }, - { - "package": "docker.io/grafana/grafana-enterprise", - "tag": "8.3.2", - "type": "image" - } - ] - } + }, + { + "package": "smartmontools", + "type": "rpm", + "repo_name": "baseos" + }, + { + "package": "docker-ce-cli-1:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-3:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-rootless-extras", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "kubespray-v2.25.0", + "type": "git", + "url": "https://github.com/kubernetes-sigs/kubespray.git", + "version": "v2.25.0" + }, + { + "package": "kubectl-1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" + }, + { + "package": "kubelet-1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" + }, + { + "package": "kubeadm-1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" + }, + { + "package": "calicoctl-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" + }, + { + "package": "calicocrds-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" + }, + { + "package": "cri-tools-v1.29.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" + }, + { + "package": "etcd-v3.5.12", + "type": "tarball", + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" + }, + { + "package": "cni-plugins-v1.3.0", + "type": "tarball", + "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" + }, + { + "package": "runc.amd64-v1.1.12", + "type": "tarball", + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" + }, + { + "package": "nerdctl-v1.7.4", + "type": "tarball", + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + }, + { + "package": "containerd-1.7.16", + "type": "tarball", + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" + }, + { + "package": "helm-v3.14.2", + "type": "tarball", + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, + { + "package": "docker.io/library/nginx", + "tag": "1.25.2-alpine", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.11.1", + 
"type": "image" + }, + { + "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", + "tag": "v1.8.8", + "type": "image" + }, + { + "package": "registry.k8s.io/dns/k8s-dns-node-cache", + "tag": "1.22.28", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/pause", + "tag": "3.9", + "type": "image" + }, + { + "package": "quay.io/coreos/etcd", + "tag": "v3.5.12", + "type": "image" + }, + { + "package": "quay.io/calico/cni", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/kube-controllers", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/pod2daemon-flexvol", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/node", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel-cni-plugin", + "tag": "v1.1.2", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel", + "tag": "v0.22.0", + "type": "image" + }, + { + "package": "quay.io/metallb/speaker", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "quay.io/metallb/controller", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/dashboard", + "tag": "v2.7.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/metrics-scraper", + "tag": "v1.0.8", + "type": "image" + }, + { + "package": "docker.io/grafana/grafana-enterprise", + "tag": "8.3.2", + "type": "image" + }, + { + "package": "kube-prometheus-stack-62.3.0", + "type": "tarball", + "url": "https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-62.3.0/kube-prometheus-stack-62.3.0.tgz" + }, + { + "package": "quay.io/prometheus-operator/prometheus-operator", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-state-metrics/kube-state-metrics", + "tag": "v2.13.0", + "type": "image" + }, + { + "package": "quay.io/prometheus-operator/prometheus-config-reloader", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/alertmanager", + "tag": "v0.27.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/node-exporter", + "tag": "v1.8.2", + "type": "image" + }, + { + "package": "quay.io/prometheus/prometheus", + "tag": "v2.54.0", + "type": "image" + }, + { + "package": "registry.k8s.io/ingress-nginx/kube-webhook-certgen", + "tag": "v20221220-controller-v1.5.1-58-g787ea74b6", + "type": "image" + } + ] + } } diff --git a/input/config/rocky/8.8/vllm.json b/input/config/rocky/8.8/vllm.json index fe9d6a598..f722492de 100644 --- a/input/config/rocky/8.8/vllm.json +++ b/input/config/rocky/8.8/vllm.json @@ -1,40 +1,40 @@ -{ +{ - "vllm": { + "vllm": { "cluster": [] }, - "vllm_amd": { + "vllm_amd": { - "cluster": [ - { - "package": "docker.io/embeddedllminfo/vllm-rocm", - "tag": "vllm-v0.2.4", - "type": "image" + "cluster": [ + { + "package": "docker.io/embeddedllminfo/vllm-rocm", + "tag": "vllm-v0.2.4", + "type": "image" } ] }, - "vllm_nvidia": { + "vllm_nvidia": { - "cluster": [ - { - "package": "python3.9", + "cluster": [ + { + "package": "python3.11", "type": "rpm", "repo_name": "appstream" - - }, - { - "package": 
"https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp39-cp39-linux_x86_64.whl", + + }, + { + "package": "https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp311-cp311-linux_x86_64.whl", "type": "pip_module" }, - { - "package": "vllm", + { + "package": "vllm", "type": "pip_module" } ] } -} \ No newline at end of file +} diff --git a/input/config/ubuntu/20.04/amdgpu.json b/input/config/ubuntu/20.04/amdgpu.json index febf44666..dca5998c9 100644 --- a/input/config/ubuntu/20.04/amdgpu.json +++ b/input/config/ubuntu/20.04/amdgpu.json @@ -8,7 +8,7 @@ }, "rocm": { "cluster": [ - {"package": "rocm-hip-sdk{{ rocm_version }}*", "type": "deb", "repo_name": "rocm"} + {"package": "rocm", "type": "deb", "repo_name": "rocm"} ] } } diff --git a/input/config/ubuntu/20.04/k8s.json b/input/config/ubuntu/20.04/k8s.json index c313afd98..66c07a686 100644 --- a/input/config/ubuntu/20.04/k8s.json +++ b/input/config/ubuntu/20.04/k8s.json @@ -1,257 +1,247 @@ { - "k8s": { - "cluster": [ - { - "package": "containerd.io=1.6.20-1", - "type": "deb", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-cli=5:20.10.10~3-0~ubuntu-focal", - "type": "deb", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce=5:20.10.20~3-0~ubuntu-focal", - "type": "deb", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-rootless-extras=5:20.10.20~3-0~ubuntu-focal", - "type": "deb", - "repo_name": "docker-ce-repo" - }, - { - "package": "kubectl", - "type": "tarball", - "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubectl" - }, - { - "package": "kubelet", - "type": "tarball", - "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubelet" - }, - { - "package": "kubeadm", - "type": "tarball", - "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubeadm" - }, - { - "package": "calicoctl-v3.25.2", - "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" - }, - { - "package": "calicocrds-v3.25.2", - "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" - }, - { - "package": "cri-tools-v1.26.1", - "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" - }, - { - "package": "etcd-v3.5.10", - "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" - }, - { - "package": "cni-plugins-v1.3.0", - "type": "tarball", - "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" - }, - { - "package": "runc.amd64", - "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" - }, - { - "package": "nerdctl-v1.5.0", - "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" - }, - { - "package": "containerd-1.7.5", - "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" - }, - { - "package": "helm-v3.12.3", - "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" - }, - { - "package": "nvidia-device-plugin", - "type": "tarball", - "url": "https://nvidia.github.io/k8s-device-plugin/stable/nvidia-device-plugin-0.14.4.tgz" - }, - { - "package": "rocm-device-plugin", - "type": "manifest", - "url": 
"https://raw.githubusercontent.com/ROCm/k8s-device-plugin/r1.16/k8s-ds-amdgpu-dp.yaml" - }, - { - "package": "mpi-operator", - "type": "manifest", - "url": "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.4.0/deploy/v2beta1/mpi-operator.yaml" - }, - { - "package": "xilinx-device-plugin", - "type": "manifest", - "url": "https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-device-plugin/k8s-device-plugin.yml" - }, - { - "package": "spark-operator-v1beta2-1.3.8-3.1.1", - "type": "tarball", - "url": "https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/archive/refs/tags/v1beta2-1.3.8-3.1.1.tar.gz" - }, - { - "package": "nfs-subdir-external-provisioner-4.0.18", - "type": "tarball", - "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" - }, - { - "package": "docker.io/library/nginx", - "tag": "1.25.2-alpine", - "type": "image" - }, - { - "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", - "type": "image" - }, - { - "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", - "tag": "v1.8.8", - "type": "image" - }, - { - "package": "registry.k8s.io/dns/k8s-dns-node-cache", - "tag": "1.22.28", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-apiserver", - "tag": "v{{ k8s_version }}", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-controller-manager", - "tag": "v{{ k8s_version }}", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-proxy", - "tag": "v{{ k8s_version }}", - "type": "image" - }, - { - "package": "registry.k8s.io/kube-scheduler", - "tag": "v{{ k8s_version }}", - "type": "image" - }, - { - "package": "registry.k8s.io/pause", - "tag": "3.9", - "type": "image" - }, - { - "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", - "type": "image" - }, - { - "package": "quay.io/calico/cni", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "quay.io/calico/node", - "tag": "v3.25.2", - "type": "image" - }, - { - "package": "docker.io/flannel/flannel-cni-plugin", - "tag": "v1.1.2", - "type": "image" - }, - { - "package": "docker.io/flannel/flannel", - "tag": "v0.22.0", - "type": "image" - }, - { - "package": "docker.io/kubernetesui/dashboard", - "tag": "v2.7.0", - "type": "image" - }, - { - "package": "docker.io/kubernetesui/metrics-scraper", - "tag": "v1.0.8", - "type": "image" - }, - { - "package": "quay.io/metallb/speaker", - "tag": "v0.13.9", - "type": "image" - }, - { - "package": "quay.io/metallb/controller", - "tag": "v0.13.9", - "type": "image" - }, - { - "package": "registry.k8s.io/nfd/node-feature-discovery", - "tag": "v0.12.1", - "type": "image" - }, - { - "package": "docker.io/rocm/k8s-device-plugin", - "tag": "latest", - "type": "image" - }, - { - "package": "docker.io/mpioperator/mpi-operator", - "tag": "master", - "type": "image" - }, - { - "package": "public.ecr.aws/xilinx_dcg/k8s-device-plugin", - "tag": "1.3.0", - "type": "image" - }, - { - "package": "nvcr.io/nvidia/k8s-device-plugin", - "tag": "v0.14.4", - "type": "image" - }, - { - "package": "nvcr.io/nvidia/gpu-feature-discovery", - "tag": "v0.8.2", - "type": "image" - }, - { - "package": "registry.k8s.io/nfd/node-feature-discovery", - "tag": "v0.12.1", - "type": "image" - }, - { - "package": 
"registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", - "tag": "v4.0.2", - "type": "image" - } - ] + "k8s": { + "cluster": [ + { + "package": "nvidia-container-toolkit", + "type": "deb", + "repo_name": "nvidia-repo" + }, + { + "package": "kubectl-{{ k8s_version }}", + "type": "tarball", + "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubectl" + }, + { + "package": "kubelet-{{ k8s_version }}", + "type": "tarball", + "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubelet" + }, + { + "package": "kubeadm-{{ k8s_version }}", + "type": "tarball", + "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubeadm" + }, + { + "package": "calicoctl-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" + }, + { + "package": "calicocrds-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" + }, + { + "package": "cri-tools-v1.29.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" + }, + + { + "package": "etcd-v3.5.12", + "type": "tarball", + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" + }, + { + "package": "cni-plugins-v1.3.0", + "type": "tarball", + "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" + }, + { + "package": "runc.amd64.v1.1.12", + "type": "tarball", + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" + }, + { + "package": "nerdctl-v1.7.4", + "type": "tarball", + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + }, + { + "package": "containerd-1.7.16", + "type": "tarball", + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" + }, + { + "package": "helm-v3.14.2", + "type": "tarball", + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" + }, + { + "package": "nvidia-device-plugin", + "type": "tarball", + "url": "https://nvidia.github.io/k8s-device-plugin/stable/nvidia-device-plugin-0.14.4.tgz" + }, + { + "package": "rocm-device-plugin", + "type": "manifest", + "url": "https://raw.githubusercontent.com/ROCm/k8s-device-plugin/r1.16/k8s-ds-amdgpu-dp.yaml" + }, + { + "package": "habana-device-plugin", + "type": "manifest", + "url": "https://vault.habana.ai/artifactory/docker-k8s-device-plugin/habana-k8s-device-plugin.yaml" + }, + { + "package": "mpi-operator-v0.5.0", + "type": "manifest", + "url": "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.5.0/deploy/v2beta1/mpi-operator.yaml" + }, + { + "package": "xilinx-device-plugin", + "type": "manifest", + "url": "https://raw.githubusercontent.com/Xilinx/FPGA_as_a_Service/master/k8s-device-plugin/k8s-device-plugin.yml" + }, + { + "package": "spark-operator-v1beta2-1.3.8-3.1.1", + "type": "tarball", + "url": "https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/archive/refs/tags/v1beta2-1.3.8-3.1.1.tar.gz" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, + { + "package": "docker.io/library/nginx", + "tag": "1.25.2-alpine", + "type": "image" + }, + { + 
"package": "registry.k8s.io/coredns/coredns", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", + "tag": "v1.8.8", + "type": "image" + }, + { + "package": "registry.k8s.io/dns/k8s-dns-node-cache", + "tag": "1.22.28", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v{{ k8s_version }}", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v{{ k8s_version }}", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v{{ k8s_version }}", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v{{ k8s_version }}", + "type": "image" + }, + { + "package": "registry.k8s.io/pause", + "tag": "3.9", + "type": "image" + }, + { + "package": "quay.io/coreos/etcd", + "tag": "v3.5.12", + "type": "image" + }, + { + "package": "quay.io/calico/cni", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/kube-controllers", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/pod2daemon-flexvol", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/node", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel-cni-plugin", + "tag": "v1.1.2", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel", + "tag": "v0.22.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/dashboard", + "tag": "v2.7.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/metrics-scraper", + "tag": "v1.0.8", + "type": "image" + }, + { + "package": "quay.io/metallb/speaker", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "quay.io/metallb/controller", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "registry.k8s.io/nfd/node-feature-discovery", + "tag": "v0.12.1", + "type": "image" + }, + { + "package": "docker.io/rocm/k8s-device-plugin", + "tag": "latest", + "type": "image" + }, + { + "package": "docker.io/mpioperator/mpi-operator", + "tag": "0.5.0", + "type": "image" + }, + { + "package": "public.ecr.aws/xilinx_dcg/k8s-device-plugin", + "tag": "1.3.0", + "type": "image" + }, + { + "package": "nvcr.io/nvidia/k8s-device-plugin", + "tag": "v0.14.4", + "type": "image" + }, + { + "package": "vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin", + "tag": "1.16.2", + "type": "image" + }, + { + "package": "nvcr.io/nvidia/gpu-feature-discovery", + "tag": "v0.8.2", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner", + "tag": "v4.0.2", + "type": "image" } + ] } - \ No newline at end of file +} diff --git a/input/config/ubuntu/20.04/kserve.json b/input/config/ubuntu/20.04/kserve.json index c2d2869dc..8632d3869 100644 --- a/input/config/ubuntu/20.04/kserve.json +++ b/input/config/ubuntu/20.04/kserve.json @@ -4,111 +4,111 @@ { "package": "istio", "type": "tarball", - "url": "https://github.com/istio/istio/releases/download/1.17.0/istio-1.17.0-linux-amd64.tar.gz" + "url": "https://github.com/istio/istio/releases/download/1.20.4/istio-1.20.4-linux-amd64.tar.gz" }, { "package": "docker.io/istio/proxyv2", - "tag": "1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "docker.io/istio/pilot", - "tag": "1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "knative_serving_crds_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-crds.yaml" + "url": 
"https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-crds.yaml" }, { "package": "knative_serving_core_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-core.yaml" + "url": "https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-core.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "987f53e3ead58627e3022c8ccbb199ed71b965f10c59485bab8015ecf18b44af", + "digest": "e52286fc4843470383e917abc9c1b0c8d10f585c4274c57b612279869bc86f0d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "6b98eed95dd6dcc3d957e673aea3d271b768225442504316d713c08524f44ebe", + "digest": "21f8e11a44bf1e260602d30e6762a3dc433c608d1dd0e309c0ff89728e71901d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "5b52cc9aa521ee236645db57f19b70f2a0e8f6ef27dfa9181409a0f96406e2ad", + "digest": "34796e9f760bb67065c6f101296513b38d04d39d11888e919692ac46fa6dc7c2", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "b11dbcba050eac9084edd021b7e0eee16b39c9e397b245bc4227266af1893404", + "digest": "53d9aa4d2c7a82f5a01202e386f7503b21839cbe2e5e62f1e9bda2aa5f11b518", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "7b138c73fcaaf0b9bb2d414b8a89a780f8c09371d24c6f57969be1694acf4aaa", + "digest": "700c69915dc7cd86dffb61c26b0ba34427fab809de1e3344589dd955b6440882", "type": "image" }, { "package": "knative_net_istio_manifest", "type": "manifest", - "url": "https://github.com/knative/net-istio/releases/download/knative-v1.11.0/net-istio.yaml" + "url": "https://github.com/knative/net-istio/releases/download/knative-v1.13.1/net-istio.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "27e7beb7c62036216fc464fb2181e56b030158ad4ceb57a7de172f54b4fe43db", + "digest": "a5b041ba3c9ea40198b2331617bd1571942961c1416ef683b4de8ef162755a88", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "0cdef272e39c57971ce9977765f164dd8e3abb9395a4f60e7a4160d57dcc09f2", + "digest": "f066376eee17505d14881b7635a7ca7531fce0f30cf968232fc0a93adc952ed5", "type": "image" }, { "package": "cert_manager_manifest", "type": "manifest", - "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml" + "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.14.5/cert-manager.yaml" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-webhook", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "kserve_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve.yaml" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/storage-initializer", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/router", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": 
"docker.io/kserve/agent", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { @@ -119,7 +119,7 @@ { "package": "kserve_runtimes_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve-runtimes.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve-cluster-resources.yaml" }, { "package": "docker.io/seldonio/mlserver", @@ -128,7 +128,7 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" } ] diff --git a/input/config/ubuntu/20.04/kubeflow.json b/input/config/ubuntu/20.04/kubeflow.json index e8e47c143..d17bbf9d1 100644 --- a/input/config/ubuntu/20.04/kubeflow.json +++ b/input/config/ubuntu/20.04/kubeflow.json @@ -4,43 +4,47 @@ { "package": "kubeflow", "type": "git", - "url": "https://github.com/kubeflow/manifests.git" - , - "version": "v1.8.0" + "url": "https://github.com/kubeflow/manifests.git", + "version": "v1.9.1" }, { "package": "kustomize", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.0.3/kustomize_v5.0.3_linux_amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.4.3/kustomize_v5.4.3_linux_amd64.tar.gz" }, { "package": "ghcr.io/dexidp/dex", - "tag": "v2.36.0", + "tag": "v2.39.1", + "type": "image" + }, + { + "package": "ghcr.io/dexidp/dex", + "tag": "v2.35.0", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/apiserver_receive_adapter", - "digest": "828db8155996e40c13b77c1d039dba98153dcfcbe272248e92866bd7b6d6a17d", + "digest": "4ed3e39a11f4fc3358787433beaea4a9e72773ea7710bf4beb95aa8770515c9e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "c2994c2b6c2c7f38ad1b85c71789bf1753cc8979926423c83231e62258837cb9", + "digest": "ad42ddc9bc4e25fdc88c240d7cbfad4b2708eb7d26e07ae904d258011141116e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "8319aa662b4912e8175018bd7cc90c63838562a27515197b803bdcd5634c7007", + "digest": "66aa0dbceee62691d5327e423bbd7cbd411903747adeab61fdc81b14590793d4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "98a2cc7fd62ee95e137116504e7166c32c65efef42c3d1454630780410abf943", + "digest": "e5b7b6edd265b66d32f424bd245c06455154462ade6ce05698472212248d5657", "type": "image" }, { @@ -55,97 +59,97 @@ }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "eeff0ad31550f3ff519d988bb36bfe214e5b60c1ec4349c1f9bb2b2d8cad9479", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "4305209ce498caf783f39c8f3e85dfa635ece6947033bf50b0b627983fd65953", + "digest": "48aee2733721ecc77956abc5a2ca072853a669ebc97519beb48f7b3da8455e67", "type": "image" }, { "package": 
"gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "bc91e1fdaf3b67876ca33de1ce15b1268ed0ca8da203102b7699286fae97cf58", + "digest": "232d6ffd88dfc0d0ec02c6f3a95520283d076c16b77543cee04f4ef276e0b7ae", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "dabaecec38860ca4c972e6821d5dc825549faf50c6feb8feb4c04802f2338b8a", - "type": "image" - }, - { - "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "89e6f90141f1b63405883fbb4de0d3b6d80f8b77e530904c4d29bdcd1dc5a167", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "56780f69e6496bb4790b0c147deb652a2b020ff81e08d58cc58a61cd649b1121", + "digest": "d438c3ad2fcef3c7ea1b3abb910f5fa911c8a1466d6460ac0b11bf034797d6f6", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/mtchannel_broker", - "digest": "4040ffc2d34e950b7969b4ba90cec29e65e506126ddb195faf3a56cb2fa653e8", + "digest": "9dc9e0b00325f1ec994ef6f48761ba7d9217333fa0c2cbfccfa9b204e3f616a9", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/ingress", - "digest": "7f3b05f6e0abae19e9438fac44dd9938ddd2293014ef0fb8d388450c9ff63000", + "digest": "65412cf797d0bb7c7e22454431f57f8d9dcedf93620769f4c1206947acf05abb", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/filter", - "digest": "29bd9f43359153c0ea39cf382d5f25ca43f55abbbce3d802ca37cc4d5c4a6942", + "digest": "4e3cf0703024129c60b66529f41a1d29310f61f6aced24d25fd241e43b1a2e8e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_dispatcher", - "digest": "521234b4cff9d3cd32f8264cd7c830caa06f9982637b4866e983591fa1abc418", + "digest": "fa64db1ad126874f4e5ce1c17c2414b0fc3dde2a7e0db6fde939cafdbd4d96cd", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_controller", - "digest": "e004174a896811aec46520b1f2857f1973762389426bb0e0fc5d2332d5e36c7a", + "digest": "5386029f1fdcce1398dcca436864051a2f7eb5abed176453104f41b7b9b587f9", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/mtping", - "digest": "6d35cc98baa098fc0c5b4290859e363a8350a9dadc31d1191b0b5c9796958223", + "digest": "9d74e8c69d671ad10fdfd84d33569fde5c16c9f95824ea288d2cb6fd69e32f4d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": 
"quay.io/jetstack/cert-manager-webhook", - "tag": "v1.12.2", + "tag": "v1.14.5", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", "type": "image" }, { @@ -155,7 +159,7 @@ }, { "package": "docker.io/istio/pilot", - "tag": "1.17.5", + "tag": "1.22.1", "type": "image" }, { @@ -165,7 +169,7 @@ }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" }, @@ -179,9 +183,14 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/kubeflowkatib/pytorch-mnist", + "tag": "v1beta1-45c5727", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/jupyter-scipy", - "tag": "v1.6.1", + "tag": "v1.9.2", "type": "image" }, { @@ -189,6 +198,36 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/busybox", + "tag": "1.28", + "type": "image" + }, + { + "package": "docker.io/busybox", + "tag": "1.34.1", + "type": "image" + }, + { + "package": "docker.io/bentoml/fraud_detection", + "tag": "o5smnagbncigycvj", + "type": "image" + }, + { + "package": "docker.io/istio/install-cni", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/pilot", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", + "type": "image" + }, { "package": "gcr.io/kubeflow-images-public/profile-controller", "tag": "v20190228-v0.4.0-rc.1-192-g1a802656-dirty-f95773", @@ -196,7 +235,7 @@ }, { "package": "docker.io/seldonio/seldon-core-operator", - "tag": "1.17.1", + "tag": "1.18.1", "type": "image" }, { @@ -206,7 +245,7 @@ }, { "package": "docker.io/rayproject/ray", - "tag": "2.2.0-py38-cpu", + "tag": "2.23.0-py311-cpu", "type": "image" }, { @@ -216,12 +255,22 @@ }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, + { + "package": "docker.io/kserve/huggingfaceserver", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/storage-initializer", + "tag": "v0.13.1", + "type": "image" + }, { "package": "docker.io/kserve/xgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -231,7 +280,7 @@ }, { "package": "docker.io/pytorch/torchserve-kfs", - "tag": "0.8.2", + "tag": "0.9.0", "type": "image" }, { @@ -241,17 +290,17 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/pmmlserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/paddleserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -261,7 +310,7 @@ }, { "package": "docker.io/kserve/lgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -271,14 +320,24 @@ }, { "package": "quay.io/bentoml/yatai-image-builder", - "tag": "1.1.3", + "tag": "1.2.28", "type": "image" }, + { + "package": "quay.io/bentoml/yatai-deployment", + "tag": "1.1.21", + "type": "image" + }, { "package": "quay.io/oauth2-proxy/oauth2-proxy", "tag": "latest", "type": "image" }, + { + "package": "quay.io/oauth2-proxy/oauth2-proxy", + "tag": "v7.6.0", + "type": "image" + }, { "package": "docker.io/prom/prometheus", "tag": "latest", @@ -306,9 +365,24 @@ }, { "package": "docker.io/postgres", - "tag": "12-alpine", + "tag": "14.5-alpine", "type": "image" }, + { + "package": "docker.io/postgres", + "tag": "12-alpine", + "type": 
"image" + }, + { + "package": "docker.io/postgres", + "tag": "14.7-alpine", + "type": "image" + }, + { + "package": "docker.io/postgres", + "tag": "latest", + "type": "image" + }, { "package": "quay.io/argoproj/argocli", "tag": "latest", @@ -336,7 +410,7 @@ }, { "package": "gcr.io/tfx-oss-public/ml_metadata_store_server", - "tag": "1.5.0", + "tag": "1.14.0", "type": "image" }, { @@ -349,9 +423,14 @@ "tag": "nightly", "type": "image" }, + { + "package": "quay.io/aipipeline/pipelineloop-controller", + "tag": "nightly", + "type": "image" + }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.4.0", + "tag": "v0.11.0", "type": "image" }, { @@ -361,87 +440,87 @@ }, { "package": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/pytorch-mnist-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/file-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/tfevent-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-hyperopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-hyperband", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-skopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-goptuna", - "tag": "v0.16.0-rc.1", - "type": "image" - }, - { - "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-enas", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-darts", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-pbt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/earlystopping-medianstop", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", "type": "image" }, { "package": "gcr.io/ml-pipeline/frontend", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/visualization-server", - "tag": "2.0.3", + "tag": "2.3.0", + "type": "image" + }, + { + "package": "docker.io/kubeflownotebookswg/poddefaults-webhook", + "tag": "v1.9.2", "type": "image" }, { @@ -451,17 +530,17 @@ }, { "package": "gcr.io/ml-pipeline/cache-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/centraldashboard", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/jupyter-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -469,6 +548,11 @@ "tag": "8.0.26", "type": "image" }, + { + "package": "docker.io/kubeflowkatib/katib-controller", + "tag": "v0.17.0", + "type": "image" + }, { "package": "docker.io/kubeflowkatib/katib-controller", "tag": "v0.16.0", @@ -476,7 +560,7 @@ }, { "package": "docker.io/kubeflowkatib/katib-db-manager", - "tag": "v0.16.0", + "tag": 
"v0.17.0", "type": "image" }, { @@ -486,17 +570,22 @@ }, { "package": "docker.io/kubeflowkatib/katib-ui", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/models-web-app", + "tag": "v0.13.0-rc.0", "type": "image" }, { "package": "docker.io/kserve/models-web-app", - "tag": "v0.10.0", + "tag": "latest", "type": "image" }, { @@ -511,7 +600,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-envoy", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -521,7 +610,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-writer", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -531,52 +620,52 @@ }, { "package": "gcr.io/ml-pipeline/api-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/persistenceagent", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/scheduledworkflow", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboard-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflow/training-operator", - "tag": "v1-855e096", + "tag": "v1-04f9f13", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/notebook-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/kfam", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/profile-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.13.1", + "tag": "v0.4.0", "type": "image" }, { @@ -586,17 +675,12 @@ }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", - "type": "image" - }, - { - "package": "docker.io/mariadb", - "tag": "latest", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/pvcviewer-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -604,14 +688,24 @@ "tag": "v3.3.10-license-compliance", "type": "image" }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.3.8-license-compliance", + "type": "image" + }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.4.17-license-compliance", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/volumes-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/ml-pipeline/viewer-crd-controller", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" } ] diff --git a/input/config/ubuntu/20.04/openldap.json b/input/config/ubuntu/20.04/openldap.json index e46652bc4..b81dd622c 100644 --- a/input/config/ubuntu/20.04/openldap.json +++ b/input/config/ubuntu/20.04/openldap.json @@ -9,7 +9,8 @@ { "package": "ansible-role-ldaptoolbox-openldap", "type": "git", "url": "https://github.com/ltb-project/ansible-role-ldaptoolbox-openldap.git", - "version": "main" + "version": "main", + "commit": "695a689ff91a83b47fbc6f575be37e1f811bd719" } ] } diff --git a/input/config/ubuntu/20.04/secure_login_node.json b/input/config/ubuntu/20.04/secure_login_node.json index f03050a94..f46509984 100644 --- a/input/config/ubuntu/20.04/secure_login_node.json +++ 
b/input/config/ubuntu/20.04/secure_login_node.json @@ -21,13 +21,13 @@ {"package": "tar", "type": "deb", "repo_name": "focal"}, {"package": "wget", "type": "deb", "repo_name": "focal"}, {"package": "acct", "type": "deb", "repo_name": "focal"}, - {"package": "python3.9", "type": "deb", "repo_name": "deadsnake-ppa"}, + {"package": "python3.11", "type": "deb", "repo_name": "deadsnake-ppa"}, { - "package": "ansible==7.7.0", + "package": "ansible==9.5.1", "type": "pip_module" }, { - "package": "cryptography==41.0.7", + "package": "cryptography==44.0.0", "type": "pip_module" }, { diff --git a/input/config/ubuntu/20.04/telemetry.json b/input/config/ubuntu/20.04/telemetry.json index cd40f70ad..78b9ae9e1 100644 --- a/input/config/ubuntu/20.04/telemetry.json +++ b/input/config/ubuntu/20.04/telemetry.json @@ -10,11 +10,6 @@ { "package": "smartmontools", "type": "deb", "repo_name": "focal" - }, - { - "package": "containerd.io=1.6.20-1", - "type": "deb", - "repo_name": "docker-ce-repo" }, { "package": "docker-ce-cli=5:20.10.20~3-0~ubuntu-focal", @@ -32,45 +27,45 @@ "repo_name": "docker-ce-repo" }, { - "package": "kubespray", + "package": "kubespray-v2.25.0", "type": "git", "url": "https://github.com/kubernetes-sigs/kubespray.git", - "version": "release-2.23" + "version": "v2.25.0" }, { - "package": "kubectl", + "package": "kubectl-1.29.5", "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubectl" + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" }, { - "package": "kubelet", + "package": "kubelet-1.29.5", "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubelet" + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" }, { - "package": "kubeadm", + "package": "kubeadm-1.29.5", "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubeadm" + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" }, { - "package": "calicoctl-v3.25.2", + "package": "calicoctl-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" }, { - "package": "calicocrds-v3.25.2", + "package": "calicocrds-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" }, { - "package": "cri-tools-v1.26.1", + "package": "cri-tools-v1.29.0", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" }, { - "package": "etcd-v3.5.10", + "package": "etcd-v3.5.12", "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" }, { "package": "cni-plugins-v1.3.0", @@ -78,24 +73,24 @@ "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" }, { - "package": "runc.amd64", + "package": "runc.amd64-v1.1.12", "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" }, { - "package": "nerdctl-v1.5.0", + 
"package": "nerdctl-v1.7.4", "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" }, { - "package": "containerd-1.7.5", + "package": "containerd-1.7.16", "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" }, { - "package": "helm-v3.12.3", + "package": "helm-v3.14.2", "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" }, { "package": "nfs-subdir-external-provisioner-4.0.18", @@ -109,7 +104,7 @@ }, { "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", + "tag": "v1.11.1", "type": "image" }, { @@ -124,22 +119,22 @@ }, { "package": "registry.k8s.io/kube-apiserver", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { "package": "registry.k8s.io/kube-controller-manager", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { "package": "registry.k8s.io/kube-proxy", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { "package": "registry.k8s.io/kube-scheduler", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { @@ -149,27 +144,27 @@ }, { "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", + "tag": "v3.5.12", "type": "image" }, { "package": "quay.io/calico/cni", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/node", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { @@ -194,19 +189,60 @@ }, { "package": "docker.io/kubernetesui/dashboard", - "tag": "v2.2.0", + "tag": "v2.7.0", "type": "image" }, { "package": "docker.io/kubernetesui/metrics-scraper", - "tag": "v1.0.6", + "tag": "v1.0.8", "type": "image" - }, + }, { "package": "docker.io/grafana/grafana-enterprise", "tag": "8.3.2", "type": "image" + }, + { + "package": "kube-prometheus-stack-62.3.0", + "type": "tarball", + "url": "https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-62.3.0/kube-prometheus-stack-62.3.0.tgz" + }, + { + "package": "quay.io/prometheus-operator/prometheus-operator", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-state-metrics/kube-state-metrics", + "tag": "v2.13.0", + "type": "image" + }, + { + "package": "quay.io/prometheus-operator/prometheus-config-reloader", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/alertmanager", + "tag": "v0.27.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/node-exporter", + "tag": "v1.8.2", + "type": "image" + }, + { + "package": "quay.io/prometheus/prometheus", + "tag": "v2.54.0", + "type": "image" + }, + { + "package": "registry.k8s.io/ingress-nginx/kube-webhook-certgen", + "tag": "v20221220-controller-v1.5.1-58-g787ea74b6", + "type": "image" } ] } } + diff --git a/input/config/ubuntu/20.04/vllm.json b/input/config/ubuntu/20.04/vllm.json index 0957956d6..ac5ce82dc 100644 --- a/input/config/ubuntu/20.04/vllm.json +++ b/input/config/ubuntu/20.04/vllm.json @@ -20,21 +20,21 @@ "cluster": [ { - 
"package": "python3.9", + "package": "python3.11", "type": "deb", "repo_name": "deadsnake-ppa" }, { - "package": "https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp39-cp39-linux_x86_64.whl", + "package": "https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp311-cp311-linux_x86_64.whl", "type": "pip_module" }, { - "package": "https://github.com/vllm-project/vllm/releases/download/v0.4.0/vllm-0.4.0-cp39-cp39-manylinux1_x86_64.whl", + "package": "https://github.com/vllm-project/vllm/releases/download/v0.4.0/vllm-0.4.0-cp311-cp311-manylinux1_x86_64.whl", "type": "pip_module" } ] } -} \ No newline at end of file +} diff --git a/input/config/ubuntu/22.04/amdgpu.json b/input/config/ubuntu/22.04/amdgpu.json index 9ea3c14ee..9239a76a3 100644 --- a/input/config/ubuntu/22.04/amdgpu.json +++ b/input/config/ubuntu/22.04/amdgpu.json @@ -8,7 +8,7 @@ }, "rocm": { "cluster": [ - {"package": "rocm-hip-sdk{{ rocm_version }}*", "type": "deb", "repo_name": "rocm"} + {"package": "rocm", "type": "deb", "repo_name": "rocm"} ] } } diff --git a/input/config/ubuntu/22.04/csi_driver_powerscale.json b/input/config/ubuntu/22.04/csi_driver_powerscale.json new file mode 100644 index 000000000..afc8fe181 --- /dev/null +++ b/input/config/ubuntu/22.04/csi_driver_powerscale.json @@ -0,0 +1,84 @@ +{ + "csi_driver_powerscale": { + "cluster": [ + { + "package": "csi-powerscale", + "url": "https://github.com/dell/csi-powerscale.git", + "type": "git", + "version": "v2.11.0" + }, + { + "package": "external-snapshotter", + "url": "https://github.com/kubernetes-csi/external-snapshotter.git", + "type": "git", + "version": "v8.0.1" + }, + { + "package": "helm-charts", + "url": "https://github.com/dell/helm-charts.git", + "type": "git", + "version": "csi-isilon-2.11.0" + }, + { + "package": "docker.io/dellemc/csi-isilon", + "tag": "v2.11.0", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-attacher", + "tag": "v4.6.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-provisioner", + "tag": "v5.0.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-snapshotter", + "tag": "v8.0.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-resizer", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-node-driver-registrar", + "tag": "v2.10.1", + "type": "image" + }, + { + "package": "registry.k8s.io/sig-storage/csi-external-health-monitor-controller", + "tag": "v0.12.1", + "type": "image" + }, + { + "package": "docker.io/dellemc/dell-csi-replicator", + "tag": "v1.9.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/podmon", + "tag": "v1.10.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csm-authorization-sidecar", + "tag": "v1.11.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csi-metadata-retriever", + "tag": "v1.8.0", + "type": "image" + }, + { + "package": "docker.io/dellemc/csm-encryption", + "tag": "v0.6.0", + "type": "image" + } + ] + } +} diff --git a/input/config/ubuntu/22.04/intelgaudi.json b/input/config/ubuntu/22.04/intelgaudi.json new file mode 100644 index 000000000..f0d9d6157 --- /dev/null +++ b/input/config/ubuntu/22.04/intelgaudi.json @@ -0,0 +1,62 @@ +{ + "intelgaudi": { + "cluster": [ + { + "package": "habanalabs-dkms={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": "habanalabs-firmware={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": 
"habanalabs-firmware-tools={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": "habanalabs-graph={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": "habanalabs-qual={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": "habanalabs-rdma-core={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": "habanalabs-thunk={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": "habanatools={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + }, + { + "package": "hccl_demo", + "type": "git", + "url": "https://github.com/HabanaAI/hccl_demo.git", + "version": "main" + } + ] + }, + "intel": { + "cluster": [ + + { + "package": "habanalabs-container-runtime={{ intelgaudi_version }}", + "type": "deb", + "repo_name": "intelgaudi" + } + ] + } +} diff --git a/input/config/ubuntu/22.04/k8s.json b/input/config/ubuntu/22.04/k8s.json index b6be74b73..66c07a686 100644 --- a/input/config/ubuntu/22.04/k8s.json +++ b/input/config/ubuntu/22.04/k8s.json @@ -2,59 +2,45 @@ "k8s": { "cluster": [ { - "package": "containerd.io=1.6.20-1", + "package": "nvidia-container-toolkit", "type": "deb", - "repo_name": "docker-ce-repo" + "repo_name": "nvidia-repo" }, { - "package": "docker-ce-cli=5:20.10.20~3-0~ubuntu-jammy", - "type": "deb", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce=5:20.10.20~3-0~ubuntu-jammy", - "type": "deb", - "repo_name": "docker-ce-repo" - }, - { - "package": "docker-ce-rootless-extras=5:20.10.20~3-0~ubuntu-jammy", - "type": "deb", - "repo_name": "docker-ce-repo" - }, - { - "package": "kubectl", + "package": "kubectl-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubectl" }, { - "package": "kubelet", + "package": "kubelet-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubelet" }, { - "package": "kubeadm", + "package": "kubeadm-{{ k8s_version }}", "type": "tarball", "url": "https://dl.k8s.io/release/v{{ k8s_version }}/bin/linux/amd64/kubeadm" }, { - "package": "calicoctl-v3.25.2", + "package": "calicoctl-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" }, { - "package": "calicocrds-v3.25.2", + "package": "calicocrds-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" }, { - "package": "cri-tools-v1.26.1", + "package": "cri-tools-v1.29.0", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" }, + { - "package": "etcd-v3.5.10", + "package": "etcd-v3.5.12", "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" }, { "package": "cni-plugins-v1.3.0", @@ -62,24 +48,24 @@ "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" }, { - "package": 
"runc.amd64", + "package": "runc.amd64.v1.1.12", "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" }, { - "package": "nerdctl-v1.5.0", + "package": "nerdctl-v1.7.4", "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" }, { - "package": "containerd-1.7.5", + "package": "containerd-1.7.16", "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" }, { - "package": "helm-v3.12.3", + "package": "helm-v3.14.2", "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" }, { "package": "nvidia-device-plugin", @@ -92,9 +78,14 @@ "url": "https://raw.githubusercontent.com/ROCm/k8s-device-plugin/r1.16/k8s-ds-amdgpu-dp.yaml" }, { - "package": "mpi-operator", + "package": "habana-device-plugin", "type": "manifest", - "url": "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.4.0/deploy/v2beta1/mpi-operator.yaml" + "url": "https://vault.habana.ai/artifactory/docker-k8s-device-plugin/habana-k8s-device-plugin.yaml" + }, + { + "package": "mpi-operator-v0.5.0", + "type": "manifest", + "url": "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.5.0/deploy/v2beta1/mpi-operator.yaml" }, { "package": "xilinx-device-plugin", @@ -118,7 +109,7 @@ }, { "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", + "tag": "v1.11.1", "type": "image" }, { @@ -158,27 +149,27 @@ }, { "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", + "tag": "v3.5.12", "type": "image" }, { "package": "quay.io/calico/cni", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/node", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { @@ -223,7 +214,7 @@ }, { "package": "docker.io/mpioperator/mpi-operator", - "tag": "master", + "tag": "0.5.0", "type": "image" }, { @@ -237,13 +228,13 @@ "type": "image" }, { - "package": "nvcr.io/nvidia/gpu-feature-discovery", - "tag": "v0.8.2", + "package": "vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin", + "tag": "1.16.2", "type": "image" }, { - "package": "registry.k8s.io/nfd/node-feature-discovery", - "tag": "v0.12.1", + "package": "nvcr.io/nvidia/gpu-feature-discovery", + "tag": "v0.8.2", "type": "image" }, { diff --git a/input/config/ubuntu/22.04/kserve.json b/input/config/ubuntu/22.04/kserve.json index c2d2869dc..8632d3869 100644 --- a/input/config/ubuntu/22.04/kserve.json +++ b/input/config/ubuntu/22.04/kserve.json @@ -4,111 +4,111 @@ { "package": "istio", "type": "tarball", - "url": "https://github.com/istio/istio/releases/download/1.17.0/istio-1.17.0-linux-amd64.tar.gz" + "url": "https://github.com/istio/istio/releases/download/1.20.4/istio-1.20.4-linux-amd64.tar.gz" }, { "package": "docker.io/istio/proxyv2", - "tag": "1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "docker.io/istio/pilot", - "tag": 
"1.17.0", + "tag": "1.20.4", "type": "image" }, { "package": "knative_serving_crds_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-crds.yaml" + "url": "https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-crds.yaml" }, { "package": "knative_serving_core_manifest", "type": "manifest", - "url": "https://github.com/knative/serving/releases/download/knative-v1.11.0/serving-core.yaml" + "url": "https://github.com/knative/serving/releases/download/knative-v1.13.1/serving-core.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "987f53e3ead58627e3022c8ccbb199ed71b965f10c59485bab8015ecf18b44af", + "digest": "e52286fc4843470383e917abc9c1b0c8d10f585c4274c57b612279869bc86f0d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "6b98eed95dd6dcc3d957e673aea3d271b768225442504316d713c08524f44ebe", + "digest": "21f8e11a44bf1e260602d30e6762a3dc433c608d1dd0e309c0ff89728e71901d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "5b52cc9aa521ee236645db57f19b70f2a0e8f6ef27dfa9181409a0f96406e2ad", + "digest": "34796e9f760bb67065c6f101296513b38d04d39d11888e919692ac46fa6dc7c2", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "b11dbcba050eac9084edd021b7e0eee16b39c9e397b245bc4227266af1893404", + "digest": "53d9aa4d2c7a82f5a01202e386f7503b21839cbe2e5e62f1e9bda2aa5f11b518", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "7b138c73fcaaf0b9bb2d414b8a89a780f8c09371d24c6f57969be1694acf4aaa", + "digest": "700c69915dc7cd86dffb61c26b0ba34427fab809de1e3344589dd955b6440882", "type": "image" }, { "package": "knative_net_istio_manifest", "type": "manifest", - "url": "https://github.com/knative/net-istio/releases/download/knative-v1.11.0/net-istio.yaml" + "url": "https://github.com/knative/net-istio/releases/download/knative-v1.13.1/net-istio.yaml" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "27e7beb7c62036216fc464fb2181e56b030158ad4ceb57a7de172f54b4fe43db", + "digest": "a5b041ba3c9ea40198b2331617bd1571942961c1416ef683b4de8ef162755a88", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "0cdef272e39c57971ce9977765f164dd8e3abb9395a4f60e7a4160d57dcc09f2", + "digest": "f066376eee17505d14881b7635a7ca7531fce0f30cf968232fc0a93adc952ed5", "type": "image" }, { "package": "cert_manager_manifest", "type": "manifest", - "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml" + "url": "https://github.com/cert-manager/cert-manager/releases/download/v1.14.5/cert-manager.yaml" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-webhook", - "tag": "v1.13.0", + "tag": "v1.14.5", "type": "image" }, { "package": "kserve_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve.yaml" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { 
"package": "docker.io/kserve/storage-initializer", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/router", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { "package": "docker.io/kserve/agent", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" }, { @@ -119,7 +119,7 @@ { "package": "kserve_runtimes_manifest", "type": "manifest", - "url": "https://github.com/kserve/kserve/releases/download/v0.11.2/kserve-runtimes.yaml" + "url": "https://github.com/kserve/kserve/releases/download/v0.13.0/kserve-cluster-resources.yaml" }, { "package": "docker.io/seldonio/mlserver", @@ -128,7 +128,7 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.0", "type": "image" } ] diff --git a/input/config/ubuntu/22.04/kubeflow.json b/input/config/ubuntu/22.04/kubeflow.json index e8e47c143..d040667bf 100644 --- a/input/config/ubuntu/22.04/kubeflow.json +++ b/input/config/ubuntu/22.04/kubeflow.json @@ -4,43 +4,47 @@ { "package": "kubeflow", "type": "git", - "url": "https://github.com/kubeflow/manifests.git" - , - "version": "v1.8.0" + "url": "https://github.com/kubeflow/manifests.git", + "version": "v1.9.1" }, { "package": "kustomize", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.0.3/kustomize_v5.0.3_linux_amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.4.3/kustomize_v5.4.3_linux_amd64.tar.gz" }, { "package": "ghcr.io/dexidp/dex", - "tag": "v2.36.0", + "tag": "v2.39.1", + "type": "image" + }, + { + "package": "ghcr.io/dexidp/dex", + "tag": "v2.35.0", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/apiserver_receive_adapter", - "digest": "828db8155996e40c13b77c1d039dba98153dcfcbe272248e92866bd7b6d6a17d", + "digest": "4ed3e39a11f4fc3358787433beaea4a9e72773ea7710bf4beb95aa8770515c9e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/activator", - "digest": "c2994c2b6c2c7f38ad1b85c71789bf1753cc8979926423c83231e62258837cb9", + "digest": "ad42ddc9bc4e25fdc88c240d7cbfad4b2708eb7d26e07ae904d258011141116e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/autoscaler", - "digest": "8319aa662b4912e8175018bd7cc90c63838562a27515197b803bdcd5634c7007", + "digest": "66aa0dbceee62691d5327e423bbd7cbd411903747adeab61fdc81b14590793d4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/controller", - "digest": "98a2cc7fd62ee95e137116504e7166c32c65efef42c3d1454630780410abf943", + "digest": "e5b7b6edd265b66d32f424bd245c06455154462ade6ce05698472212248d5657", "type": "image" }, { @@ -55,97 +59,97 @@ }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "eeff0ad31550f3ff519d988bb36bfe214e5b60c1ec4349c1f9bb2b2d8cad9479", "type": "image" }, { "package": 
"gcr.io/knative-releases/knative.dev/serving/cmd/webhook", - "digest": "4305209ce498caf783f39c8f3e85dfa635ece6947033bf50b0b627983fd65953", + "digest": "48aee2733721ecc77956abc5a2ca072853a669ebc97519beb48f7b3da8455e67", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "bc91e1fdaf3b67876ca33de1ce15b1268ed0ca8da203102b7699286fae97cf58", + "digest": "232d6ffd88dfc0d0ec02c6f3a95520283d076c16b77543cee04f4ef276e0b7ae", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/serving/cmd/queue", - "digest": "dabaecec38860ca4c972e6821d5dc825549faf50c6feb8feb4c04802f2338b8a", - "type": "image" - }, - { - "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/webhook", - "digest": "bfa1dfea77aff6dfa7959f4822d8e61c4f7933053874cd3f27352323e6ecd985", + "digest": "89e6f90141f1b63405883fbb4de0d3b6d80f8b77e530904c4d29bdcd1dc5a167", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/net-istio/cmd/controller", - "digest": "421aa67057240fa0c56ebf2c6e5b482a12842005805c46e067129402d1751220", + "digest": "5782b4a6b1a106d7cafe77d044b30905a9fecbbd2e0029946cb8a4b3507b40a4", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/pkg/apiextensions/storageversion/cmd/migrate", - "digest": "56780f69e6496bb4790b0c147deb652a2b020ff81e08d58cc58a61cd649b1121", + "digest": "d438c3ad2fcef3c7ea1b3abb910f5fa911c8a1466d6460ac0b11bf034797d6f6", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/mtchannel_broker", - "digest": "4040ffc2d34e950b7969b4ba90cec29e65e506126ddb195faf3a56cb2fa653e8", + "digest": "9dc9e0b00325f1ec994ef6f48761ba7d9217333fa0c2cbfccfa9b204e3f616a9", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/ingress", - "digest": "7f3b05f6e0abae19e9438fac44dd9938ddd2293014ef0fb8d388450c9ff63000", + "digest": "65412cf797d0bb7c7e22454431f57f8d9dcedf93620769f4c1206947acf05abb", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/broker/filter", - "digest": "29bd9f43359153c0ea39cf382d5f25ca43f55abbbce3d802ca37cc4d5c4a6942", + "digest": "4e3cf0703024129c60b66529f41a1d29310f61f6aced24d25fd241e43b1a2e8e", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_dispatcher", - "digest": "521234b4cff9d3cd32f8264cd7c830caa06f9982637b4866e983591fa1abc418", + "digest": "fa64db1ad126874f4e5ce1c17c2414b0fc3dde2a7e0db6fde939cafdbd4d96cd", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/in_memory/channel_controller", - "digest": "e004174a896811aec46520b1f2857f1973762389426bb0e0fc5d2332d5e36c7a", + "digest": "5386029f1fdcce1398dcca436864051a2f7eb5abed176453104f41b7b9b587f9", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/webhook", - "digest": "ebf93652f0254ac56600bedf4a7d81611b3e1e7f6526c6998da5dd24cdc67ee1", + "digest": "cd577cb977a2830b29bb799cf146bbffe0241d65eef1c680ec158af97b18d4fa", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/mtping", - "digest": "6d35cc98baa098fc0c5b4290859e363a8350a9dadc31d1191b0b5c9796958223", + "digest": "9d74e8c69d671ad10fdfd84d33569fde5c16c9f95824ea288d2cb6fd69e32f4d", "type": "image" }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" 
}, { "package": "quay.io/jetstack/cert-manager-controller", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-cainjector", - "tag": "v1.12.2", + "tag": "v1.14.5", "type": "image" }, { "package": "quay.io/jetstack/cert-manager-webhook", - "tag": "v1.12.2", + "tag": "v1.14.5", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", "type": "image" }, { @@ -155,7 +159,7 @@ }, { "package": "docker.io/istio/pilot", - "tag": "1.17.5", + "tag": "1.22.1", "type": "image" }, { @@ -165,7 +169,7 @@ }, { "package": "gcr.io/knative-releases/knative.dev/eventing/cmd/controller", - "digest": "92967bab4ad8f7d55ce3a77ba8868f3f2ce173c010958c28b9a690964ad6ee9b", + "digest": "7579c5a8b1dee07c382120a8bc1a6594aea4519d0cf652989f5d9a675b11a0de", "type": "image" }, @@ -179,9 +183,14 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/kubeflowkatib/pytorch-mnist", + "tag": "v1beta1-45c5727", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/jupyter-scipy", - "tag": "v1.6.1", + "tag": "v1.9.2", "type": "image" }, { @@ -189,6 +198,36 @@ "tag": "latest", "type": "image" }, + { + "package": "docker.io/busybox", + "tag": "1.28", + "type": "image" + }, + { + "package": "docker.io/busybox", + "tag": "1.34.1", + "type": "image" + }, + { + "package": "docker.io/bentoml/fraud_detection", + "tag": "o5smnagbncigycvj", + "type": "image" + }, + { + "package": "docker.io/istio/install-cni", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/pilot", + "tag": "1.22.1", + "type": "image" + }, + { + "package": "docker.io/istio/proxyv2", + "tag": "1.22.1", + "type": "image" + }, { "package": "gcr.io/kubeflow-images-public/profile-controller", "tag": "v20190228-v0.4.0-rc.1-192-g1a802656-dirty-f95773", @@ -196,7 +235,7 @@ }, { "package": "docker.io/seldonio/seldon-core-operator", - "tag": "1.17.1", + "tag": "1.18.1", "type": "image" }, { @@ -206,7 +245,7 @@ }, { "package": "docker.io/rayproject/ray", - "tag": "2.2.0-py38-cpu", + "tag": "2.23.0-py311-cpu", "type": "image" }, { @@ -216,12 +255,22 @@ }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, + { + "package": "docker.io/kserve/huggingfaceserver", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/storage-initializer", + "tag": "v0.13.1", + "type": "image" + }, { "package": "docker.io/kserve/xgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -231,7 +280,7 @@ }, { "package": "docker.io/pytorch/torchserve-kfs", - "tag": "0.8.2", + "tag": "0.9.0", "type": "image" }, { @@ -241,17 +290,17 @@ }, { "package": "docker.io/kserve/sklearnserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/pmmlserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { "package": "docker.io/kserve/paddleserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -261,7 +310,7 @@ }, { "package": "docker.io/kserve/lgbserver", - "tag": "v0.11.2", + "tag": "v0.13.1", "type": "image" }, { @@ -271,14 +320,24 @@ }, { "package": "quay.io/bentoml/yatai-image-builder", - "tag": "1.1.3", + "tag": "1.2.28", "type": "image" }, + { + "package": "quay.io/bentoml/yatai-deployment", + "tag": "1.1.21", + "type": "image" + }, { "package": "quay.io/oauth2-proxy/oauth2-proxy", "tag": "latest", "type": "image" }, + { + "package": "quay.io/oauth2-proxy/oauth2-proxy", + "tag": "v7.6.0", + "type": "image" + }, { 
"package": "docker.io/prom/prometheus", "tag": "latest", @@ -306,9 +365,24 @@ }, { "package": "docker.io/postgres", - "tag": "12-alpine", + "tag": "14.5-alpine", "type": "image" }, + { + "package": "docker.io/postgres", + "tag": "12-alpine", + "type": "image" + }, + { + "package": "docker.io/postgres", + "tag": "14.7-alpine", + "type": "image" + }, + { + "package": "docker.io/postgres", + "tag": "latest", + "type": "image" + }, { "package": "quay.io/argoproj/argocli", "tag": "latest", @@ -336,7 +410,7 @@ }, { "package": "gcr.io/tfx-oss-public/ml_metadata_store_server", - "tag": "1.5.0", + "tag": "1.14.0", "type": "image" }, { @@ -349,9 +423,14 @@ "tag": "nightly", "type": "image" }, + { + "package": "quay.io/aipipeline/pipelineloop-controller", + "tag": "nightly", + "type": "image" + }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.4.0", + "tag": "v0.11.0", "type": "image" }, { @@ -361,87 +440,87 @@ }, { "package": "docker.io/kubeflowkatib/enas-cnn-cifar10-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/pytorch-mnist-cpu", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/file-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/tfevent-metrics-collector", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-hyperopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-hyperband", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-skopt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-goptuna", - "tag": "v0.16.0-rc.1", - "type": "image" - }, - { - "package": "docker.io/kubeflowkatib/suggestion-optuna", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-enas", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-darts", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/suggestion-pbt", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kubeflowkatib/earlystopping-medianstop", - "tag": "v0.16.0-rc.1", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", "type": "image" }, { "package": "gcr.io/ml-pipeline/frontend", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/visualization-server", - "tag": "2.0.3", + "tag": "2.3.0", + "type": "image" + }, + { + "package": "docker.io/kubeflownotebookswg/poddefaults-webhook", + "tag": "v1.9.2", "type": "image" }, { @@ -451,17 +530,17 @@ }, { "package": "gcr.io/ml-pipeline/cache-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/centraldashboard", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/jupyter-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -469,6 +548,11 @@ "tag": "8.0.26", "type": "image" }, + { + "package": 
"docker.io/kubeflowkatib/katib-controller", + "tag": "v0.17.0", + "type": "image" + }, { "package": "docker.io/kubeflowkatib/katib-controller", "tag": "v0.16.0", @@ -476,7 +560,7 @@ }, { "package": "docker.io/kubeflowkatib/katib-db-manager", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { @@ -486,17 +570,22 @@ }, { "package": "docker.io/kubeflowkatib/katib-ui", - "tag": "v0.16.0", + "tag": "v0.17.0", "type": "image" }, { "package": "docker.io/kserve/kserve-controller", - "tag": "v0.11.1", + "tag": "v0.13.1", + "type": "image" + }, + { + "package": "docker.io/kserve/models-web-app", + "tag": "v0.13.0-rc.0", "type": "image" }, { "package": "docker.io/kserve/models-web-app", - "tag": "v0.10.0", + "tag": "latest", "type": "image" }, { @@ -511,7 +600,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-envoy", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -521,7 +610,7 @@ }, { "package": "gcr.io/ml-pipeline/metadata-writer", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { @@ -531,52 +620,52 @@ }, { "package": "gcr.io/ml-pipeline/api-server", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/persistenceagent", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "gcr.io/ml-pipeline/scheduledworkflow", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboard-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflow/training-operator", - "tag": "v1-855e096", + "tag": "v1-04f9f13", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/notebook-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/kfam", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/profile-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/kubebuilder/kube-rbac-proxy", - "tag": "v0.13.1", + "tag": "v0.4.0", "type": "image" }, { @@ -586,17 +675,12 @@ }, { "package": "docker.io/kubeflownotebookswg/tensorboards-web-app", - "tag": "v1.8.0", - "type": "image" - }, - { - "package": "docker.io/mariadb", - "tag": "latest", + "tag": "v1.9.2", "type": "image" }, { "package": "docker.io/kubeflownotebookswg/pvcviewer-controller", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { @@ -604,16 +688,26 @@ "tag": "v3.3.10-license-compliance", "type": "image" }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.3.8-license-compliance", + "type": "image" + }, + { + "package": "gcr.io/ml-pipeline/workflow-controller", + "tag": "v3.4.17-license-compliance", + "type": "image" + }, { "package": "docker.io/kubeflownotebookswg/volumes-web-app", - "tag": "v1.8.0", + "tag": "v1.9.2", "type": "image" }, { "package": "gcr.io/ml-pipeline/viewer-crd-controller", - "tag": "2.0.3", + "tag": "2.3.0", "type": "image" } ] } - } + } \ No newline at end of file diff --git a/input/config/ubuntu/22.04/openldap.json b/input/config/ubuntu/22.04/openldap.json index baa252923..910b9b78f 100644 --- a/input/config/ubuntu/22.04/openldap.json +++ b/input/config/ubuntu/22.04/openldap.json @@ -9,7 +9,8 @@ { "package": "ansible-role-ldaptoolbox-openldap", "type": "git", "url": "https://github.com/ltb-project/ansible-role-ldaptoolbox-openldap.git", - "version": "main" + "version": "main", 
+ "commit": "695a689ff91a83b47fbc6f575be37e1f811bd719" } ] } diff --git a/input/config/ubuntu/22.04/pytorch.json b/input/config/ubuntu/22.04/pytorch.json index 50950a07e..6312b383b 100644 --- a/input/config/ubuntu/22.04/pytorch.json +++ b/input/config/ubuntu/22.04/pytorch.json @@ -1,46 +1,56 @@ -{ +{ - "pytorch": { + "pytorch": { "cluster": [] }, "pytorch_cpu": { - "cluster": [ - { + "cluster": [ + { "package": "docker.io/pytorch/pytorch", "tag": "latest", "type": "image" } - ] + ] }, - "pytorch_amd": { + "pytorch_amd": { - "cluster": [ - { + "cluster": [ + { "package": "docker.io/rocm/pytorch", "tag": "latest", - "type": "image" + "type": "image" } ] }, - "pytorch_nvidia": { + "pytorch_nvidia": { - "cluster": [ - { + "cluster": [ + { "package": "nvcr.io/nvidia/pytorch", "tag": "23.12-py3", "type": "image" - + }, { "package": "nvidia-container-toolkit", - "type": "deb", + "type": "deb", "repo_name": "nvidia-container-toolkit" } ] - } + }, + "pytorch_gaudi": { + + "cluster": [ + { + "package": "vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0", + "tag": "latest", + "type": "image" + } + ] +} } \ No newline at end of file diff --git a/input/config/ubuntu/22.04/secure_login_node.json b/input/config/ubuntu/22.04/secure_login_node.json index 1b4b02104..d0b24144b 100644 --- a/input/config/ubuntu/22.04/secure_login_node.json +++ b/input/config/ubuntu/22.04/secure_login_node.json @@ -21,13 +21,13 @@ {"package": "tar", "type": "deb", "repo_name": "jammy"}, {"package": "wget", "type": "deb", "repo_name": "jammy"}, {"package": "acct", "type": "deb", "repo_name": "jammy"}, - {"package": "python3.9", "type": "deb", "repo_name": "deadsnake-ppa"}, + {"package": "python3.11", "type": "deb", "repo_name": "deadsnake-ppa"}, { - "package": "ansible==7.7.0", + "package": "ansible==9.5.1", "type": "pip_module" }, { - "package": "cryptography==41.0.7", + "package": "cryptography==44.0.0", "type": "pip_module" }, { diff --git a/input/config/ubuntu/22.04/telemetry.json b/input/config/ubuntu/22.04/telemetry.json index d944c83df..387603c2b 100644 --- a/input/config/ubuntu/22.04/telemetry.json +++ b/input/config/ubuntu/22.04/telemetry.json @@ -11,11 +11,6 @@ "type": "deb", "repo_name": "jammy" }, - { - "package": "containerd.io=1.6.20-1", - "type": "deb", - "repo_name": "docker-ce-repo" - }, { "package": "docker-ce-cli=5:20.10.20~3-0~ubuntu-jammy", "type": "deb", @@ -32,45 +27,45 @@ "repo_name": "docker-ce-repo" }, { - "package": "kubespray", + "package": "kubespray-v2.25.0", "type": "git", "url": "https://github.com/kubernetes-sigs/kubespray.git", - "version": "release-2.23" + "version": "v2.25.0" }, { - "package": "kubectl", + "package": "kubectl-1.29.5", "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubectl" + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" }, { - "package": "kubelet", + "package": "kubelet-1.29.5", "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubelet" + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" }, { - "package": "kubeadm", + "package": "kubeadm-1.29.5", "type": "tarball", - "url": "https://dl.k8s.io/release/v1.26.12/bin/linux/amd64/kubeadm" + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" }, { - "package": "calicoctl-v3.25.2", + "package": "calicoctl-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/releases/download/v3.25.2/calicoctl-linux-amd64" + "url": 
"https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" }, { - "package": "calicocrds-v3.25.2", + "package": "calicocrds-v3.27.3", "type": "tarball", - "url": "https://github.com/projectcalico/calico/archive/v3.25.2.tar.gz" + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" }, { - "package": "cri-tools-v1.26.1", + "package": "cri-tools-v1.29.0", "type": "tarball", - "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.1/crictl-v1.26.1-linux-amd64.tar.gz" + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" }, { - "package": "etcd-v3.5.10", + "package": "etcd-v3.5.12", "type": "tarball", - "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.10/etcd-v3.5.10-linux-amd64.tar.gz" + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" }, { "package": "cni-plugins-v1.3.0", @@ -78,24 +73,24 @@ "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" }, { - "package": "runc.amd64", + "package": "runc.amd64-v1.1.12", "type": "tarball", - "url": "https://github.com/opencontainers/runc/releases/download/v1.1.9/runc.amd64" + "url": "https://github.com/opencontainers/runc/releases/download/v1.1.12/runc.amd64" }, { - "package": "nerdctl-v1.5.0", + "package": "nerdctl-v1.7.4", "type": "tarball", - "url": "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" }, { - "package": "containerd-1.7.5", + "package": "containerd-1.7.16", "type": "tarball", - "url": "https://github.com/containerd/containerd/releases/download/v1.7.5/containerd-1.7.5-linux-amd64.tar.gz" + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" }, { - "package": "helm-v3.12.3", + "package": "helm-v3.14.2", "type": "tarball", - "url": "https://get.helm.sh/helm-v3.12.3-linux-amd64.tar.gz" + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" }, { "package": "nfs-subdir-external-provisioner-4.0.18", @@ -109,7 +104,7 @@ }, { "package": "registry.k8s.io/coredns/coredns", - "tag": "v1.9.3", + "tag": "v1.11.1", "type": "image" }, { @@ -124,22 +119,22 @@ }, { "package": "registry.k8s.io/kube-apiserver", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { "package": "registry.k8s.io/kube-controller-manager", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { "package": "registry.k8s.io/kube-proxy", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { "package": "registry.k8s.io/kube-scheduler", - "tag": "v1.26.12", + "tag": "v1.29.5", "type": "image" }, { @@ -149,27 +144,27 @@ }, { "package": "quay.io/coreos/etcd", - "tag": "v3.5.10", + "tag": "v3.5.12", "type": "image" }, { "package": "quay.io/calico/cni", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/kube-controllers", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/pod2daemon-flexvol", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { "package": "quay.io/calico/node", - "tag": "v3.25.2", + "tag": "v3.27.3", "type": "image" }, { @@ -194,18 +189,78 @@ }, { "package": "docker.io/kubernetesui/dashboard", - "tag": "v2.2.0", + "tag": "v2.7.0", "type": "image" }, { "package": 
"docker.io/kubernetesui/metrics-scraper", - "tag": "v1.0.6", + "tag": "v1.0.8", "type": "image" - }, + }, { "package": "docker.io/grafana/grafana-enterprise", "tag": "8.3.2", "type": "image" + }, + { + "package": "kube-prometheus-stack-62.3.0", + "type": "tarball", + "url": "https://github.com/prometheus-community/helm-charts/releases/download/kube-prometheus-stack-62.3.0/kube-prometheus-stack-62.3.0.tgz" + }, + { + "package": "quay.io/prometheus-operator/prometheus-operator", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-state-metrics/kube-state-metrics", + "tag": "v2.13.0", + "type": "image" + }, + { + "package": "quay.io/prometheus-operator/prometheus-config-reloader", + "tag": "v0.76.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/alertmanager", + "tag": "v0.27.0", + "type": "image" + }, + { + "package": "quay.io/prometheus/node-exporter", + "tag": "v1.8.2", + "type": "image" + }, + { + "package": "quay.io/prometheus/prometheus", + "tag": "v2.54.0", + "type": "image" + }, + { + "package": "registry.k8s.io/ingress-nginx/kube-webhook-certgen", + "tag": "v20221220-controller-v1.5.1-58-g787ea74b6", + "type": "image" + }, + { + "package": "metric-exporter-daemonset", + "type": "manifest", + "url": "https://vault.habana.ai/artifactory/gaudi-metric-exporter/yaml/1.18.0/metric-exporter-daemonset.yaml" + }, + { + "package": "metric-exporter-service", + "type": "manifest", + "url": "https://vault.habana.ai/artifactory/gaudi-metric-exporter/yaml/1.18.0/metric-exporter-service.yaml" + }, + { + "package": "metric-exporter-serviceMonitor", + "type": "manifest", + "url": "https://vault.habana.ai/artifactory/gaudi-metric-exporter/yaml/1.18.0/metric-exporter-serviceMonitor.yaml" + }, + { + "package": "vault.habana.ai/gaudi-metric-exporter/metric-exporter", + "tag": "1.18.0-524", + "type": "image" } ] } diff --git a/input/config/ubuntu/22.04/vllm.json b/input/config/ubuntu/22.04/vllm.json index 0957956d6..f7e985eea 100644 --- a/input/config/ubuntu/22.04/vllm.json +++ b/input/config/ubuntu/22.04/vllm.json @@ -20,21 +20,25 @@ "cluster": [ { - "package": "python3.9", + "package": "python3.11", "type": "deb", "repo_name": "deadsnake-ppa" }, { - "package": "https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp39-cp39-linux_x86_64.whl", + "package": "https://download.pytorch.org/whl/cu121/torch-2.1.2%2Bcu121-cp311-cp311-linux_x86_64.whl", "type": "pip_module" }, { - "package": "https://github.com/vllm-project/vllm/releases/download/v0.4.0/vllm-0.4.0-cp39-cp39-manylinux1_x86_64.whl", + "package": "https://github.com/vllm-project/vllm/releases/download/v0.4.0/vllm-0.4.0-cp311-cp311-manylinux1_x86_64.whl", "type": "pip_module" - } + }, + { + "package": "numpy<2", + "type": "pip_module" + } ] } -} \ No newline at end of file +} diff --git a/input/k8s_access_config.yml b/input/k8s_access_config.yml index 1b6451dc0..ab6ce09e0 100644 --- a/input/k8s_access_config.yml +++ b/input/k8s_access_config.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,12 +13,12 @@ # limitations under the License. --- -#*********************************************************************** +# *********************************************************************** # DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. 
# SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # This variable accepts the usernames for which k8s access needs to be setup # Eg1. user_name: "user1" # Eg2. user_name: "user1,user2,user3" -user_name: "" \ No newline at end of file +user_name: "" diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 473659d53..bdc6a354f 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -13,23 +13,23 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # Mandatory # All the offline repository data for omnia stack will be stored at this path. # Please make sure assigned partition has enough space. # Ensure 755 permission is given to repo_store_path if user intends to use nfs share mount for repo_store_path -# Default: /omnia_repo -repo_store_path: "/omnia_repo" +# Default: /opt/omnia_repo +repo_store_path: "/opt/omnia_repo" # Optional # This variable accepts the repository urls of the user which contains the packages required for the cluster. -# always: In this case, Omnia creates a local repo on the Control plane hosting all the packages required for the cluster. +# always: In this case, Omnia creates a local repo on the Omnia Infrastructure Manager hosting all the packages required for the cluster. # User should make sure required disk space is available. -# partial: In this case, Omnia creates a local repo on the Control Plane hosting packages not part of user's repository. +# partial: In this case, Omnia creates a local repo on the Omnia Infrastructure Manager hosting packages not part of user's repository. # never: In this case, Omnia does not create a local repo. All the packages are directly downloaded on the cluster. # This variable accepts repo url and gpgkey # url: defines the baseurl for the repository @@ -46,9 +46,9 @@ user_repo_url: # Optional # This variable accepts the registry url along with port of the user which contains the images required for cluster. -# always: In this case, Omnia creates a local registry on the Control plane hosting all the images required for the cluster. +# always: In this case, Omnia creates a local registry on the Omnia Infrastructure Manager hosting all the images required for the cluster. # User should make sure required disk space is available. -# partial: In this case, Omnia creates a local registry on the Control Plane hosting images not part of user's registry. +# partial: In this case, Omnia creates a local registry on the Omnia Infrastructure Manager hosting images not part of user's registry. # never: In this case, Omnia does not create a local registry. All the images are directly downloaded on the cluster. 
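As a concrete illustration of the url/gpgkey format described in the input/local_repo_config.yml hunk above, a populated user_repo_url entry might look like the following sketch; the mirror URL and key location are hypothetical placeholders:

```yaml
# Hypothetical mirror; gpgkey may be left "" if the repository is unsigned
user_repo_url:
  - { url: "http://mirror.example.com/rhel8/custom/", gpgkey: "http://mirror.example.com/rhel8/RPM-GPG-KEY-custom" }
```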
# This variable accepts host and cert_path # host: defines the url and port for registry @@ -66,15 +66,15 @@ user_registry: # When repo_config is always, partial or never, the given ubuntu_os_url configured via proxy in compute nodes # Online ubuntu_os_url for Ubuntu 22.04 is http://in.archive.ubuntu.com/ubuntu # Online ubuntu_os_url for Ubuntu 20.04 is http://archive.ubuntu.com/ubuntu -# Example: +# Example: # When cluster_os_type is Ubuntu 22.04 # ubuntu_os_url: "http://in.archive.ubuntu.com/ubuntu" ubuntu_os_url: # Mandatory when cluster_os_type is rhel in softwares_config.json # This variable will be ignored when cluster_os_type is ubuntu or rocky -# User has to provide the code ready builder url inorder to download the packages -# When repo_config is always, the given rhel_os_url will be configured in the control plane and packages required for cluster will be downloaded +# User has to provide a code ready builder url that does not require Red Hat subscription authentication in order to download the packages +# When repo_config is always, the given rhel_os_url will be configured in the Omnia Infrastructure Manager and packages required for cluster will be downloaded # When repo_config is partial or never, the packages required for cluster which were coming from rhel_repo_url will not be downloaded. # and the rhel_os_url configured via proxy in compute nodes # Example: @@ -99,7 +99,7 @@ omnia_repo_url_rhel: - { url: "https://ltb-project.org/rpm/openldap25/$releasever/$basearch", gpgkey: ""} - { url: "https://nvidia.github.io/libnvidia-container/stable/rpm/$basearch", gpgkey: "https://nvidia.github.io/libnvidia-container/gpgkey"} - { url: "https://a2o.github.io/snoopy-packages/repo/centos/8/stable/", gpgkey: ""} - + # Mandatory # This variable defines all the repo urls from where rpms will be downloaded for omnia features when cluster_os_type is rocky. # Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. @@ -134,3 +134,4 @@ omnia_repo_url_ubuntu: - { url: "https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /", gpgkey: "https://nvidia.github.io/libnvidia-container/gpgkey" } - { url: "http://ppa.launchpad.net/deadsnakes/ppa/ubuntu {{ os_release }} main", gpgkey: "" } - { url: "https://a2o.github.io/snoopy-packages/repo/ubuntu {{ os_release }} stable", publickey: "https://a2o.github.io/snoopy-packages/snoopy-packages-key.pub" } + - { url: "https://vault.habana.ai/artifactory/debian {{ os_release }} main", publickey: "https://vault.habana.ai/artifactory/api/gpg/key/public" } diff --git a/input/login_node_security_config.yml b/input/login_node_security_config.yml index b5861c5be..9f24db570 100644 --- a/input/login_node_security_config.yml +++ b/input/login_node_security_config.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
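Likewise, the user_registry variable described at the top of that hunk accepts host and cert_path pairs; a filled-in sketch (registry host, port, and certificate path are all hypothetical) might be:

```yaml
# Hypothetical private registry secured with a self-signed CA certificate
user_registry:
  - { host: "registry.example.com:5000", cert_path: "/home/user/registry-ca.crt" }
```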
-#*********************************************************************** +# *********************************************************************** # Maximum number of consecutive failures before lockout # The default value of this variable can't be changed @@ -29,7 +29,7 @@ max_failures: 3 # Max: 60 failure_reset_interval: 60 -# Period (in seconds) for which users are locked out +# Period (in seconds) for which users are locked out # Default value: 10 # Min: 5 # Max: 10 @@ -43,11 +43,21 @@ session_timeout: 180 # Email address used for sending alerts in case of authentication failure # If this variable is left blank, authentication failure alerts will be disabled. -# Currently, only one email ID is accepted in this field +# Multiple email addresses can be provided as comma-separated values +# Example: alert_email_address: "user1@domain.com,user2@domain.com" alert_email_address: "" +# This variable will be applicable only when alert_email_address is provided +# SMTP server details in the cluster used to send email alerts +# Only a single SMTP server configuration is supported +# Example: +# smtp_server: +# - { host: "smtp-server.domain.com", port: "25", sender_address: "alert@domain.com" } +smtp_server: + - { host: "", port: "", sender_address: "" } + # This variable mentions the users to whom the access will be provided -# format of user shall be username@ip or username +# format of user shall be username@ip or username # Ex1- root@1.2.3.4 Ex2- root Ex3- root@1.2.3.4 root (if multiple user, provide space seperated values) by default empty user: "" @@ -58,8 +68,8 @@ user: "" allow_deny: "allow" # This variable is used to disable services. -# Accepted values: "true" or "false". -# Default value: false +# Accepted values: "true" or "false". +# Default value: false # Root access is needed. restrict_program_support: false diff --git a/input/network_config.yml b/input/network_config.yml index 48671c1de..413dce707 100644 --- a/input/network_config.yml +++ b/input/network_config.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # Absolute path to local copy of .tgz file containing mlnx_ofed package. # The package can be downloaded from https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ @@ -28,8 +28,8 @@ mlnx_ofed_offline_path: "" # Default value: 5.4-2.4.1.3 mlnx_ofed_version: 5.4-2.4.1.3 -# Set this variable to true if kernel version currently available on compute nodes is -# not compatible with mlnx_ofed version in use. +# Set this variable to true if kernel version currently available on compute nodes is +# not compatible with mlnx_ofed version in use. 
# Mandatory variable # Default value: true mlnx_ofed_add_kernel_support: true diff --git a/input/network_spec.yml b/input/network_spec.yml index a37d1e656..d396bacb4 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -21,21 +21,21 @@ network_gateway: "" MTU: "1500" -#********************************************************************** +# ********************************************************************** # Following are the templates for providing additional network details -# If vlan creation is required ensure vlan name is provided in the format NIC.vlan_id(eth1.101) in server_spec.yml -#********************************************************************** +# If vlan creation is required ensure vlan name is provided in the format NIC.vlan_id(eth1.101) in server_spec.yml +# ********************************************************************** -# - thor_network1: +# - nic_network1: # netmask_bits: "20" # CIDR: "10.10.16.0" # network_gateway: "" # MTU: "1500" # VLAN: "" # -# - thor_network2: +# - nic_network2: # netmask_bits: "20" # static_range: "10.10.1.1-10.10.15.254" # network_gateway: "" # MTU: "1500" -# VLAN: "1" \ No newline at end of file +# VLAN: "1" diff --git a/input/omnia_config.yml b/input/omnia_config.yml index 3357087fa..cf5add78c 100644 --- a/input/omnia_config.yml +++ b/input/omnia_config.yml @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # Path to directory hosting ansible config file (ansible.cfg file) # Default value is "/etc/ansible" @@ -43,7 +43,7 @@ slurm_installation_type: "configless" # Default value is true restart_slurm_services: true -#----------------------------K8S------------------------------------------------------ +# ----------------------------K8S------------------------------------------------------ # Kubernetes SDN network. # It can either be "calico" or "flannel". @@ -57,7 +57,6 @@ k8s_cni: "calico" # Mandatory Field pod_external_ip_range: "" -###------ADVANCE CONFIGURATIONS FOR KUBERNETES------ # Kubernetes internal network for services. # This network must be unused in your network infrastructure. # Default value is "10.233.0.0/18" @@ -68,3 +67,33 @@ k8s_service_addresses: "10.233.0.0/18" # This network must be unused in your network infrastructure. # Default value is "10.233.64.0/18" k8s_pod_network_cidr: "10.233.64.0/18" + +# Kubernetes Topology Manager policy. +# It can be one of "none", "best-effort", "restricted", or "single-numa-node". +# Default value assigned is "none". +topology_manager_policy: "none" + +# Kubernetes Topology Manager scope. +# It can either be "container" or "pod". +# Default value assigned is "container". +topology_manager_scope: "container" + +# ----------------------------VERIFY INTEL GAUDI INSTALLATION------------------------------------------------------ + +# It's recommended to run the extensive hl_qual and hccl tests when installing a new Gaudi node. This takes around 20 minutes. +# To run them during provisioning, set this variable to true. 
+# By default, no tests are run +run_intel_gaudi_tests: false + +# ----------------------------CSI Driver------------------------------------------------------ +# Following csi powerscale driver input variables are mandatory only if csi_driver_powerscale entry is present in software_config.json + +# Absolute file path for the secret.yaml file. +# User needs to download the secret.yaml file and fill in the required data. Provide the path of the secret file here. + +csi_powerscale_driver_secret_file_path: "" + +# File path for the values.yaml file which will contain the Powerscale driver configuration parameters. +# User needs to download the values.yaml file and fill in the required data. Provide the path of the values.yaml file here. +# Mention the configurable values in the values.yaml file. +csi_powerscale_driver_values_file_path: "" diff --git a/input/passwordless_ssh_config.yml b/input/passwordless_ssh_config.yml index cf408605e..673440418 100644 --- a/input/passwordless_ssh_config.yml +++ b/input/passwordless_ssh_config.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # This variable accepts the user name for which passwordless ssh needs to be setup # Eg. user_name: "user1,user2,user3" @@ -24,5 +24,5 @@ user_name: "" # Variable indicating whether FreeIPA or LDAP is setup # It can be "freeipa" or "ldap" -# Default value: freeipa +# Default value: ldap authentication_type: "ldap" diff --git a/input/provision_config.yml b/input/provision_config.yml index 8976a781c..fa9623ed0 100644 --- a/input/provision_config.yml +++ b/input/provision_config.yml @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** +# *********************************************************************** # DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** #### Mandatory # Path where user has placed the iso image that needs to be provisioned on target nodes. @@ -89,7 +89,7 @@ disk_partition: #### Mandatory # Timezone that needs to be set during OS provisioning. 
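For the new CSI PowerScale inputs in the input/omnia_config.yml hunk above, a filled-in sketch might look as follows; both paths are hypothetical and simply point at the downloaded, edited files:

```yaml
# Hypothetical absolute paths on the Omnia Infrastructure Manager
csi_powerscale_driver_secret_file_path: "/root/csi-powerscale/secret.yaml"
csi_powerscale_driver_values_file_path: "/root/csi-powerscale/values.yaml"
```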
-# Available timezones are provided in provision/roles/provision_validation/files/timezone.txt +# Available timezones are provided in discovery/roles/discovery_validations/common/files/timezone.txt # Default: "GMT" # Few accepted values: EST,CET,MST,CST6CDT,PST8PDT timezone: "GMT" @@ -105,4 +105,12 @@ language: "en-US" # Min: 21600 # Default: 86400 # Max: 31536000 -default_lease_time: "86400" \ No newline at end of file +default_lease_time: "86400" + +#### Mandatory +# The ntp_support variable controls whether the cluster will have a Network Time Protocol (NTP) server configured in the Omnia Infrastructure Manager. +# If ntp_support is set to true, NTP server will be configured in the Omnia Infrastructure Manager and the time will be synchronized to the cluster nodes. +# If ntp_support is set to false, NTP server will not be configured in the Omnia Infrastructure Manager and the time will not be synchronized to the cluster nodes. +# In a proxy environment or environment with restricted network access, setting up NTP server in Omnia Infrastructure Manager can result in failure due to unreachable public NTP pools and is not recommended. +# Default: true +ntp_support: true diff --git a/input/provision_config_credentials.yml b/input/provision_config_credentials.yml index 7c2e33438..198adb654 100644 --- a/input/provision_config_credentials.yml +++ b/input/provision_config_credentials.yml @@ -49,4 +49,4 @@ docker_username: "" # Password for Dockerhub account # This will be used for Docker login # This value is mandatory if docker username is provided -docker_password: "" \ No newline at end of file +docker_password: "" diff --git a/input/roce_plugin_config.yml b/input/roce_plugin_config.yml index cab5f5562..91c146344 100644 --- a/input/roce_plugin_config.yml +++ b/input/roce_plugin_config.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -35,47 +35,47 @@ interfaces: range: 192.168.1.0/24 range_start: range_end: - gateway: 192.168.1.1 - route: 192.168.1.0/24 + gateway: + route: - name: eth2 range: 192.168.2.0/24 range_start: range_end: - gateway: + gateway: route: - name: eth3 range: 192.168.3.0/24 range_start: range_end: - gateway: + gateway: route: - name: eth4 range: 192.168.4.0/24 range_start: range_end: - gateway: + gateway: route: - name: eth5 range: 192.168.5.0/24 range_start: range_end: - gateway: + gateway: route: - name: eth6 range: 192.168.6.0/24 range_start: range_end: - gateway: + gateway: route: - name: eth7 range: 192.168.7.0/24 range_start: range_end: - gateway: + gateway: route: - name: eth8 range: 192.168.8.0/24 range_start: range_end: - gateway: - route: \ No newline at end of file + gateway: + route: diff --git a/input/security_config.yml b/input/security_config.yml index 24455d509..7cd7c6676 100644 --- a/input/security_config.yml +++ b/input/security_config.yml @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
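The input/roce_plugin_config.yml hunk above blanks out the pre-filled gateway and route defaults; a fully populated interface entry, reusing the removed example values, might look like this sketch (all addresses hypothetical):

```yaml
# One RoCE interface with an explicit address range and static routing
interfaces:
  - name: eth1
    range: 192.168.1.0/24
    range_start: 192.168.1.10
    range_end: 192.168.1.100
    gateway: 192.168.1.1
    route: 192.168.1.0/24
```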
-#*********************************************************************** +# *********************************************************************** # This variable is used to accept the domain name the user intends to configure # This variable is a mandatory requirement of both FreeIPA and OpenLDAP Client @@ -79,7 +79,7 @@ openldap_config_password: "" openldap_monitor_password: "" # OpenLDAP server is configured using organizations -# These organizations and its units are necessary for user creation and group mapping +# These organizations and its units are necessary for user creation and group mapping # Eg. openldap_organization: "omnia", openldap_organizational_unit: "People" # Here user belongs to Omnia organization and is part of People unit # The default values are openldap_organization: "omnia", openldap_organizational_unit: "People" diff --git a/input/server_spec.yml b/input/server_spec.yml index 2c2cbb504..7b275a683 100644 --- a/input/server_spec.yml +++ b/input/server_spec.yml @@ -1,17 +1,43 @@ --- Categories: - group-1: - - network: - - ensp0: - nicnetwork: "thor_network1" - nictypes: "ethernet" - - ensp0.5: - nicnetwork: "thor_network2" - nictypes: "vlan" - nicdevices: "ensp0" + - network: + - ensp0: + nicnetwork: "nic_network1" + nictypes: "ethernet" + - ensp0.5: + nicnetwork: "nic_network2" + nictypes: "vlan" + nicdevices: "ensp0" + - os: + - kernel: + - cmdline: "" - group-2: - - network: - - eno1: - nicnetwork: "thor_network1" - nictypes: "ethernet" + - network: + - eno1: + nicnetwork: "nic_network1" + nictypes: "ethernet" + +# ********************************************************************** +# Following are the templates for providing additional network and OS details. +# Users may include the `os` or `network` sections individually if only one +# of them needs to be configured, or both together as well. +# +# - Use space (' ') as a delimiter in case of multiple parameters for cmdline. +# ********************************************************************** +# +# Example for configuring only network settings: +# Categories: +# - group-1: +# - network: +# - eno1: +# nicnetwork: "nic_network1" +# nictypes: "ethernet" +# +# Example for configuring only OS settings: +# Categories: +# - group-2: +# - os: +# - kernel: +# - cmdline: "iommu=pt intel_iommu=off pci=realloc=off processor.max_cstate=0 intel_idle.max_cstate=0 intel_pstate=disable" diff --git a/input/site_config.yml b/input/site_config.yml new file mode 100644 index 000000000..f48ccdcee --- /dev/null +++ b/input/site_config.yml @@ -0,0 +1,25 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# Proxy configuration for the Omnia Infrastructure Manager if it is behind a proxy environment for internet access +# If proxy details are not provided, Omnia assumes direct internet connectivity is available to the Omnia Infrastructure Manager +# The values for http_proxy and https_proxy in the proxy variable should be set as environment variables http_proxy and https_proxy in the Omnia Infrastructure Manager. +# The no_proxy environment variable should include the Omnia Infrastructure Manager hostname and the admin network IP address. +# Example of providing proxy details: +# proxy: +# - { http_proxy: "http://corporate-proxy:3128", https_proxy: "http://corporate-proxy:3128", no_proxy: "" } +# +proxy: + - { http_proxy: "", https_proxy: "", no_proxy: "" } diff --git a/input/software_config.json b/input/software_config.json index 76b294a5c..d1318a869 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -3,27 +3,32 @@ "cluster_os_version": "22.04", "repo_config": "partial", "softwares": [ - {"name": "amdgpu", "version": "6.0"}, - {"name": "bcm_roce", "version": "229.2.61.0"}, + {"name": "amdgpu", "version": "6.2.2"}, + {"name": "bcm_roce", "version": "230.2.54.0"}, {"name": "openldap"}, {"name": "nfs"}, - {"name": "k8s", "version":"1.26.12"}, + {"name": "k8s", "version":"1.29.5"}, {"name": "roce_plugin"}, {"name": "jupyter"}, {"name": "pytorch"}, - {"name": "tensorflow"} + {"name": "tensorflow"}, + {"name": "intelgaudi", "version": "1.18.0-524"} ], "bcm_roce": [ - {"name": "bcm_roce_libraries", "version": "229.2.61.0"} + {"name": "bcm_roce_libraries", "version": "230.2.54.0"} ], "amdgpu": [ - {"name": "rocm", "version": "6.0" } + {"name": "rocm", "version": "6.2.2" } + ], + "intelgaudi": [ + {"name": "intel"} ], "pytorch": [ {"name": "pytorch_cpu"}, {"name": "pytorch_amd"}, - {"name": "pytorch_nvidia"} + {"name": "pytorch_nvidia"}, + {"name": "pytorch_gaudi"} ], "tensorflow": [ {"name": "tensorflow_cpu"}, @@ -31,4 +36,4 @@ {"name": "tensorflow_nvidia"} ] -} \ No newline at end of file +} diff --git a/input/storage_config.yml b/input/storage_config.yml index 4bd07cedd..744ddaeec 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # -----------------------------NFS------------------------------------------------ @@ -26,21 +26,21 @@ # Values should be entered in JSON format only. # If mount_option values are empty, NFS client will be mounted with these values "nosuid,rw,sync,hard,intr" # Its mandatory to provide atleast one entry in nfs_client_params -# If user wants to setup NFS server on control plane "localhost" can be mentioned as server_ip or admin_nic_ip of control plane also can be provided. +# If user wants to set up the NFS server on the Omnia Infrastructure Manager, "localhost" can be mentioned as server_ip, or the admin_nic_ip of the Omnia Infrastructure Manager can also be provided. 
# For the server which must be used as k8s server share for NFS external provisioner must be given k8s_share as true # For the server which must be used as slurm share, slurm_share must be given as true # For benchmarks, either slurm_share or k8s_share will be used. Higher precedence will be given to slurm_share # Example for single mount file system: -# nfs_client_params: +# nfs_client_params: # - { server_ip: 10.5.0.101, server_share_path: "/mnt/share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: true, k8s_share: true } # Example for supporting multiple mount points: -# nfs_client_params: -# - { server_ip: localhost, server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: false, k8s_share: true } +# nfs_client_params: +# - { server_ip: localhost, server_share_path: "/mnt/share1", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard", nfs_server: true, slurm_share: false, k8s_share: true } # - { server_ip: 198.168.0.1, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false, slurm_share: true, k8s_share: true} # Example for multiple mount file system: -# nfs_client_params: -# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false , slurm_share: false, k8s_share: true} -# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false , slurm_share: true, k8s_share: false} +# nfs_client_params: +# - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false , slurm_share: false, k8s_share: true} +# - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false , slurm_share: true, k8s_share: false} nfs_client_params: - { server_ip: localhost, server_share_path: /mnt/omnia_home_share, client_share_path: /home, client_mount_options: "nosuid,rw,sync,hard,intr", nfs_server: true, slurm_share: true, k8s_share: true } diff --git a/input/telemetry_config.yml b/input/telemetry_config.yml index e7fe864eb..4717c3e96 100644 --- a/input/telemetry_config.yml +++ b/input/telemetry_config.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
-#*********************************************************************** +# *********************************************************************** # This variable is used to enable iDRAC telemetry support # Accepted values: true or false @@ -32,9 +32,24 @@ omnia_telemetry_support: false # Accepted values: true or false visualization_support: false +# This variable signifies support for k8s metric collection and Kube Prometheus deployment on kube_control_plane +# Accepted values: true or false +k8s_prometheus_support: false + +# This variable denotes the time interval at which Prometheus collects metrics from its targets +# This variable accepts input in seconds +# Default value is 15 +prometheus_scrape_interval: 15 + +# This variable signifies support for Intel Gaudi (Habana) metric collection using the Gaudi Prometheus metric exporter. +# k8s_prometheus_support must be true for this metric support. +# prometheus_gaudi_support is only available for cluster_os_type: ubuntu and cluster_os_version: 22.04 in software_config.json +# Accepted values: true or false +prometheus_gaudi_support: false + ##### BELOW VARIABLES ARE MANDATORY IF telemetry ENTRY is PRESENT in software_config.json AND EITHER idrac_telemetry_support OR omnia_telemetry_support OR visualization_support IS true -###-----CONFIGURATIONS FOR KUBERNETES ON CONTROL PLANE FOR TELEMETRY SUPPORT------ -# These addresses will be used by Loadbalancer for assigning External IPs to K8s services running on control plane +# ##-----CONFIGURATIONS FOR KUBERNETES ON Omnia Infrastructure Manager FOR TELEMETRY SUPPORT------ +# These addresses will be used by Loadbalancer for assigning External IPs to K8s services running on Omnia Infrastructure Manager # Make sure the IP range is not assigned to any node in the cluster. # If admin_nic network provided in network_spec.yml is in 10.11.0.0 network, then pod_external_ip_range should be in same netwwork like "10.11.0.60-10.11.0.70" # Acceptable formats: "10.11.0.100-10.11.0.150" , "10.11.0.0/16" @@ -47,7 +62,7 @@ pod_external_ip_range: "" # Default value assigned is "calico". k8s_cni: "calico" -###------ADVANCE CONFIGURATIONS FOR KUBERNETES------ +# ##------ADVANCE CONFIGURATIONS FOR KUBERNETES------ # Kubernetes internal network for services. # This network must be unused in your network infrastructure. # Default value is "10.233.0.0/18" diff --git a/local_repo/ansible.cfg b/local_repo/ansible.cfg index b0144abf5..217f00f3d 100644 --- a/local_repo/ansible.cfg +++ b/local_repo/ansible.cfg @@ -6,6 +6,7 @@ timeout = 180 executable = /bin/bash stdout_callback = default callbacks_enabled = profile_tasks +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 diff --git a/local_repo/local_repo.yml b/local_repo/local_repo.yml index 5d2631427..541f442b3 100644 --- a/local_repo/local_repo.yml +++ b/local_repo/local_repo.yml @@ -13,6 +13,14 @@ # limitations under the License. 
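Reading the three new toggles in the input/telemetry_config.yml hunk above together: Gaudi metric collection requires the Kube Prometheus stack, so a consistent opt-in combination might be the following sketch (values are illustrative, and prometheus_gaudi_support additionally requires Ubuntu 22.04 per the comments):

```yaml
k8s_prometheus_support: true     # deploy Kube Prometheus on the kube_control_plane
prometheus_scrape_interval: 15   # seconds between scrapes (the default)
prometheus_gaudi_support: true   # valid only because k8s_prometheus_support is true
```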
--- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + +- name: Check if package manager is not locked + ansible.builtin.import_playbook: ../utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + - name: Local Repository Playbook hosts: localhost connection: local diff --git a/local_repo/roles/configure_registry/tasks/certificates.yml b/local_repo/roles/configure_registry/tasks/certificates.yml index 7d9565578..0c747f403 100644 --- a/local_repo/roles/configure_registry/tasks/certificates.yml +++ b/local_repo/roles/configure_registry/tasks/certificates.yml @@ -26,8 +26,8 @@ - "{{ nerdctl_registry_data_dir }}" # Create self-signed certificate -# Read hostname of control plane -- name: Read hostname of control plane +# Read hostname of Omnia Infrastructure Manager +- name: Read hostname of Omnia Infrastructure Manager ansible.builtin.command: hostname changed_when: false register: hostname_result diff --git a/local_repo/roles/configure_registry/tasks/docker_login.yml b/local_repo/roles/configure_registry/tasks/docker_login.yml index 00eaaee98..00e9f8489 100644 --- a/local_repo/roles/configure_registry/tasks/docker_login.yml +++ b/local_repo/roles/configure_registry/tasks/docker_login.yml @@ -17,10 +17,13 @@ ansible.builtin.command: nerdctl login -u {{ docker_username }} -p {{ docker_password }} changed_when: true register: docker_login_output + retries: "{{ retry_count }}" + delay: "{{ delay_time }}" + until: docker_login_output.rc == 0 failed_when: false no_log: true - name: Docker login check ansible.builtin.fail: - msg: "{{ docker_login_fail_msg }}" - when: docker_login_output is failed + msg: "{{ docker_login_fail_msg }} Error: {{ docker_login_output.stderr }}" + when: docker_login_output.rc != 0 diff --git a/local_repo/roles/configure_registry/tasks/initiate_private_registry.yml b/local_repo/roles/configure_registry/tasks/initiate_private_registry.yml index 55891bb3d..4a6edb729 100644 --- a/local_repo/roles/configure_registry/tasks/initiate_private_registry.yml +++ b/local_repo/roles/configure_registry/tasks/initiate_private_registry.yml @@ -69,20 +69,15 @@ dest: "{{ containerd_certs_dir }}/{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}/ca.crt" mode: "{{ file_permission }}" -- name: Fetch control_plane hostname - ansible.builtin.command: hostname - changed_when: false - register: cp_hostname - -- name: Check /etc/hosts has control plane hostname +- name: Check /etc/hosts has Omnia Infrastructure Manager hostname ansible.builtin.command: cat "{{ hosts_file }}" changed_when: false register: hosts_file_content -- name: Update control plane hostname in /etc/hosts +- name: Update Omnia Infrastructure Manager hostname in /etc/hosts ansible.builtin.lineinfile: path: "{{ hosts_file }}" insertafter: "EOF" state: present - line: "127.0.0.1 {{ cp_hostname.stdout }}" - when: cp_hostname.stdout not in hosts_file_content.stdout + line: "127.0.0.1 {{ oim_hostname }}" + when: oim_hostname not in hosts_file_content.stdout diff --git a/local_repo/roles/configure_registry/tasks/main.yml b/local_repo/roles/configure_registry/tasks/main.yml index 13091c0ad..947cc4570 100644 --- a/local_repo/roles/configure_registry/tasks/main.yml +++ b/local_repo/roles/configure_registry/tasks/main.yml @@ -16,6 +16,10 @@ - name: Include vars for {{ ansible_distribution | lower }} ansible.builtin.include_vars: "{{ role_path }}/vars/{{ 
ansible_distribution | lower }}.yml" +- name: Configure environment variables + ansible.builtin.include_tasks: set_environment.yml + when: proxy_status + - name: Include pre-requisites ansible.builtin.include_tasks: pre_requisites_{{ ansible_distribution | lower }}.yml diff --git a/local_repo/roles/configure_registry/tasks/pre_requisites_redhat.yml b/local_repo/roles/configure_registry/tasks/pre_requisites_redhat.yml index 00b5b33da..685225e4c 100644 --- a/local_repo/roles/configure_registry/tasks/pre_requisites_redhat.yml +++ b/local_repo/roles/configure_registry/tasks/pre_requisites_redhat.yml @@ -25,6 +25,29 @@ state: absent with_items: "{{ conflicting_packages }}" +- name: Install python rpm interface + ansible.builtin.dnf: + name: python3-rpm + state: present + +- name: Gather the package facts + ansible.builtin.package_facts: + manager: rpm + no_log: true + +- name: Check whether a higher containerd.io version is installed + ansible.builtin.set_fact: + higher_containerd_version: true + when: (containerd_str in ansible_facts.packages) and (ansible_facts.packages[containerd_str][0]['version'] + is ansible.builtin.version(containerd_version, '>')) + +- name: Warning prompt if containerd is being downgraded + ansible.builtin.pause: + seconds: 20 + prompt: "Higher version of {{ containerd_str }} {{ ansible_facts.packages['containerd.io'][0]['version'] }} already present, + Proceeding with downgrade to {{ containerd_version }}." + when: higher_containerd_version is defined and higher_containerd_version + # Install Pre-requisites # Download and Install containerd - name: Download containerd.io RPM @@ -40,6 +63,7 @@ ansible.builtin.dnf: name: "{{ containerd_rpm_dest }}" state: present + allow_downgrade: true disable_gpg_check: true - name: Start containerd @@ -58,10 +82,16 @@ until: download_nerdctl is not failed retries: "{{ max_retries }}" +- name: Create nerdctl temp directory + ansible.builtin.file: + path: "{{ temp_download_dir }}/nerdctl" + state: directory + mode: "{{ directory_permissions }}" + - name: Extract nerdctl archive ansible.builtin.unarchive: src: "{{ nerdctl_archive_dest }}" - dest: "{{ temp_download_dir }}" + dest: "{{ temp_download_dir }}/nerdctl/" mode: "{{ file_permission }}" - name: Make nerdctl executable @@ -74,3 +104,37 @@ src: "{{ nerdctl_folder_dest }}" dest: "{{ nerdctl_executable_dest }}" mode: preserve + +# Download and install CNI +- name: Create /opt/cni/ directory + ansible.builtin.file: + path: "{{ cni_download_dir }}" + state: directory + mode: "{{ directory_permissions }}" + +- name: Create /opt/cni/bin/ directory + ansible.builtin.file: + path: "{{ cni_bin_download_dir }}" + state: directory + mode: "{{ directory_permissions }}" + +- name: Download CNI archive + ansible.builtin.get_url: + url: "{{ cni_url }}" + dest: "{{ cni_archive_dest }}" + mode: "{{ directory_permissions }}" + register: download_cni + until: download_cni is not failed + retries: "{{ max_retries }}" + +- name: Extract cni archive + ansible.builtin.unarchive: + src: "{{ cni_archive_dest }}" + dest: "{{ cni_bin_download_dir }}" + mode: "{{ directory_permissions }}" + +- name: Set execute permissions for extracted CNI files + ansible.builtin.file: + path: "{{ cni_bin_download_dir }}" + recurse: true + mode: "{{ cni_mode }}" diff --git a/local_repo/roles/configure_registry/tasks/pre_requisites_ubuntu.yml b/local_repo/roles/configure_registry/tasks/pre_requisites_ubuntu.yml index 3bd3204f2..d42127fe5 100644 --- a/local_repo/roles/configure_registry/tasks/pre_requisites_ubuntu.yml +++ 
b/local_repo/roles/configure_registry/tasks/pre_requisites_ubuntu.yml @@ -24,9 +24,33 @@ until: download_containerd is not failed retries: "{{ max_retries }}" +- name: Install python apt interface + ansible.builtin.apt: + name: python3-apt + state: present + +- name: Gather the package facts + ansible.builtin.package_facts: + manager: apt + no_log: true + +- name: Check whether containerd.io version + ansible.builtin.set_fact: + higher_containerd_version: true + when: (containerd_str in ansible_facts.packages) and (ansible_facts.packages[containerd_str][0]['version'] + is ansible.builtin.version(containerd_version, '>')) + +- name: Warning prompt if containerd is downgrading + ansible.builtin.pause: + seconds: 20 + prompt: "Higher version of {{ containerd_str }} {{ ansible_facts.packages['containerd.io'][0]['version'] }} already present, + Proceeding with downgrade to {{ containerd_version }}." + when: higher_containerd_version is defined and higher_containerd_version + - name: Install containerd.io ansible.builtin.apt: deb: "{{ containerd_deb_dest }}" + allow_downgrade: true state: present - name: Start containerd systemd service @@ -45,10 +69,16 @@ until: download_nerdctl is not failed retries: "{{ max_retries }}" +- name: Create nerdctl temp directory + ansible.builtin.file: + path: "{{ temp_download_dir }}/nerdctl" + state: directory + mode: "{{ directory_permissions }}" + - name: Extract nerdctl archive ansible.builtin.unarchive: src: "{{ nerdctl_archive_dest }}" - dest: "{{ temp_download_dir }}" + dest: "{{ temp_download_dir }}/nerdctl/" mode: "{{ file_permission }}" - name: Make nerdctl executable diff --git a/upgrade/roles/preinstall_cluster_cleanup/tasks/main.yml b/local_repo/roles/configure_registry/tasks/set_environment.yml similarity index 59% rename from upgrade/roles/preinstall_cluster_cleanup/tasks/main.yml rename to local_repo/roles/configure_registry/tasks/set_environment.yml index eaa91cfdd..54dcadc0d 100644 --- a/upgrade/roles/preinstall_cluster_cleanup/tasks/main.yml +++ b/local_repo/roles/configure_registry/tasks/set_environment.yml @@ -13,19 +13,17 @@ # limitations under the License. 
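The downgrade guard added above (in both the RHEL and Ubuntu pre-requisites) leans on the ansible.builtin.version test, which performs a release-aware comparison of the installed containerd.io package against the pinned containerd_version. A rough Python sketch of the same decision, comparing numeric components only (illustrative only, not part of the patch):

import re

def needs_downgrade_warning(installed: str, pinned: str) -> bool:
    """Return True when the installed version is newer than the pinned one."""
    # e.g. needs_downgrade_warning("1.6.31-3.1", "1.6.16-3.1") -> True,
    # which corresponds to the 20-second warning pause before the downgrade.
    def key(version: str):
        return [int(part) for part in re.findall(r"\d+", version)]
    return key(installed) > key(pinned)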
--- -# Remove the 1.5 python binary from cluster -- name: Remove omnia 1.5 collector python binary - ansible.builtin.file: - path: "{{ binary_files_path }}" - state: absent - become: true +- name: Gather all IP addresses + ansible.builtin.command: ip -4 addr show + register: ip_output + changed_when: false -- name: Remove nfs share created by 1.5 - ansible.builtin.include_tasks: cleanup_15_nfs.yml +- name: Extract IP addresses + ansible.builtin.set_fact: + oim_ip_addresses: "{{ ip_output.stdout | regex_findall('inet\\s([0-9.]+)') }}" -- name: Remove slurm spool clustername file - ansible.builtin.file: - path: "{{ spool_clustername_file }}" - state: absent - changed_when: false - when: "'manager' in group_names" +- name: Set omnia environment variable file + ansible.builtin.template: + src: "{{ environment_file_path }}" + dest: "{{ omnia_environment_path }}" + mode: "{{ file_permission }}" diff --git a/local_repo/roles/configure_registry/templates/nerdctl-registry.service.j2 b/local_repo/roles/configure_registry/templates/nerdctl-registry.service.j2 index 7837f7d7e..6b9555e25 100644 --- a/local_repo/roles/configure_registry/templates/nerdctl-registry.service.j2 +++ b/local_repo/roles/configure_registry/templates/nerdctl-registry.service.j2 @@ -3,10 +3,14 @@ Description=Nerdctl Registry [Service] Type=oneshot +{% if proxy_status %} +EnvironmentFile={{ omnia_environment_path }} +{% endif %} RemainAfterExit=yes WorkingDirectory={{ nerdctl_registry_dir }} ExecStart=/bin/bash nerdctl-registry start ExecStop=/bin/bash nerdctl-registry stop [Install] -WantedBy=default.target \ No newline at end of file +WantedBy=default.target + diff --git a/local_repo/roles/configure_registry/templates/omnia_environment.j2 b/local_repo/roles/configure_registry/templates/omnia_environment.j2 new file mode 100644 index 000000000..8b7ab69d0 --- /dev/null +++ b/local_repo/roles/configure_registry/templates/omnia_environment.j2 @@ -0,0 +1,7 @@ +http_proxy={{ proxy[0].http_proxy }} +https_proxy={{ proxy[0].https_proxy }} +HTTP_PROXY={{ proxy[0].http_proxy }} +HTTPS_PROXY={{ proxy[0].https_proxy }} +no_proxy="localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }}{% if no_proxy_input_status %},{{ proxy[0].no_proxy }}{% endif %}" +NO_PROXY="localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }}{% if no_proxy_input_status %},{{ proxy[0].no_proxy }}{% endif %}" + diff --git a/local_repo/roles/configure_registry/vars/main.yml b/local_repo/roles/configure_registry/vars/main.yml index 6952e76fb..dfd5d9dbf 100644 --- a/local_repo/roles/configure_registry/vars/main.yml +++ b/local_repo/roles/configure_registry/vars/main.yml @@ -31,7 +31,7 @@ nerdctl_registry_path: - { src: "nerdctl-registry.j2", dest: "{{ nerdctl_registry_dir }}/nerdctl-registry" } - { src: "nerdctl-registry.service.j2", dest: "/etc/systemd/system/nerdctl-registry.service" } omnia_registry_failure_msg: "omnia-registry container could not be started" -nerdctl_registry_enable_fail_msg: "Failed to initiate nerdctl-registry service." +nerdctl_registry_enable_fail_msg: "Failed to initiate nerdctl-registry service. This may be due to a Docker pull issue or insufficient proxy details. If you are behind a proxy, please ensure that the necessary proxy details are provided in the input/site_config.yml file."
# noqa: yaml[line-length] containerd_certs_dir: "/etc/containerd/certs.d" nerdctl_registry_port: 5001 directory_permissions: "0755" @@ -43,6 +43,14 @@ retry_count: "5" delay_time: "10" # Usage: main.yml -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" -control_plane_os_ubuntu: "ubuntu" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" +oim_os_ubuntu: "ubuntu" + +# Usage: set_environment.yml +environment_file_path: "{{ role_path }}/templates/omnia_environment.j2" +omnia_environment_path: "/etc/omnia_environment" + +# Usage: docker_login.yml +docker_login_fail_msg: "Docker login failed. Please ensure the docker login credentials in the input/provision_config_credentials.yml are valid. +If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while." diff --git a/local_repo/roles/configure_registry/vars/redhat.yml b/local_repo/roles/configure_registry/vars/redhat.yml index e0a0bde12..bd5799f03 100644 --- a/local_repo/roles/configure_registry/vars/redhat.yml +++ b/local_repo/roles/configure_registry/vars/redhat.yml @@ -21,11 +21,18 @@ conflicting_packages: - podman - buildah - containers-common -containerd_url: "https://download.docker.com/linux/centos/8/x86_64/stable/Packages/containerd.io-1.6.16-3.1.el8.x86_64.rpm" -containerd_rpm_dest: "/{{ temp_download_dir }}/containerd.io-1.6.16-3.1.el8.x86_64.rpm" -nerdctl_url: "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" -nerdctl_archive_dest: "{{ temp_download_dir }}/nerdctl-1.5.0-linux-amd64.tar.gz" -nerdctl_folder_dest: "{{ temp_download_dir }}/nerdctl" +containerd_str: "containerd.io" +containerd_version: '1.6.16-3.1' +containerd_url: "https://download.docker.com/linux/centos/8/x86_64/stable/Packages/{{ containerd_str }}-{{ containerd_version }}.el8.x86_64.rpm" +containerd_rpm_dest: "{{ temp_download_dir }}/{{ containerd_str }}-{{ containerd_version }}.el8.x86_64.rpm" +nerdctl_url: "{{ 'https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz' if (ansible_distribution_version in ['8.6', '8.7']) else 'https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz' }}" # noqa: yaml[line-length] +nerdctl_archive_dest: "{{ temp_download_dir }}/{{ 'nerdctl-1.5.0-linux-amd64.tar.gz' if (ansible_distribution_version in ['8.6', '8.7']) else 'nerdctl-1.7.4-linux-amd64.tar.gz' }}" # noqa: yaml[line-length] +nerdctl_folder_dest: "{{ temp_download_dir }}/nerdctl/nerdctl" nerdctl_folder_permission: "+x" nerdctl_executable_dest: "/usr/local/bin/" max_retries: 10 +cni_download_dir: "/opt/cni" +cni_bin_download_dir: "/opt/cni/bin" +cni_url: "https://github.com/containernetworking/plugins/releases/download/v1.4.0/cni-plugins-linux-amd64-v1.4.0.tgz" +cni_archive_dest: "{{ cni_bin_download_dir }}/cni-plugins-linux-amd64-v1.4.0.tgz" +cni_mode: "u+rwx,go+rx" diff --git a/local_repo/roles/configure_registry/vars/ubuntu.yml b/local_repo/roles/configure_registry/vars/ubuntu.yml index c31bcfa4d..933c3c776 100644 --- a/local_repo/roles/configure_registry/vars/ubuntu.yml +++ b/local_repo/roles/configure_registry/vars/ubuntu.yml @@ -14,11 +14,14 @@ --- # Usage: pre_requisites_ubuntu.yml -containerd_url: "https://download.docker.com/linux/ubuntu/dists/{{ os_release }}/pool/stable/amd64/containerd.io_1.6.16-1_amd64.deb" -containerd_deb_dest: "/{{ temp_download_dir }}/containerd.io_1.6.16-1_amd64.deb" -nerdctl_url: 
"https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-full-1.5.0-linux-amd64.tar.gz" -nerdctl_archive_dest: "{{ temp_download_dir }}/nerdctl-1.5.0-linux-amd64.tar.gz" -nerdctl_folder_dest: "{{ temp_download_dir }}/bin/nerdctl" +containerd_str: "containerd.io" +containerd_version: '1.6.16-1' +containerd_url: "https://download.docker.com/linux/ubuntu/dists/{{ ansible_distribution_release }}\ +/pool/stable/amd64/containerd.io_{{ containerd_version }}_amd64.deb" +containerd_deb_dest: "{{ temp_download_dir }}/containerd.io_{{ containerd_version }}_amd64.deb" +nerdctl_url: "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-full-1.7.4-linux-amd64.tar.gz" +nerdctl_archive_dest: "{{ temp_download_dir }}/nerdctl-1.7.4-linux-amd64.tar.gz" +nerdctl_folder_dest: "{{ temp_download_dir }}/nerdctl/bin/nerdctl" nerdctl_folder_permission: "+x" nerdctl_executable_dest: "/usr/local/bin/" cni_download_dir: "/opt/cni" diff --git a/local_repo/roles/configure_repos/tasks/main.yml b/local_repo/roles/configure_repos/tasks/main.yml index 7fa785846..3703fb4df 100644 --- a/local_repo/roles/configure_repos/tasks/main.yml +++ b/local_repo/roles/configure_repos/tasks/main.yml @@ -14,4 +14,4 @@ --- - name: Configure repos for os - ansible.builtin.include_tasks: configure_repos_{{ control_plane_os }}.yml + ansible.builtin.include_tasks: configure_repos_{{ oim_os }}.yml diff --git a/local_repo/roles/configure_repos/vars/main.yml b/local_repo/roles/configure_repos/vars/main.yml index 0d0311f72..de54b2eb8 100644 --- a/local_repo/roles/configure_repos/vars/main.yml +++ b/local_repo/roles/configure_repos/vars/main.yml @@ -14,9 +14,9 @@ --- # Usage: main.yml -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" -control_plane_os_ubuntu: "ubuntu" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" +oim_os_ubuntu: "ubuntu" software_config_default: "omnia_default" # Usage: configure_repos_rhel.yml diff --git a/local_repo/roles/manifest/tasks/main.yml b/local_repo/roles/manifest/tasks/main.yml index a64f089d4..1f975981c 100644 --- a/local_repo/roles/manifest/tasks/main.yml +++ b/local_repo/roles/manifest/tasks/main.yml @@ -14,7 +14,7 @@ --- - name: Load software_config.json as software_config ansible.builtin.include_vars: - file: "{{ software_config_file }}" + file: "{{ sw_config_json_path }}" name: software_config - name: Check if kubeflow entry is present in software_config.json diff --git a/local_repo/roles/manifest/vars/main.yml b/local_repo/roles/manifest/vars/main.yml index 03550ebb9..901383aac 100644 --- a/local_repo/roles/manifest/vars/main.yml +++ b/local_repo/roles/manifest/vars/main.yml @@ -13,7 +13,7 @@ # limitations under the License. 
--- local_repo_config_path: "{{ role_path }}/../../../input/local_repo_config.yml" -software_config_file: "{{ role_path }}/../../../input/software_config.json" +sw_config_json_path: "{{ role_path }}/../../../input/software_config.json" git_path: "{{ repo_store_path }}/cluster/git" omnia_tag: omnia-kubeflow file_permission: "644" diff --git a/local_repo/roles/parse_and_download/files/common_utility.py b/local_repo/roles/parse_and_download/files/common_utility.py index c1d93472e..d2d63399f 100644 --- a/local_repo/roles/parse_and_download/files/common_utility.py +++ b/local_repo/roles/parse_and_download/files/common_utility.py @@ -5,6 +5,7 @@ import subprocess import os +import shlex def update_status(package_name, package_type, status, status_file_path): """ @@ -30,11 +31,12 @@ def update_status(package_name, package_type, status, status_file_path): found = False # Check if the status entry already exists in the status file - for i, existing_status in enumerate(existing_statuses): - if existing_status.startswith(f"{package_name},{package_type},"): - existing_statuses[i] = package_status - found = True - break + if existing_statuses: + for i, existing_status in enumerate(existing_statuses): + if existing_status.startswith(f"{package_name},{package_type},"): + existing_statuses[i] = package_status + found = True + break if not found: # If the entry doesn't exist, append it to the file @@ -50,6 +52,7 @@ def run_createrepo_rhel(directory): directory: The directory path where createrepo will be executed. """ + directory = shlex.quote(directory).strip("'\"") command = ["createrepo", directory] try: subprocess.run(command, check=True) @@ -82,7 +85,7 @@ def run_createrepo_on_rhel_directories(repo_store_path, cluster_os_type, cluster cluster_os_version: The version of the cluster operating system. version_variables: A dictionary containing version variables for different packages. """ - + base_directories = [ os.path.join(repo_store_path, 'cluster', cluster_os_type, cluster_os_version, 'rpm') ] @@ -124,7 +127,12 @@ def run_createrepo_on_ubuntu_directories(repo_store_path, cluster_os_type, clust if len(version_variables.get('amdgpu_version', '').strip()) > 0: base_directories.append(os.path.join(repo_store_path, 'cluster', 'apt', 'amdgpu', version_variables.get('amdgpu_version', ''))) - + if len(version_variables.get('intelgaudi_version', '').strip()) > 0: + base_directories.append(os.path.join(repo_store_path, 'cluster', 'apt', 'intelgaudi', + version_variables.get('intelgaudi_version', ''))) + if os.path.exists(os.path.join(repo_store_path, 'cluster', 'apt', 'intel')): + base_directories.append(os.path.join(repo_store_path, 'cluster', 'apt', 'intel', + version_variables.get('intelgaudi_version', ''))) for directory in base_directories: if os.path.exists(directory): run_dpkg_scan(directory) diff --git a/local_repo/roles/parse_and_download/files/download_common.py b/local_repo/roles/parse_and_download/files/download_common.py index 9955fc22b..fb1589ce2 100644 --- a/local_repo/roles/parse_and_download/files/download_common.py +++ b/local_repo/roles/parse_and_download/files/download_common.py @@ -1,10 +1,25 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Module to handle download processes for various package types such as pip_module,git, tarball,manifest,ansible-galaxy collection. """ import subprocess -import os +import os, shlex +import sys import tarfile from jinja2 import Template import shutil @@ -20,7 +35,10 @@ def process_pip_package(package, repo_store_path, status_file_path): status_file_path: Path to the status file. """ + python_version = os.path.basename(sys.executable) package_name = package['package'] + package_name = shlex.quote(package_name).strip("'\"") + version = package.get('version', None) package_type = package['type'] print(f"Processing Pip Package: {package_name}, Version: {version}") @@ -28,13 +46,14 @@ def process_pip_package(package, repo_store_path, status_file_path): # Assuming you have a specific path to store pip packages pip_modules_directory = os.path.join(repo_store_path, 'cluster', 'pip_module') os.makedirs(pip_modules_directory, exist_ok=True) # Ensure the directory exists + pip_modules_directory = shlex.quote(pip_modules_directory).strip("'\"") # Pip specific processing logic goes here # ... # Download the package download_command = [ - 'python3.9', '-m', 'pip', 'download', + python_version, '-m', 'pip', 'download', package_name, '-d', pip_modules_directory ] @@ -61,8 +80,14 @@ def process_git_package(package, repo_store_path, status_file_path): """ package_name = package['package'] + package_name = shlex.quote(package_name).strip("'\"") + url = package.get('url', None) + url = shlex.quote(url).strip("'\"") + version = package.get('version', None) + version = shlex.quote(version).strip("'\"") + package_type = package['type'] print(f"Processing Git Package: {package_name}, URL: {url}, Version: {version}") @@ -71,11 +96,12 @@ def process_git_package(package, repo_store_path, status_file_path): os.makedirs(git_modules_directory, exist_ok=True) # Ensure the directory exists clone_directory = os.path.join(git_modules_directory, package_name) + clone_directory = shlex.quote(clone_directory).strip("'\"") tarball_path = os.path.join(git_modules_directory, f'{package_name}.tar.gz') try: # Using wget to check if the URL exists (returns 0 for success, non-zero for failure) - subprocess.run(['wget', '-q', '--spider', '--tries=1', url], check=True) + subprocess.run(['wget', '-q', "--spider", '--tries=1', url], check=True) # Clone the repository only if it doesn't exist if not os.path.exists(clone_directory): clone_command = ['git', 'clone', '--branch', version, url, clone_directory] @@ -121,14 +147,15 @@ def process_tarball_package(package, repo_store_path, status_file_path, version_ url = url_template.render(**version_variables) if 'path' in package: path = package['path'] - + print(f"Processing Tarball Package: {package_name}, URL: {url}, Path: {path}") + url = shlex.quote(url).strip("'\"") if path is not None and len(path) > 1: if os.path.isfile(path): path_support = True url_support = False - + # Creating the local path to save the tarball tarball_directory = os.path.join(repo_store_path, 'cluster', 'tarball') @@ -137,6 +164,7 @@ def process_tarball_package(package, repo_store_path, 
status_file_path, version_ # Use the package name for the tarball filename tarball_path = os.path.join(tarball_directory, f"{package_name}.tar.gz") + tarball_path = shlex.quote(tarball_path).strip("'\"") if path_support == False and url_support == True: try: @@ -186,6 +214,7 @@ def process_manifest_package(package,repo_store_path, status_file_path): """ package_name = package['package'] url = package.get('url', None) + url = shlex.quote(url).strip("'\"") package_type = package['type'] print(f"Processing Manifest Package: {package_name}, URL: {url}") @@ -269,11 +298,16 @@ def process_ansible_galaxy_collection(package, repo_store_path, status_file_path """ package_name = package['package'] version = package.get('version', None) + + package_name = shlex.quote(package_name).strip("'\"") + version = shlex.quote(version).strip("'\"") + package_type = package['type'] print(f"Processing Ansible Galaxy Collection Package: {package_name}, Version: {version}") # Assuming you have a specific path to store Ansible Galaxy Collections galaxy_collections_directory = os.path.join(repo_store_path, 'cluster', 'ansible_galaxy_collection') + galaxy_collections_directory = shlex.quote(galaxy_collections_directory).strip("'\"") os.makedirs(galaxy_collections_directory, exist_ok=True) # Ensure the directory exists # Check if the tarball already exists @@ -339,7 +373,7 @@ def process_iso_package(package, repo_store_path, status_file_path, cluster_os_t print(f"Processing iso Package to directory: {iso_directory}") os.makedirs(iso_directory, exist_ok=True) - + if path_support == False and url_support == True: try: download_file_name = url.split('/') diff --git a/local_repo/roles/parse_and_download/files/download_deb.py b/local_repo/roles/parse_and_download/files/download_deb.py index 430edc73f..c18723f09 100644 --- a/local_repo/roles/parse_and_download/files/download_deb.py +++ b/local_repo/roles/parse_and_download/files/download_deb.py @@ -1,7 +1,8 @@ import subprocess -import os +import os, shlex from jinja2 import Template from common_utility import update_status +import distro def process_deb_package(package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, repo_config, version_variables, cluster_name): """ @@ -20,12 +21,18 @@ def process_deb_package(package, repo_store_path, status_file_path, cluster_os_t package_template = Template(package.get('package', None)) # Use Jinja2 Template for package # Render the packages, substituting Jinja variables if present package_name = package_template.render(**version_variables) + package_name = shlex.quote(package_name).strip("'\"") + print(f"Processing DEB: {package_name}, Repo Name: {repo_name}, Repo Config: {repo_config}") user_apt_conf_path= "/etc/apt/user_apt.conf" # Specify the repository names that should be skipped - skip_repos = ['focal','jammy'] + os_version = distro.version() + if os_version != cluster_os_version: + skip_repos = ['focal','jammy','deadsnake-ppa'] + else: + skip_repos = ['focal','jammy'] download_flag = False - omnia_always = ['amdgpu', 'cuda', 'ofed'] + omnia_always = ['amdgpu', 'intelgaudi', 'cuda', 'ofed'] # Construct the path based on the provided repository store format if cluster_name == 'beegfs': @@ -37,9 +44,17 @@ def process_deb_package(package, repo_store_path, status_file_path, cluster_os_t elif cluster_name == 'rocm': deb_directory = os.path.join(repo_store_path, 'cluster', 'apt', 'rocm', version_variables.get('rocm_version', '')) + elif cluster_name == 'intelgaudi': + deb_directory = 
os.path.join(repo_store_path, 'cluster', 'apt', 'intelgaudi', + version_variables.get('intelgaudi_version', '')) + elif cluster_name == 'intel': + deb_directory = os.path.join(repo_store_path, 'cluster', 'apt', 'intel', + version_variables.get('intelgaudi_version', '')) else: deb_directory = os.path.join(repo_store_path, 'cluster', cluster_os_type, cluster_os_version, 'deb') + deb_directory = shlex.quote(deb_directory).strip("'\"") + # Default status value status = "Skipped" @@ -71,8 +86,9 @@ def process_deb_package(package, repo_store_path, status_file_path, cluster_os_t dependencies.append(dependency) # Download each dependency - if repo_name != 'ldap': + if repo_name not in ['ldap','intelgaudi']: for dependency in dependencies: + dependency = shlex.quote(dependency).strip("'") download_dependency_command = ['apt-get', 'download', dependency, '-o', f'Dir::Cache={deb_directory}'] try: subprocess.run(download_dependency_command, check=True) @@ -87,7 +103,7 @@ def process_deb_package(package, repo_store_path, status_file_path, cluster_os_t # Update user defined apt cache subprocess.run(['apt-get', 'update', '-c', user_apt_conf_path], check=True) subprocess.run(['apt-cache', 'show', package_name, '-c', user_apt_conf_path], check=True) - split_package_name = package_name.split('=')[0] + split_package_name = package_name.split('=')[0].strip("'\"") # Check if the package is installed result = subprocess.run(['dpkg', '-l', split_package_name], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) if result.stdout.strip(): @@ -122,8 +138,9 @@ def process_deb_package(package, repo_store_path, status_file_path, cluster_os_t dependency = dependency.split(':')[0] dependencies.append(dependency) - if repo_name != 'ldap': + if repo_name not in ['ldap','intelgaudi']: for dependency in dependencies: + dependency = shlex.quote(dependency).strip("'") download_dependency_command = ['apt-get', 'download', dependency, '-o', f'Dir::Cache={deb_directory}'] try: # Checking flag if package is downloaded diff --git a/local_repo/roles/parse_and_download/files/download_image.py b/local_repo/roles/parse_and_download/files/download_image.py index 0cdac52e7..84228e53c 100644 --- a/local_repo/roles/parse_and_download/files/download_image.py +++ b/local_repo/roles/parse_and_download/files/download_image.py @@ -5,9 +5,11 @@ import subprocess from jinja2 import Template from common_utility import update_status -import requests +import requests, shlex -def process_image_tag_package(package_name, repo_config, nerdctl_registry_host, image_tag): +json_headers = "application/vnd.oci.image.index.v1+json" + +def process_image_tag_package(package_name, repo_config, nerdctl_registry_host, image_tag, crt_file_path): """ Process an image package with tag. @@ -18,11 +20,16 @@ def process_image_tag_package(package_name, repo_config, nerdctl_registry_host, image_tag: Tag value of image to be pulled. 
""" print(f"Processing Image Package: {package_name}, Tag: {image_tag}") + + package_name = shlex.quote(package_name).strip("'\"") + image_tag = shlex.quote(image_tag).strip("'\"") + nerdctl_registry_host = shlex.quote(nerdctl_registry_host).strip("'\"") + # Check if image exists in omnia_local_registry try: - headers = {"Accept": "application/vnd.oci.image.index.v1+json"} + headers = {"Accept": json_headers} url = f"https://{nerdctl_registry_host}/v2/{package_name.split('/', 1)[-1]}/manifests/{image_tag}" - response = requests.get(url, headers=headers, verify=False) + response = requests.get(url, headers=headers, verify=crt_file_path) if response.status_code == 200: print(f"Image {package_name}:{image_tag} exists in the registry {nerdctl_registry_host}.") return "Success" @@ -53,7 +60,7 @@ def process_image_tag_package(package_name, repo_config, nerdctl_registry_host, print(f"Exception occured while trying to access registry {nerdctl_registry_host}.") return "Failed" -def process_image_digest_package(package_name, repo_config, nerdctl_registry_host, image_digest, new_tag): +def process_image_digest_package(package_name, repo_config, nerdctl_registry_host, image_digest, new_tag, crt_file_path): """ Process an image package with digest. @@ -66,22 +73,39 @@ def process_image_digest_package(package_name, repo_config, nerdctl_registry_hos """ print(f"Processing Image Package: {package_name}, Digest: {image_digest}") # Check if image exists in omnia_local_registry + + nerdctl_registry_host = shlex.quote(nerdctl_registry_host).strip("'\"") + package_name = shlex.quote(package_name).strip("'\"") + new_tag = shlex.quote(new_tag).strip("'\"") + image_digest = shlex.quote(image_digest).strip("'\"") + + try: - headers = {"Accept": "application/vnd.oci.image.index.v1+json"} + headers = {"Accept": json_headers} url = f"https://{nerdctl_registry_host}/v2/{package_name.split('/', 1)[-1]}/manifests/{new_tag}" - response = requests.get(url, headers=headers, verify=False) + response = requests.get(url, headers=headers, verify=crt_file_path) if response.status_code == 200: print(f"Image {package_name}:{new_tag} exists in the registry {nerdctl_registry_host}.") return "Success" else: pull_command = ["nerdctl", "pull", f"{package_name}@sha256:{image_digest}"] + pull_command_all_platforms = ["nerdctl", "pull", f"{package_name}@sha256:{image_digest}", "--all-platforms"] tag_command = ["nerdctl", "tag", f"{package_name}@sha256:{image_digest}", f"{nerdctl_registry_host}/{package_name.split('/', 1)[-1]}:{new_tag}"] push_command = ["nerdctl", "push", f"{nerdctl_registry_host}/{package_name.split('/', 1)[-1]}:{new_tag}"] try: subprocess.run(pull_command, check=True) subprocess.run(tag_command, check=True) - subprocess.run(push_command, check=True) - return "Success" + push_command_output = subprocess.run(push_command, capture_output=True, text=True) + if push_command_output.returncode == 0: + return "Success" + else: + if "failed to create a tmp single-platform image" in push_command_output.stderr: + subprocess.run(pull_command_all_platforms, check=True) + subprocess.run(tag_command, check=True) + subprocess.run(push_command, check=True) + return "Success" + else: + raise subprocess.CalledProcessError(returncode=1, cmd="failed to push image to private registry") except subprocess.CalledProcessError as e: return "Failed" except Exception as err: @@ -101,15 +125,21 @@ def check_image_in_registry(image_name, image_version, user_registries): Returns: bool: True if the image exists in any of the user's registries, False 
otherwise. """ + image_name = shlex.quote(image_name).strip("'\"") + image_version = shlex.quote(image_version).strip("'\"") + if user_registries is not None and len(user_registries) > 0: for registry in user_registries: try: host = registry.get("host") + cert_file_path = registry.get("cert_path").strip() + cert_file_path = cert_file_path if cert_file_path else False + print(f"Checking for image: {image_name}:{image_version} in registry {host}.") # Check if the image with the specified tag/digest exists - headers = {"Accept": "application/vnd.oci.image.index.v1+json"} + headers = {"Accept": json_headers} url = f"https://{host}/v2/{image_name.split('/', 1)[-1]}/manifests/{image_version}" - response = requests.get(url, headers=headers, verify=False) + response = requests.get(url, headers=headers, verify=cert_file_path) if response.status_code == 200: print(f"Image {image_name}:{image_version} exists in the registry {host}.") return True @@ -124,7 +154,7 @@ def check_image_in_registry(image_name, image_version, user_registries): return False -def process_image_package(package, repo_config, nerdctl_registry_host, status_file_path, version_variables, user_registries, software_names): +def process_image_package(package, repo_config, nerdctl_registry_host, status_file_path, version_variables, user_registries, software_names, openssl_cert_path): """ Process an image package. @@ -135,6 +165,7 @@ def process_image_package(package, repo_config, nerdctl_registry_host, status_fi status_file_path: Path to the status file. version_variables: Variables for rendering version template. """ + package_name = package['package'] package_type = package['type'] @@ -162,11 +193,11 @@ def process_image_package(package, repo_config, nerdctl_registry_host, status_fi # Render the tag, substituting Jinja variables if present image_tag = tag_template.render(**version_variables) if repo_config == "always": - status = process_image_tag_package(package_name, repo_config, nerdctl_registry_host, image_tag) + status = process_image_tag_package(package_name, repo_config, nerdctl_registry_host, image_tag, openssl_cert_path) if repo_config == "partial": image_skip_status = check_image_in_registry(package_name, image_tag, user_registries) if not image_skip_status: - status = process_image_tag_package(package_name, repo_config, nerdctl_registry_host, image_tag) + status = process_image_tag_package(package_name, repo_config, nerdctl_registry_host, image_tag, openssl_cert_path) else: status = "Skipped" if repo_config == "never": @@ -175,16 +206,16 @@ def process_image_package(package, repo_config, nerdctl_registry_host, status_fi if process_image_digest is True: if repo_config == "always": - status = process_image_digest_package(package_name, repo_config, nerdctl_registry_host, image_digest, new_tag) + status = process_image_digest_package(package_name, repo_config, nerdctl_registry_host, image_digest, new_tag, openssl_cert_path) if repo_config == "partial": image_skip_status = check_image_in_registry(package_name, "sha256:" + image_digest, user_registries) if not image_skip_status: - status = process_image_digest_package(package_name, repo_config, nerdctl_registry_host, image_digest, new_tag) + status = process_image_digest_package(package_name, repo_config, nerdctl_registry_host, image_digest, new_tag, openssl_cert_path) else: status = "Skipped" if repo_config == "never": status = "Skipped" complete_package_name = package_name + "@sha256:" + image_digest - + # Update the status - update_status(complete_package_name, package_type, 
status, status_file_path) + update_status(complete_package_name, package_type, status, status_file_path) \ No newline at end of file diff --git a/local_repo/roles/parse_and_download/files/download_rpm.py b/local_repo/roles/parse_and_download/files/download_rpm.py index 205385471..452cd864e 100644 --- a/local_repo/roles/parse_and_download/files/download_rpm.py +++ b/local_repo/roles/parse_and_download/files/download_rpm.py @@ -3,7 +3,7 @@ """ import subprocess import re -import os +import os, shlex from jinja2 import Template from common_utility import update_status @@ -25,6 +25,7 @@ def process_rpm_package(package, repo_store_path, status_file_path, cluster_os_t package_template = Template(package.get('package', None)) # Use Jinja2 Template for package # Render the packages, substituting Jinja variables if present package_name = package_template.render(**version_variables) + package_name = shlex.quote(package_name).strip("'\"") print(f"Processing RPM: {package_name},Repo Name: {repo_name},Repo Config: {repo_config}") # Specify the repository names that should be skipped @@ -45,6 +46,8 @@ def process_rpm_package(package, repo_store_path, status_file_path, cluster_os_t else: rpm_directory = os.path.join(repo_store_path, 'cluster', cluster_os_type, cluster_os_version, 'rpm') + # shlex quote rpm_directory + rpm_directory = shlex.quote(rpm_directory).strip("'\"") # Default status value status = "Skipped" @@ -109,6 +112,7 @@ def process_rpm_package(package, repo_store_path, status_file_path, cluster_os_t # Step 3: Check if each dependency is available in user repos for dependency in dependencies: + dependency = shlex.quote(dependency).strip("'\"") try: subprocess.run(['dnf', 'list', 'available', dependency, '--disablerepo=omnia_repo*'], check=True) # If the command succeeds, the dependency is available in user repos diff --git a/local_repo/roles/parse_and_download/files/parse_and_download.py b/local_repo/roles/parse_and_download/files/parse_and_download.py index 65f2abbb8..d46257ed1 100644 --- a/local_repo/roles/parse_and_download/files/parse_and_download.py +++ b/local_repo/roles/parse_and_download/files/parse_and_download.py @@ -1,6 +1,7 @@ """ Module to parse software_config.json data and download packages. """ +import sys import json import os import yaml @@ -10,6 +11,7 @@ import download_rpm import download_deb + def load_user_json(file_path): """ Load software_config JSON data from file. 
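The existence probe that download_image.py performs above boils down to a GET against the registry's v2 manifests endpoint, treating HTTP 200 as "already mirrored". A minimal standalone sketch (host and image below are placeholders, not values from the patch):

import requests

def image_in_registry(host: str, image: str, reference: str, cert_path=False) -> bool:
    """reference is a tag or 'sha256:<digest>'; cert_path is a CA bundle path or False."""
    headers = {"Accept": "application/vnd.oci.image.index.v1+json"}
    url = f"https://{host}/v2/{image.split('/', 1)[-1]}/manifests/{reference}"
    return requests.get(url, headers=headers, verify=cert_path).status_code == 200

# e.g. image_in_registry("oim.example.com:5001", "docker.io/calico/node", "v3.27.0")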
@@ -47,7 +49,8 @@ def load_software_config_json(software_names, cluster_os_type, cluster_os_versio software_configs = {} user_json_dir = os.path.dirname(user_json_path) for software_name in software_names: - json_file_path = f'{user_json_dir}/config/{cluster_os_type}/{cluster_os_version}/{software_name}.json' + json_file_path = f'{user_json_dir}/config/{cluster_os_type}/'\ + f'{cluster_os_version}/{software_name}.json' if os.path.exists(json_file_path): with open(json_file_path, 'r', encoding='utf-8') as json_file: software_config = json.load(json_file) @@ -56,7 +59,9 @@ def load_software_config_json(software_names, cluster_os_type, cluster_os_versio # Example function to process a package -def process_package(package, repo_store_path,status_file_path,cluster_os_type, cluster_os_version, repo_config, version_variables, nerdctl_registry_host, user_registries, cluster_name, software_names): +def process_package(package, repo_store_path,status_file_path,cluster_os_type, + cluster_os_version, repo_config, version_variables, + nerdctl_registry_host, user_registries, cluster_name, software_names, openssl_cert_path): """ Function to process a package Args: @@ -67,17 +72,23 @@ def process_package(package, repo_store_path,status_file_path,cluster_os_type, c cluster_os_version: Cluster OS version. repo_config: Repository configuration. version_variables: Variables for rendering version template. - nerdctl_registry_host: Nerdctl registry host. + nerdctl_registry_host: Nerdctl registry host. """ package_name = package['package'] package_type = package['type'] print(f"Processing Package: {package_name}, Type: {package_type}") # Type-specific processing based on the extracted type - process_by_package_type(package_name, package_type, package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, repo_config, version_variables, nerdctl_registry_host, user_registries, cluster_name, software_names) + process_by_package_type(package_name, package_type, package, + repo_store_path, status_file_path, cluster_os_type, cluster_os_version, + repo_config, version_variables, nerdctl_registry_host, + user_registries, cluster_name, software_names, openssl_cert_path) # Type-specific processing function -def process_by_package_type(package_name, package_type, package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, repo_config, version_variables, nerdctl_registry_host, user_registries, cluster_name, software_names): +def process_by_package_type(package_name, package_type, package, + repo_store_path, status_file_path, cluster_os_type, cluster_os_version, + repo_config, version_variables, nerdctl_registry_host, + user_registries, cluster_name, software_names, openssl_cert_path): """ Function to process based on package type Args: @@ -94,44 +105,127 @@ def process_by_package_type(package_name, package_type, package, repo_store_path user_registries: External registries hosted by user. 
""" if package_type == 'rpm' and cluster_os_type in ['rhel','rocky']: - download_rpm.process_rpm_package(package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, repo_config, version_variables, cluster_name) + download_rpm.process_rpm_package(package, repo_store_path, + status_file_path, cluster_os_type, cluster_os_version, + repo_config, version_variables, cluster_name) elif package_type == 'deb' and cluster_os_type == 'ubuntu': - download_deb.process_deb_package(package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, repo_config, version_variables, cluster_name) + download_deb.process_deb_package(package, repo_store_path, + status_file_path, cluster_os_type, cluster_os_version, + repo_config, version_variables, cluster_name) elif package_type == 'pip_module': download_common.process_pip_package(package, repo_store_path, status_file_path) elif package_type == 'git': download_common.process_git_package(package, repo_store_path, status_file_path) elif package_type == 'tarball': - download_common.process_tarball_package(package, repo_store_path, status_file_path, version_variables) + download_common.process_tarball_package(package, repo_store_path, + status_file_path, version_variables) elif package_type == 'manifest': download_common.process_manifest_package(package, repo_store_path, status_file_path) elif package_type == 'shell': download_common.process_shell_package(package, repo_store_path, status_file_path) elif package_type == 'image': - download_image.process_image_package(package, repo_config, nerdctl_registry_host, status_file_path, version_variables, user_registries, software_names) + download_image.process_image_package(package, repo_config, + nerdctl_registry_host, status_file_path, + version_variables, user_registries, software_names, openssl_cert_path) elif package_type == 'ansible_galaxy_collection': - download_common.process_ansible_galaxy_collection(package, repo_store_path, status_file_path) + download_common.process_ansible_galaxy_collection(package, + repo_store_path, status_file_path) elif package_type == 'iso': - download_common.process_iso_package(package, repo_store_path, status_file_path, cluster_os_type, cluster_os_version, repo_config, version_variables) + download_common.process_iso_package(package, repo_store_path, + status_file_path, cluster_os_type, cluster_os_version, + repo_config, version_variables) else: print(f"Unknown package type: {package_type} for package {package_name}") + +def get_local_repo_config_path(): + """ + Retrieves and validates the path to the local repository configuration YAML file. + + This function fetches the path from the environment variable `LOCAL_REPO_CONFIG_YAML_PATH`, + validates that it points to a valid file, and returns the path. + + Raises: + ValueError: If the environment variable is not set or the path does not point to a valid file. + Exception: For any other unexpected errors encountered during path validation. + + Returns: + Path: A Path object representing the validated file path. 
+ """ + try: + config_path = os.getenv('LOCAL_REPO_CONFIG_YAML_PATH') + + if not config_path: + raise ValueError("Environment variable LOCAL_REPO_CONFIG_YAML_PATH is not set") + + + # Ensure it's an existing file (or directory based on your use case) + if not os.path.isfile(config_path): # Use is_file() or is_dir() based on the expected input + raise ValueError(f"Invalid path: {config_path} does not point to a valid file") + + return config_path + except Exception as e: + print(f"Error: {e}") + + +def get_user_json_path(): + """ + Retrieves and validates the path to the user json file. + + This function fetches the path from the environment variable `USER_JSON_PATH`, + validates that it points to a valid file, and returns the path. + + Raises: + ValueError: If the environment variable is not set or the path does not point to a valid file. + Exception: For any other unexpected errors encountered during path validation. + + Returns: + Path: A Path object representing the validated file path. + """ + try: + config_path = os.getenv('USER_JSON_PATH') + + if not config_path: + raise ValueError("Environment variable USER_JSON_PATH is not set") + + + # Ensure it's an existing file (or directory based on your use case) + if not os.path.isfile(config_path): + raise ValueError(f"Invalid path: {config_path} does not point to a valid file") + + return config_path + except Exception as e: + print(f"Error: {e}") + def main(): """ Main function to parse software_config.json data and initiate package downloads. """ - # File paths - # Access environment variables - user_json_path = os.environ.get('USER_JSON_PATH') - local_repo_config_yaml_path = os.environ.get('LOCAL_REPO_CONFIG_YAML_PATH') + try: + # File paths + # Access environment variables + user_json_path = get_user_json_path() + + local_repo_config_yaml_path = get_local_repo_config_path() + + except ValueError as ve: + print(f"Input validation error: {ve}") + sys.exit(1) + except KeyError as ke: + print(f"Missing environment variable: {ke}") + sys.exit(1) + except Exception as ex: + print(f"Unexpected error occurred: {ex}") + sys.exit(1) + status_file_path = os.environ.get('STATUS_FILE_PATH') nerdctl_registry_host = os.environ.get('NERDCTL_REGISTRY_HOST') software_name = os.environ.get('SOFTWARE_NAME') + openssl_cert_path = os.environ.get('OPENSSL_CERT_PATH') # Load data from software_config.json user_data = load_user_json(user_json_path) - # Get the list of keys software_config_keys = user_data.keys() @@ -154,7 +248,6 @@ def main(): # Find the common keys between software_names and key_value_pairs subgroup_keys = set(software_names).intersection(software_config_keys) - # Get the list of names for each subgroup key subgroup_names = [] for key in subgroup_keys: @@ -181,20 +274,24 @@ def main(): user_registries = repo_config_data['user_registry'] # Load software configuration JSON files - software_configs = load_software_config_json(software_names, cluster_os_type, cluster_os_version, user_json_path) - + software_configs = load_software_config_json(software_names, + cluster_os_type, cluster_os_version, user_json_path) for cluster_type, cluster_data in software_configs.items(): for cluster_name, cluster_info in cluster_data.items(): if 'cluster' in cluster_info and cluster_name in subgroup_names: print(f"Processing software stack: {cluster_type}") for package in cluster_info['cluster']: - process_package(package, repo_store_path, status_file_path, cluster_os_type,cluster_os_version, repo_config, version_variables, nerdctl_registry_host, user_registries, 
cluster_name, software_names) + process_package(package, repo_store_path, status_file_path, + cluster_os_type,cluster_os_version, repo_config, version_variables, + nerdctl_registry_host, user_registries, cluster_name, software_names, openssl_cert_path) print() if cluster_os_type == 'rhel' or cluster_os_type == 'rocky': - common_utility.run_createrepo_on_rhel_directories(repo_store_path, cluster_os_type, cluster_os_version, version_variables) + common_utility.run_createrepo_on_rhel_directories(repo_store_path, + cluster_os_type, cluster_os_version, version_variables) if cluster_os_type == 'ubuntu': - common_utility.run_createrepo_on_ubuntu_directories(repo_store_path, cluster_os_type, cluster_os_version, version_variables) + common_utility.run_createrepo_on_ubuntu_directories(repo_store_path, + cluster_os_type, cluster_os_version, version_variables) if __name__ == "__main__": main() diff --git a/local_repo/roles/parse_and_download/tasks/apt_conf_config.yml b/local_repo/roles/parse_and_download/tasks/apt_conf_config.yml index b728749b0..092333d58 100644 --- a/local_repo/roles/parse_and_download/tasks/apt_conf_config.yml +++ b/local_repo/roles/parse_and_download/tasks/apt_conf_config.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Read hostname of control plane +- name: Read hostname of Omnia Infrastructure Manager ansible.builtin.command: hostname changed_when: false register: hostname_result diff --git a/local_repo/roles/parse_and_download/tasks/main.yml b/local_repo/roles/parse_and_download/tasks/main.yml index 5fad980e9..7f44492be 100644 --- a/local_repo/roles/parse_and_download/tasks/main.yml +++ b/local_repo/roles/parse_and_download/tasks/main.yml @@ -18,11 +18,11 @@ - name: Configure Cluster Repositories for RHEL/Rocky ansible.builtin.include_tasks: cluster_repo_config.yml - when: control_plane_os in control_plane_os_redhat or control_plane_os in control_plane_os_rocky + when: oim_os in oim_os_redhat or oim_os in oim_os_rocky - name: Configure Cluster Repositories for Ubuntu ansible.builtin.include_tasks: apt_conf_config.yml - when: control_plane_os in control_plane_os_ubuntu + when: oim_os in oim_os_ubuntu - name: Execute python script to Download Packages ansible.builtin.include_tasks: run_python_script_{{ ansible_distribution | lower }}.yml @@ -32,11 +32,11 @@ - name: Configure repositories to /etc/yum.repos.d ansible.builtin.include_tasks: yum_repo_config.yml - when: control_plane_os in control_plane_os_redhat or control_plane_os in control_plane_os_rocky + when: oim_os in oim_os_redhat or oim_os in oim_os_rocky - name: Configure repositories to /etc/apt/sources.list.d ansible.builtin.include_tasks: sources_list_config.yml - when: control_plane_os in control_plane_os_ubuntu + when: oim_os in oim_os_ubuntu - name: Create metadata file ansible.builtin.include_tasks: create_metadata.yml diff --git a/local_repo/roles/parse_and_download/tasks/run_python_script_redhat.yml b/local_repo/roles/parse_and_download/tasks/run_python_script_redhat.yml index 004661ded..a5cafaba5 100644 --- a/local_repo/roles/parse_and_download/tasks/run_python_script_redhat.yml +++ b/local_repo/roles/parse_and_download/tasks/run_python_script_redhat.yml @@ -19,11 +19,15 @@ status can be checked at /opt/omnia/offline/download_package_status.csv) # noqa: name[template] ansible.builtin.command: "{{ python_version }} {{ python_script_path }}" environment: - USER_JSON_PATH: "{{ user_json_path }}" + USER_JSON_PATH: "{{ sw_config_json_path }}" LOCAL_REPO_CONFIG_YAML_PATH: "{{ local_repo_config_path 
}}" STATUS_FILE_PATH: "{{ status_file_path }}" NERDCTL_REGISTRY_HOST: "{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}" SOFTWARE_NAME: "{{ software_name }}" + HTTP_PROXY: "{{ proxy[0].http_proxy | default('', true) }}" + HTTPS_PROXY: "{{ proxy[0].http_proxy | default('', true) }}" + NO_PROXY: "localhost,127.0.0.1,{{ oim_hostname }}" + OPENSSL_CERT_PATH: "{{ openssl_cert_path }}" changed_when: true register: python_script_result diff --git a/local_repo/roles/parse_and_download/tasks/run_python_script_ubuntu.yml b/local_repo/roles/parse_and_download/tasks/run_python_script_ubuntu.yml index 40acb6925..97e2b502e 100644 --- a/local_repo/roles/parse_and_download/tasks/run_python_script_ubuntu.yml +++ b/local_repo/roles/parse_and_download/tasks/run_python_script_ubuntu.yml @@ -19,11 +19,15 @@ status can be checked at /opt/omnia/offline/download_package_status.csv) # noqa: name[template] ansible.builtin.command: "{{ python_version }} {{ python_script_path }}" environment: - USER_JSON_PATH: "{{ user_json_path }}" + USER_JSON_PATH: "{{ sw_config_json_path }}" LOCAL_REPO_CONFIG_YAML_PATH: "{{ local_repo_config_path }}" STATUS_FILE_PATH: "{{ status_file_path }}" NERDCTL_REGISTRY_HOST: "{{ hostname_result.stdout }}:{{ nerdctl_registry_port }}" SOFTWARE_NAME: "{{ software_name }}" + HTTP_PROXY: "{{ proxy[0].http_proxy | default('', true) }}" + HTTPS_PROXY: "{{ proxy[0].https_proxy | default('', true) }}" + NO_PROXY: "localhost,127.0.0.1,{{ oim_hostname }}" + OPENSSL_CERT_PATH: "{{ openssl_cert_path }}" changed_when: true register: python_script_result diff --git a/local_repo/roles/parse_and_download/vars/main.yml b/local_repo/roles/parse_and_download/vars/main.yml index 95c748eb2..a32e53853 100644 --- a/local_repo/roles/parse_and_download/vars/main.yml +++ b/local_repo/roles/parse_and_download/vars/main.yml @@ -21,8 +21,8 @@ csv_file_mode: "0644" # Usage: run_python_script.yml python_script_path: "{{ role_path }}/files/parse_and_download.py" -python_version: "python3.9" -user_json_path: "{{ role_path }}/../../../input/software_config.json" +python_version: "{{ ansible_python_interpreter }}" +sw_config_json_path: "{{ role_path }}/../../../input/software_config.json" local_repo_config_path: "{{ role_path }}/../../../input/local_repo_config.yml" status_file_path: "/opt/omnia/offline/download_package_status.csv" yum_repos_path: "/etc/yum.repos.d" @@ -40,8 +40,8 @@ max_retries: 10 parse_message: "{{ software_name }}.json parsed. 
Status can be checked at /opt/omnia/offline/download_package_status.csv" # Usage: main.yml -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" # Usage: create_metadata.yml meta_dest: "/opt/omnia/offline/.data" diff --git a/local_repo/roles/validation/tasks/main.yml b/local_repo/roles/validation/tasks/main.yml index 9f16cc295..1e4d6b59c 100644 --- a/local_repo/roles/validation/tasks/main.yml +++ b/local_repo/roles/validation/tasks/main.yml @@ -18,8 +18,24 @@ seconds: "{{ warning_wait_time_warning }}" prompt: "{{ warning_msg_local_repo }}" -- name: Validate control plane OS - ansible.builtin.include_tasks: validate_cp_os.yml +- name: Check /var mount use percentage + ansible.builtin.shell: | + set -o pipefail + df -h /var | awk 'FNR == 2 {print $5}' | tr -d '%' + register: var_mount_use_percentage + changed_when: false + +- name: Warning - local_repo may fail due to insufficient disk space + ansible.builtin.pause: + seconds: "{{ warning_wait_time_warning }}" + prompt: "{{ var_mount_overuse_msg }}" + when: var_mount_use_percentage.stdout | float >= var_mount_percentage_limit + +- name: Validate Omnia Infrastructure Manager OS + ansible.builtin.include_tasks: validate_oim_os.yml + +- name: Validate domain_name + ansible.builtin.include_tasks: validate_domain_name.yml - name: Install prerequisites ansible.builtin.include_tasks: prerequisites_{{ ansible_distribution | lower }}.yml @@ -30,6 +46,9 @@ - name: Validate local_repo_config.yml ansible.builtin.include_tasks: validate_local_repo_config.yml +- name: Validate site_config.yml + ansible.builtin.include_tasks: validate_site_config.yml + - name: Validate ubuntu_os_url ansible.builtin.include_tasks: validate_ubuntu_os_url.yml when: cluster_os_type == 'ubuntu' diff --git a/local_repo/roles/validation/tasks/prerequisites_redhat.yml b/local_repo/roles/validation/tasks/prerequisites_redhat.yml index 4f314e33e..431b02737 100644 --- a/local_repo/roles/validation/tasks/prerequisites_redhat.yml +++ b/local_repo/roles/validation/tasks/prerequisites_redhat.yml @@ -20,20 +20,6 @@ line: 'reposdir={{ yum_repos_path }}' state: present -- name: Install ansible galaxy collection {{ community_general_collection }} - ansible.builtin.command: ansible-galaxy collection install {{ community_general_collection }} - changed_when: true - register: community_general_collection - until: community_general_collection is not failed - retries: "{{ max_retries }}" - -- name: Install ansible galaxy collection {{ community_crypto_collection }} - ansible.builtin.command: ansible-galaxy collection install {{ community_crypto_collection }} - changed_when: true - register: community_crypto_collection - until: community_crypto_collection is not failed - retries: "{{ max_retries }}" - - name: Install jq packages ansible.builtin.yum: name: "{{ jq_package }}" diff --git a/local_repo/roles/validation/tasks/prerequisites_ubuntu.yml b/local_repo/roles/validation/tasks/prerequisites_ubuntu.yml index e771a1b6b..7ef605999 100644 --- a/local_repo/roles/validation/tasks/prerequisites_ubuntu.yml +++ b/local_repo/roles/validation/tasks/prerequisites_ubuntu.yml @@ -13,25 +13,25 @@ # limitations under the License. 
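The new /var check above pipes df through awk and tr to get a bare use percentage; the same probe could be done with the Python standard library, sketched here for clarity (illustrative only):

import shutil

def var_mount_use_percentage(path: str = "/var") -> float:
    usage = shutil.disk_usage(path)
    return 100.0 * usage.used / usage.total

# The playbook compares this value against var_mount_percentage_limit and,
# when exceeded, pauses with the insufficient-disk-space warning.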
--- -- name: Install community.general galaxy collection - ansible.builtin.command: ansible-galaxy collection install "{{ community_general_collection }}" - changed_when: true - register: community_general_collection - until: community_general_collection is not failed - retries: "{{ max_retries }}" +- name: Delete apt.conf file + ansible.builtin.file: + path: "{{ apt_conf_dest }}" + state: absent -- name: Install ansible galaxy collection {{ community_crypto_collection }} - ansible.builtin.command: ansible-galaxy collection install {{ community_crypto_collection }} - changed_when: true - register: community_crypto_collection - until: community_crypto_collection is not failed - retries: "{{ max_retries }}" +- name: Update repo - Ubuntu + ansible.builtin.command: apt update + changed_when: false - name: Install jq package ansible.builtin.apt: name: "{{ jq_package }}" state: present +- name: Install dpkg package + ansible.builtin.apt: + name: "{{ dpkg_package }}" + state: present + - name: Install pip modules ansible.builtin.command: "{{ python_version }} -m pip install {{ item }}" loop: "{{ python_package }}" diff --git a/discovery/roles/db_operations/tasks/cp_details_db.yml b/local_repo/roles/validation/tasks/validate_domain_name.yml similarity index 54% rename from discovery/roles/db_operations/tasks/cp_details_db.yml rename to local_repo/roles/validation/tasks/validate_domain_name.yml index 1f278d84c..b6d4491e6 100644 --- a/discovery/roles/db_operations/tasks/cp_details_db.yml +++ b/local_repo/roles/validation/tasks/validate_domain_name.yml @@ -13,13 +13,23 @@ # limitations under the License. --- -- name: Fetch control_plane hostname - ansible.builtin.command: hostname +- name: Fetch the domain name + ansible.builtin.command: hostname -d + register: domain_name_check changed_when: false - register: cp_hostname + failed_when: false + +- name: Verify the domain name is not blank in hostname + ansible.builtin.fail: + msg: "{{ server_domain_name_blank_msg }}" + when: domain_name_check.stdout | length < 1 -- name: Create control_plane entry in cluster_info table - ansible.builtin.command: | - {{ python_version }} {{ cp_db_utility_path }} {{ admin_nic_ip }} {{ network_interface_type }} {{ pxe_mac_address }} - {{ cp_hostname.stdout }} {{ bmc_nic_ip }} +- name: Read Omnia Infrastructure Manager hostname + ansible.builtin.command: hostname changed_when: false + register: hostname_result + +- name: Set fact for the Omnia Infrastructure Manager hostname and domain name + ansible.builtin.set_fact: + oim_hostname: "{{ hostname_result.stdout }}" + oim_domain_name: "{{ domain_name_check.stdout }}" diff --git a/local_repo/roles/validation/tasks/validate_cp_os.yml b/local_repo/roles/validation/tasks/validate_oim_os.yml similarity index 68% rename from local_repo/roles/validation/tasks/validate_cp_os.yml rename to local_repo/roles/validation/tasks/validate_oim_os.yml index 7a1f39d6d..2dad3cc10 100644 --- a/local_repo/roles/validation/tasks/validate_cp_os.yml +++ b/local_repo/roles/validation/tasks/validate_oim_os.yml @@ -13,21 +13,21 @@ # limitations under the License. 
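validate_domain_name.yml above requires `hostname -d` to return a non-empty value before local_repo proceeds. A rough Python equivalent of that probe (socket.getfqdn approximates `hostname -f`, and the text after the first dot is taken as the domain; an illustrative assumption, not code from the patch):

import socket

def oim_domain_name() -> str:
    fqdn = socket.getfqdn()
    return fqdn.split(".", 1)[1] if "." in fqdn else ""

# An empty string here corresponds to the server_domain_name_blank_msg failure
# raised by the 'Verify the domain name is not blank in hostname' task.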
--- -- name: Set control_plane_os +- name: Set oim_os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" -- name: Validate control_plane_os +- name: Validate oim_os ansible.builtin.fail: - msg: "{{ control_plane_os_fail_msg }}" + msg: "{{ oim_os_fail_msg }}" when: - - control_plane_os not in control_plane_os_redhat - - control_plane_os not in control_plane_os_rocky - - control_plane_os not in control_plane_os_ubuntu + - oim_os not in oim_os_redhat + - oim_os not in oim_os_rocky + - oim_os not in oim_os_ubuntu - name: Validate user ansible.builtin.fail: msg: "{{ user_fail_msg }}" when: - - control_plane_os in control_plane_os_ubuntu + - oim_os in oim_os_ubuntu - ansible_env.USER != root_user_name diff --git a/local_repo/roles/validation/tasks/validate_provision_config_credentials.yml b/local_repo/roles/validation/tasks/validate_provision_config_credentials.yml index 3360b7d4d..ebde47525 100644 --- a/local_repo/roles/validation/tasks/validate_provision_config_credentials.yml +++ b/local_repo/roles/validation/tasks/validate_provision_config_credentials.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check provision_config_credentials.yml file is encrypted ansible.builtin.command: cat {{ provision_config_credentials_filename }} changed_when: false @@ -26,7 +21,7 @@ - name: Decrypt provision_config_credentials.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ provision_config_credentials_filename }} + ansible-vault decrypt {{ provision_config_credentials_filename }} --vault-password-file {{ provision_credentials_vault_path }} changed_when: false when: ansible_vault_search_key in provision_config_content.stdout @@ -79,7 +74,7 @@ - name: Encrypt provision_config_credentials.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ provision_config_credentials_filename }} + ansible-vault encrypt {{ provision_config_credentials_filename }} --vault-password-file {{ provision_credentials_vault_path }} changed_when: false diff --git a/local_repo/roles/validation/tasks/validate_site_config.yml b/local_repo/roles/validation/tasks/validate_site_config.yml new file mode 100644 index 000000000..236f82263 --- /dev/null +++ b/local_repo/roles/validation/tasks/validate_site_config.yml @@ -0,0 +1,104 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
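The credential validation above now calls ansible-vault straight from PATH instead of parsing `whereis` output. A minimal subprocess sketch of the same decrypt/encrypt round trip (file paths below are placeholders):

import subprocess

def vault(action: str, target: str, vault_pass_file: str) -> None:
    """action is 'decrypt' or 'encrypt', mirroring the two tasks above."""
    subprocess.run(
        ["ansible-vault", action, target, "--vault-password-file", vault_pass_file],
        check=True,
    )

# vault("decrypt", "input/provision_config_credentials.yml", ".vault_key")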
+---
+
+- name: Initialize variables
+  ansible.builtin.set_fact:
+    http_proxy_input_status: false
+    https_proxy_input_status: false
+    no_proxy_input_status: false
+    proxy_status: false
+
+- name: Include site_config.yml
+  ansible.builtin.include_vars: "{{ site_config_file }}"
+
+- name: Validate http_proxy variable provided
+  ansible.builtin.set_fact:
+    http_proxy_input_status: true
+  when:
+    - proxy[0].http_proxy is defined
+    - proxy[0].http_proxy | default("", true) | length > 1
+
+- name: Validate https_proxy variable provided
+  ansible.builtin.set_fact:
+    https_proxy_input_status: true
+  when:
+    - proxy[0].https_proxy is defined
+    - proxy[0].https_proxy | default("", true) | length > 1
+
+- name: Validate no_proxy variable provided
+  ansible.builtin.set_fact:
+    no_proxy_input_status: true
+  when:
+    - proxy[0].no_proxy is defined
+    - proxy[0].no_proxy | default("", true) | length > 1
+
+- name: Validate both http_proxy and https_proxy input provided
+  ansible.builtin.fail:
+    msg: "{{ invalid_proxy_failure_msg }}"
+  when:
+    - (not https_proxy_input_status and http_proxy_input_status) or
+      (not http_proxy_input_status and https_proxy_input_status)
+
+- name: Validate proxy
+  when:
+    - http_proxy_input_status
+    - https_proxy_input_status
+  block:
+    - name: Validate http_proxy, https_proxy and no_proxy configured as environment variables
+      ansible.builtin.assert:
+        that:
+          - lookup('env', 'http_proxy') | length > 1
+          - lookup('env', 'https_proxy') | length > 1
+          - lookup('env', 'no_proxy') | length > 1
+          - lookup('env', 'http_proxy') == proxy[0].http_proxy
+          - lookup('env', 'https_proxy') == proxy[0].https_proxy
+          - oim_hostname in lookup('env', 'no_proxy')
+        fail_msg: "{{ proxy_env_fail_msg }}"
+
+    - name: Try updating repos in Ubuntu
+      when: oim_os in oim_os_ubuntu
+      block:
+        - name: Update repos in Ubuntu
+          ansible.builtin.apt:
+            update_cache: true
+          register: update_repos
+          until: update_repos is not failed
+          retries: "{{ repo_retries }}"
+          delay: "{{ repo_delay }}"
+      rescue:
+        - name: Failed to update repos
+          ansible.builtin.fail:
+            msg: "{{ update_repos_fail_msg }}"
+
+    - name: Try updating repos in RHEL/Rocky
+      when:
+        - oim_os in oim_os_redhat or
+          oim_os in oim_os_rocky
+      block:
+        - name: Update repos in RHEL/Rocky
+          ansible.builtin.dnf:
+            update_cache: true
+          register: update_repos
+          until: update_repos is not failed
+          retries: "{{ repo_retries }}"
+          delay: "{{ repo_delay }}"
+      rescue:
+        - name: Failed to update repos
+          ansible.builtin.fail:
+            msg: "{{ update_repos_fail_msg }}"
+
+    - name: Set proxy_status to true
+      ansible.builtin.set_fact:
+        proxy_status: true
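For reference, the proxy structure this new validation reads from input/site_config.yml is shaped like the sketch below (values are illustrative, not defaults). The same http_proxy/https_proxy values must also be exported as environment variables on the Omnia Infrastructure Manager, and no_proxy must contain the OIM hostname, or the assert above fails:

    proxy:
      - http_proxy: "http://proxy.example.com:3128"
        https_proxy: "http://proxy.example.com:3128"
        no_proxy: "localhost,127.0.0.1,oim.omnia.test"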
diff --git a/local_repo/roles/validation/tasks/validate_software_config_json.yml b/local_repo/roles/validation/tasks/validate_software_config_json.yml
index 0b1da3dcc..b28266264 100644
--- a/local_repo/roles/validation/tasks/validate_software_config_json.yml
+++ b/local_repo/roles/validation/tasks/validate_software_config_json.yml
@@ -19,10 +19,11 @@
     amdgpu_version: "omnia_default"
     rocm_version: "omnia_default"
     bcm_roce_libraries_version: "omnia_default"
+    intelgaudi_version: "omnia_default"

 - name: Check that the software_config.json exists
   ansible.builtin.stat:
-    path: "{{ software_config_json_file }}"
+    path: "{{ sw_config_json_path }}"
   register: stat_result

 - name: Fail if software_config.json file doesn't exist
@@ -32,7 +33,7 @@

 - name: Check JSON syntax of software_config.json
   ansible.builtin.command:
-    cmd: "jq . {{ software_config_json_file }}"
+    cmd: "jq . {{ sw_config_json_path }}"
   register: json_check
   ignore_errors: true
   changed_when: true
@@ -44,7 +45,7 @@

 - name: Load software_config.json
   ansible.builtin.include_vars:
-    file: "{{ software_config_json_file }}"
+    file: "{{ sw_config_json_path }}"
     name: software_config

 - name: Assert cluster_os_type, cluster_os_version, repo_config and softwares
@@ -75,7 +76,7 @@

 - name: Generate software names having subgroup
   ansible.builtin.set_fact:
-    subgroup_software_names: "{{ subgroup_software_names | default([]) + [item] }}"
+    subgroup_softwares: "{{ subgroup_softwares | default({}) | combine({item: software_config[item]}) }}"
   loop: "{{ software_names }}"
   when: item in software_config | dict2items | map(attribute='key') | list # noqa: var-naming[no-jinja]
@@ -134,13 +135,13 @@
     - repo_config in valid_repo_config
   fail_msg: "{{ repo_config_fail_msg }}"

-- name: Validate control_plane_os with cluster_os_type
+- name: Validate oim_os with cluster_os_type
   ansible.builtin.fail:
     msg: "{{ cross_os_support_fail_msg }}"
   when:
-    - (control_plane_os == 'ubuntu' and cluster_os_type != 'ubuntu') or
-      (control_plane_os == 'rocky' and cluster_os_type != 'rocky') or
-      (control_plane_os == 'redhat' and cluster_os_type != 'rhel')
+    - (oim_os == 'ubuntu' and cluster_os_type != 'ubuntu') or
+      (oim_os == 'rocky' and cluster_os_type != 'rocky') or
+      (oim_os == 'redhat' and cluster_os_type != 'rhel')

 - name: Generate software JSON file patterns
   ansible.builtin.set_fact:
@@ -191,6 +192,38 @@
   vars:
     failed_softwares: "{{ version_result.results | selectattr('msg', 'equalto', 'Assertion failed') | map(attribute='item.name') | list }}"

+- name: Check if k8s support is true
+  ansible.builtin.set_fact:
+    k8s_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'k8s') | list | length > 0 }}"
+
+- name: Check virtual ENV
+  ansible.builtin.set_fact:
+    venv_path: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}"
+
+- name: Validate k8s_support variable
+  when: k8s_support
+  block:
+    - name: Extract k8s version
+      ansible.builtin.set_fact:
+        k8s_version: "{{ software_config.softwares | selectattr('name', 'equalto', 'k8s') | map(attribute='version') | first }}"
+      # noqa: var-naming[no-jinja]
+
+    - name: Fail if kubernetes version is other than 1.26.12 or omnia161_venv is not activated for RHEL/Rocky 8.6 or 8.7
+      ansible.builtin.fail:
+        msg: "{{ kube_version_on_unsupported_os }}"
+      when:
+        - cluster_os_type in ['rhel', 'rocky']
+        - cluster_os_version in ['8.6', '8.7']
+        - (omnia161_k8s_version not in k8s_version) or ('omnia161_venv' not in venv_path)
+
+    - name: Assert supported kubernetes version
+      ansible.builtin.assert:
+        that:
+          - k8s_version | default("", false) | length > 1
+          - "('omnia161_venv' in venv_path and omnia161_k8s_version in k8s_version) or ('omnia17_venv' in venv_path and omnia17_k8s_version in k8s_version)" # noqa: yaml[line-length]
+        success_msg: "{{ success_msg_k8s_version }}"
+        fail_msg: "{{ fail_msg_k8s_version }}"
+
 - name: Update software versions from software_config.json (softwares)
   ansible.builtin.set_fact:
     "{{ item.name }}_version": "{{ item.version }}"
@@ -211,10 +244,10 @@

 - name: Update software versions from software_config.json (subgroup)
   ansible.builtin.include_tasks: validate_software_config_subgroup_json.yml
-  loop: "{{ subgroup_software_names }}"
+  with_dict: "{{ subgroup_softwares }}"
   loop_control:
-    loop_var: item
+    loop_var: subgroup
   vars:
-    subgroup_software_name: "{{ item }}"
+    subgroup_software_name: "{{ subgroup.key }}"
   when: subgroup_software_name is defined # noqa: var-naming[no-jinja]
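The subgroup handling above keys off input/software_config.json entries shaped like the illustrative JSON fragment below (all versions are placeholders except the k8s versions pinned in vars/main.yml). A top-level software name such as amdgpu may also appear as a key whose list members are then checked against subgroup_software_list:

    "softwares": [
        {"name": "k8s", "version": "1.29.5"},
        {"name": "amdgpu", "version": "x.y.z"}
    ],
    "amdgpu": [
        {"name": "rocm", "version": "x.y.z"}
    ]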
diff --git a/local_repo/roles/validation/tasks/validate_software_config_subgroup_json.yml b/local_repo/roles/validation/tasks/validate_software_config_subgroup_json.yml
index 695a510bb..6a51877b4 100644
--- a/local_repo/roles/validation/tasks/validate_software_config_subgroup_json.yml
+++ b/local_repo/roles/validation/tasks/validate_software_config_subgroup_json.yml
@@ -13,6 +13,12 @@
 # limitations under the License.

 ---
+- name: Validate software_config subgroups
+  ansible.builtin.assert:
+    that:
+      - item.name in subgroup_software_list[subgroup_software_name]
+    fail_msg: "Failed. Invalid software name: '{{ item.name }}' in subgroup '{{ subgroup_software_name }}'. {{ subgroup_software_name_fail_msg }}"
+  with_items: "{{ subgroup.value }}"

 - name: Update software versions from software_config.json '{{ subgroup_software_name }}'
   ansible.builtin.set_fact:
diff --git a/local_repo/roles/validation/tasks/validate_user_registry.yml b/local_repo/roles/validation/tasks/validate_user_registry.yml
index b249d6f38..225bf55e7 100644
--- a/local_repo/roles/validation/tasks/validate_user_registry.yml
+++ b/local_repo/roles/validation/tasks/validate_user_registry.yml
@@ -30,3 +30,52 @@
   loop: "{{ user_registry }}"
   loop_control:
     loop_var: item
+
+- name: Initialize unreachable user registry
+  ansible.builtin.set_fact:
+    unreachable_registries: ""
+
+- name: Check if user registry is reachable from Omnia Infrastructure Manager
+  ansible.builtin.uri:
+    url: "https://{{ item.host }}"
+    validate_certs: false
+    timeout: "{{ time_out }}"
+  register: https_registry_reachable
+  failed_when: false
+  loop: "{{ user_registry }}"
+  loop_control:
+    loop_var: item
+
+- name: Set facts for reachable and unreachable registries
+  ansible.builtin.set_fact:
+    reachable_registries: "{{ https_registry_reachable.results | selectattr('status', 'equalto', 200) | map(attribute='item.host') | list }}"
+    unreachable_registries: "{{ https_registry_reachable.results | selectattr('status', 'ne', 200) | map(attribute='item.host') | list }}"
+
+- name: Warning - Display unreachable registries
+  ansible.builtin.pause:
+    prompt: "{{ user_registry_msg }}"
+    seconds: "{{ warning_wait_time_warning }}"
+  when: unreachable_registries | default("", true) | length > 0
+
+- name: Check if cert_path is a valid file path
+  ansible.builtin.stat:
+    path: "{{ item.cert_path }}"
+  register: cert_path_status
+  loop: "{{ user_registry }}"
+  loop_control:
+    loop_var: item
+  when:
+    - item.cert_path is defined
+    - item.cert_path | length > 0
+
+- name: Fail if cert_path does not exist
+  ansible.builtin.fail:
+    msg: "{{ cert_path_failure_msg }}"
+  loop: "{{ cert_path_status.results }}"
+  loop_control:
+    loop_var: item
+  when:
+    - item.stat is defined
+    - not item.stat.exists
+    - item.item.cert_path | length > 0
+    - cert_path_status.results is defined
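The new reachability check above probes each registry at https://<host> and only warns (pausing for warning_wait_time_warning seconds) when one is unreachable; Omnia then downloads the affected images itself. A matching user_registry entry in input/local_repo_config.yml is shaped like this sketch (values illustrative; host and cert_path are the required keys):

    user_registry:
      - host: registry.example.com
        cert_path: "/home/user/registry_cert.crt"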
diff --git a/local_repo/roles/validation/vars/main.yml b/local_repo/roles/validation/vars/main.yml
index 545eb99e7..8caff2113 100644
--- a/local_repo/roles/validation/vars/main.yml
+++ b/local_repo/roles/validation/vars/main.yml
@@ -15,10 +15,15 @@
 # Usage: main.yml
 local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml"
+var_mount_percentage_limit: 80
+var_mount_overuse_msg: |
+  [WARNING] local_repo.yml may fail as /var mount usage has exceeded the limit of {{ var_mount_percentage_limit }}%.
+  Current usage: {{ var_mount_use_percentage.stdout }}%.
+  This could result in failures when downloading large packages or images.
+
+  For Omnia disk space requirements, see: https://omnia-doc.readthedocs.io/en/latest

 # Usage: prerequisites_redhat.yml, prerequisites_ubuntu.yml
-community_general_collection: community.general:4.8.7
-community_crypto_collection: community.crypto:2.14.0
 max_retries: 10
 yum_repos_path: "/etc/yum.repos.d"
 yum_conf_path: "/etc/yum.conf"
@@ -26,23 +31,30 @@ jq_package: jq
 repo_pkgs:
   - createrepo
   - yum-utils
-python_version: python3.9
+  - wget
+  - unzip
+  - tar
+python_version: "{{ ansible_python_interpreter }}"
 python_package:
   - requests==2.31.0
   - pyopenssl==21.0.0
   - urllib3==1.26.5
-# Usage: validate_cp_os.yml
-control_plane_os_redhat: "redhat"
-control_plane_os_rocky: "rocky"
-control_plane_os_ubuntu: "ubuntu"
-control_plane_os_fail_msg: "Failed. Control plane OS should be RHEL, Rocky or Ubuntu."
+  - distro==1.9.0
+dpkg_package: "dpkg-dev"
+apt_conf_dest: /etc/apt/apt.conf
+
+# Usage: validate_oim_os.yml
+oim_os_redhat: "redhat"
+oim_os_rocky: "rocky"
+oim_os_ubuntu: "ubuntu"
+oim_os_fail_msg: "Failed. Omnia Infrastructure Manager OS should be RHEL, Rocky or Ubuntu."
 root_user_name: "root"
 user_fail_msg: "Failed. Omnia playbooks should run as root user."
 warning_wait_time_warning: 15
 warning_msg_local_repo: "[WARNING] Omnia will remove any package/software conflicting with the requirements."

 # Usage: validate_software_config_json.yml
-software_config_json_file: "{{ role_path }}/../../../input/software_config.json"
+sw_config_json_path: "{{ role_path }}/../../../input/software_config.json"
 fail_msg_software_config_json_file: "software_config.json file doesn't exist."
 software_config_parameters_fail_msg: "Failed. Please ensure cluster_os_type, cluster_os_verion, repo_config, softwares are defined in software_config.json"
 software_config_softwares_fail_msg: "Failed. softwares list cannot be empty in software_config.json. Atleast one software should be defined."
@@ -58,6 +70,7 @@ specific_softwares:
   - 'bcm_roce'
   - 'ucx'
   - 'rocm'
+  - 'intelgaudi'
   - 'intel_benchmarks'
   - 'openmpi'
   - 'bcm_roce_libraries'
@@ -68,7 +81,7 @@ cluster_os_version_fail_msg_rhel: "Failed. The supported values of cluster_os_ve
 cluster_os_version_fail_msg_rocky: "Failed. The supported values of cluster_os_version is 8.6,8.7 and 8.8 when cluster_os_type is rocky"
 cluster_os_version_fail_msg_ubuntu: "Failed. The supported values of cluster_os_version is 20.04 and 22.04 when cluster_os_type is ubuntu"
 repo_config_fail_msg: "Failed. The supported values of repo_config is always, partial and never"
-cross_os_support_fail_msg: "Cross-OS is not supported: control_plane_os '{{ control_plane_os }}' does not match cluster_os_type '{{ cluster_os_type }}'"
+cross_os_support_fail_msg: "Cross-OS is not supported: oim_os '{{ oim_os }}' does not match cluster_os_type '{{ cluster_os_type }}'"
 valid_cluster_os_type:
   - 'rhel'
   - 'rocky'
@@ -89,6 +102,12 @@ valid_repo_config:
   - 'partial'
   - 'never'

+omnia17_k8s_version: '1.29.5'
+omnia161_k8s_version: '1.26.12'
+success_msg_k8s_version: "Kubernetes Version Validated"
+fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json for this virtual environment. Supported versions for omnia17_venv: 1.29.5, and for omnia161_venv: 1.26.12." # noqa: yaml[line-length]
+kube_version_on_unsupported_os: "Failed. On RHEL/Rocky 8.6 or 8.7 OS, supported kubernetes version is 1.26.12 and supported virtual environment is omnia161_venv only" # noqa: yaml[line-length]
+
 # Usage: validate_local_repo_config.yml
 fail_msg_local_repo_config_file: "local_repo_config.yml file doesn't exist."
 local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again. Common syntax Errors:
@@ -111,6 +130,9 @@ deb_dir_path: "{{ repo_store_path }}/cluster/{{ cluster_os_type }}/{{ cluster_os
 user_registry_fail_msg: "Failed. Please ensure user_registry is non empty list and check if there is any indentation error in input/local_repo_config.yml"
 user_registry_fail_host_cert_path_msg: "Failed. Each item in user_registry should have 'host' and 'cert_path' keys defined"
+time_out: 30
+user_registry_msg: "The following user registries are not reachable: [{{ unreachable_registries | join(', ') }}]. If the user registry is not accessible from the Omnia Infrastructure Manager, Omnia will download all the images for the software listed in software_config.json." # noqa: yaml[line-length]
+cert_path_failure_msg: "Certificate file path {{ item.item.cert_path }} does not exist on the Omnia Infrastructure Manager for host {{ item.item.host }}. Please verify that the correct cert_path is given in input/local_repo_config.yml" # noqa: yaml[line-length]

 # Usage: validate_user_repo_url.yml
 user_repo_url_fail_msg: "Failed. Please ensure user_repo_url is proper and should not have jinja variables.
@@ -134,6 +156,26 @@ manifest_properties_fail_msg: "Missing required properties for an item of type '
 shell_properties_fail_msg: "Missing required properties for an item of type 'shell' in the JSON file '{{ file_basename }}.json'."
 iso_properties_fail_msg: "Missing required properties for an item of type 'iso' in the JSON file '{{ file_basename }}.json'."
 validation_failure_msg: "Validation failed for JSON file: {{ json_file.path }}. Please ensure all properties are defined in all items based on type"
+subgroup_software_list:
+  bcm_roce:
+    - 'bcm_roce_libraries'
+  amdgpu:
+    - 'rocm'
+  intelgaudi:
+    - 'intel'
+  vllm:
+    - 'vllm_amd'
+    - 'vllm_nvidia'
+  pytorch:
+    - 'pytorch_cpu'
+    - 'pytorch_amd'
+    - 'pytorch_nvidia'
+    - 'pytorch_gaudi'
+  tensorflow:
+    - 'tensorflow_cpu'
+    - 'tensorflow_amd'
+    - 'tensorflow_nvidia'
+subgroup_software_name_fail_msg: "Please ensure valid software names are defined in subgroups in the software_config.json file."

 # Usage: validate_ubuntu_os_url.yml
 url_format_fail_msg: "Failed. Invalid format for ubuntu_os_url.
@@ -192,3 +234,16 @@ subgroup_warning_msg: "bcm_roce_libraries attribute is not mentioned in the bcm_
 so bcm_roce_libraries will not be installed on the nodes post provisioning."
 bcm_roce_libraries_warning_msg: "bcm_roce_libraries object not found in bcm_roce.json,
 so bcm_roce_libraries will not be installed during post provisioning."
+
+# Usage: validate_domain_name.yml
+server_domain_name_blank_msg: "Failed. The domain name is not set in the hostname. The hostname should be in hostname.domain_name format"
+
+# Usage: validate_site_config.yml
+site_config_file: "{{ role_path }}/../../../input/site_config.yml"
+invalid_proxy_failure_msg: "Failed. Both http_proxy and https_proxy should be set for the proxy variable provided in site_config.yml"
+proxy_env_fail_msg: "Failed.
The values for http_proxy {{ proxy[0].http_proxy }} and https_proxy {{ proxy[0].https_proxy }} in the +proxy variable of the site_config.yml should be set as environment variables http_proxy and https_proxy in the Omnia Infrastructure Manager. +The no_proxy environment variable should include the Omnia Infrastructure Manager hostname and the admin network IP address." +update_repos_fail_msg: "Failed to update repos. Verify proxy configuration in Omnia Infrastructure Manager for acccessing internet." +repo_retries: 5 +repo_delay: 10 diff --git a/network/ansible.cfg b/network/ansible.cfg index 5acb930c9..7b0fb33d6 100644 --- a/network/ansible.cfg +++ b/network/ansible.cfg @@ -3,6 +3,7 @@ log_path = /var/log/omnia/network.log host_key_checking = false forks = 5 timeout = 180 +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -10,4 +11,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/network/ethernet_sseries_input.yml b/network/ethernet_sseries_input.yml index 94cb4877c..ad0c1fd1e 100644 --- a/network/ethernet_sseries_input.yml +++ b/network/ethernet_sseries_input.yml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # ---Default configurations written for Dell PowerSwitch S5232F-ON--- # ---Change the configurations as per the switch model to avoid failures--- @@ -44,109 +44,109 @@ snmp_community_name: "public" # By default, all ports are brought up in admin UP state os10_interface: ethernet 1/1/1: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/2: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/3: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/4: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/5: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/6: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/7: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/8: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/9: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/10: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/11: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/12: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/13: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/14: - 
admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/15: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/16: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/17: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/18: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/19: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/20: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/21: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/22: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/23: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/24: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/25: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/26: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/27: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/28: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/29: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/30: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/31: - admin: up - fanout: "{{ breakout_value }}" + admin: up + fanout: "{{ breakout_value }}" ethernet 1/1/32: - desc: "Port 32" - admin: up + desc: "Port 32" + admin: up ethernet 1/1/33: - desc: "Port 33" - admin: up + desc: "Port 33" + admin: up ethernet 1/1/34: - desc: "Port 34" - admin: up + desc: "Port 34" + admin: up vlan 1: - admin: up + admin: up # save_changes_to_startup is a boolean flag. By default, this option is set to false. # When set to true, it will save the switch's running configuration to the startup configuration file diff --git a/network/ethernet_switch_config.yml b/network/ethernet_switch_config.yml index 2674b0ada..75607fb8f 100644 --- a/network/ethernet_switch_config.yml +++ b/network/ethernet_switch_config.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + - name: Include pre-requisites ethernet configuration hosts: localhost tasks: @@ -25,8 +29,6 @@ hosts: all gather_facts: false connection: network_cli - collections: - - dellemc.os10 vars: ansible_network_os: dellemc.os10.os10 ansible_command_timeout: 180 diff --git a/network/ethernet_tor_input.yml b/network/ethernet_tor_input.yml index af71bf895..99fd491c7 100644 --- a/network/ethernet_tor_input.yml +++ b/network/ethernet_tor_input.yml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # ---Default configurations written for Dell PowerSwitch S3048-ON--- # ---Change the configurations as per the switch model to avoid failures--- @@ -41,163 +41,163 @@ snmp_community_name: "public" # Interface configuration for switch os10_interface: ethernet 1/1/1: - desc: "Port 1" - admin: up + desc: "Port 1" + admin: up ethernet 1/1/2: - desc: "Port 2" - admin: up + desc: "Port 2" + admin: up ethernet 1/1/3: - desc: "Port 3" - admin: up + desc: "Port 3" + admin: up ethernet 1/1/4: - desc: "Port 4" - admin: up + desc: "Port 4" + admin: up ethernet 1/1/5: - desc: "Port 5" - admin: up + desc: "Port 5" + admin: up ethernet 1/1/6: - desc: "Port 6" - admin: up + desc: "Port 6" + admin: up ethernet 1/1/7: - desc: "Port 7" - admin: up + desc: "Port 7" + admin: up ethernet 1/1/8: - desc: "Port 8" - admin: up + desc: "Port 8" + admin: up ethernet 1/1/9: - desc: "Port 9" - admin: up + desc: "Port 9" + admin: up ethernet 1/1/10: - desc: "Port 10" - admin: up + desc: "Port 10" + admin: up ethernet 1/1/11: - desc: "Port 11" - admin: up + desc: "Port 11" + admin: up ethernet 1/1/12: - desc: "Port 12" - admin: up + desc: "Port 12" + admin: up ethernet 1/1/13: - desc: "Port 13" - admin: up + desc: "Port 13" + admin: up ethernet 1/1/14: - desc: "Port 14" - admin: up + desc: "Port 14" + admin: up ethernet 1/1/15: - desc: "Port 15" - admin: up + desc: "Port 15" + admin: up ethernet 1/1/16: - desc: "Port 16" - admin: up + desc: "Port 16" + admin: up ethernet 1/1/17: - desc: "Port 17" - admin: up + desc: "Port 17" + admin: up ethernet 1/1/18: - desc: "Port 18" - admin: up + desc: "Port 18" + admin: up ethernet 1/1/19: - desc: "Port 19" - admin: up + desc: "Port 19" + admin: up ethernet 1/1/20: - desc: "Port 20" - admin: up + desc: "Port 20" + admin: up ethernet 1/1/21: - desc: "Port 21" - admin: up + desc: "Port 21" + admin: up ethernet 1/1/22: - desc: "Port 22" - admin: up + desc: "Port 22" + admin: up ethernet 1/1/23: - desc: "Port 23" - admin: up + desc: "Port 23" + admin: up ethernet 1/1/24: - desc: "Port 24" - admin: up + desc: "Port 24" + admin: up ethernet 1/1/25: - desc: "Port 25" - admin: up + desc: "Port 25" + admin: up ethernet 1/1/26: - desc: "Port 26" - admin: up + desc: "Port 26" + admin: up ethernet 1/1/27: - desc: "Port 27" - admin: up + desc: "Port 27" + admin: up 
ethernet 1/1/28: - desc: "Port 28" - admin: up + desc: "Port 28" + admin: up ethernet 1/1/29: - desc: "Port 29" - admin: up + desc: "Port 29" + admin: up ethernet 1/1/30: - desc: "Port 30" - admin: up + desc: "Port 30" + admin: up ethernet 1/1/31: - desc: "Port 31" - admin: up + desc: "Port 31" + admin: up ethernet 1/1/32: - desc: "Port 32" - admin: up + desc: "Port 32" + admin: up ethernet 1/1/33: - desc: "Port 33" - admin: up + desc: "Port 33" + admin: up ethernet 1/1/34: - desc: "Port 34" - admin: up + desc: "Port 34" + admin: up ethernet 1/1/35: - desc: "Port 35" - admin: up + desc: "Port 35" + admin: up ethernet 1/1/36: - desc: "Port 36" - admin: up + desc: "Port 36" + admin: up ethernet 1/1/37: - desc: "Port 37" - admin: up + desc: "Port 37" + admin: up ethernet 1/1/38: - desc: "Port 38" - admin: up + desc: "Port 38" + admin: up ethernet 1/1/39: - desc: "Port 39" - admin: up + desc: "Port 39" + admin: up ethernet 1/1/40: - desc: "Port 40" - admin: up + desc: "Port 40" + admin: up ethernet 1/1/41: - desc: "Port 41" - admin: up + desc: "Port 41" + admin: up ethernet 1/1/42: - desc: "Port 42" - admin: up + desc: "Port 42" + admin: up ethernet 1/1/43: - desc: "Port 43" - admin: up + desc: "Port 43" + admin: up ethernet 1/1/44: - desc: "Port 4" - admin: up + desc: "Port 4" + admin: up ethernet 1/1/45: - desc: "Port 45" - admin: up + desc: "Port 45" + admin: up ethernet 1/1/46: - desc: "Port 46" - admin: up + desc: "Port 46" + admin: up ethernet 1/1/47: - desc: "Port 47" - admin: up + desc: "Port 47" + admin: up ethernet 1/1/48: - desc: "Port 48" - admin: up + desc: "Port 48" + admin: up ethernet 1/1/49: - desc: "Port 49" - admin: up + desc: "Port 49" + admin: up ethernet 1/1/50: - desc: "Port 50" - admin: up + desc: "Port 50" + admin: up ethernet 1/1/51: - desc: "Port 51" - admin: up + desc: "Port 51" + admin: up ethernet 1/1/52: - desc: "Port 52" - admin: up + desc: "Port 52" + admin: up vlan 1: - admin: up + admin: up # save_changes_to_startup is a boolean flag. By default, this option is set to false. # When set to true, it will save the switch's running configuration to the startup configuration file diff --git a/network/ethernet_zseries_input.yml b/network/ethernet_zseries_input.yml index 6661252b4..5711f1609 100644 --- a/network/ethernet_zseries_input.yml +++ b/network/ethernet_zseries_input.yml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
-#*********************************************************************** +# *********************************************************************** # ---Default configurations written for Dell PowerSwitch Z9264F-ON--- # ---Change the configurations as per the switch model to avoid failures--- @@ -42,71 +42,71 @@ snmp_community_name: "public" # By default, all ports are brought up in admin UP state ethernet_ports: port 1/1/1: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/3: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/5: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/7: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/9: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/11: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/13: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/15: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/17: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/19: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/21: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/23: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/25: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/27: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/29: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/31: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/33: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/35: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/37: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/39: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/41: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/43: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/45: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/47: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/49: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/51: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/53: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/55: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/57: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/59: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/61: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" port 1/1/63: - fanout: "{{ breakout_value }}" + fanout: "{{ breakout_value }}" vlan 1: - admin: up + admin: up # save_changes_to_startup is a boolean flag. By default, this option is set to false. # When set to true, it will save the switch's running configuration to the startup configuration file diff --git a/network/infiniband_edr_input.yml b/network/infiniband_edr_input.yml index f70c22983..9b22b26fa 100644 --- a/network/infiniband_edr_input.yml +++ b/network/infiniband_edr_input.yml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # Default configurations written for : # ----Switch-IB(TM) 2 based EDR InfiniBand 1U Switch, 36 QSFP28 ports---- @@ -36,7 +36,7 @@ # This variable is used for enabling split port functionality on IB switch # Accepted values are true or false -# Default value is false +# Default value is false enable_split_port: false # This variable is used for taking list of ports to be splitted @@ -222,4 +222,4 @@ mellanox_switch_interface_config: # When set to true, it will save the switch's running configuration to the startup configuration file # after the role applies its configuration. This will allow the configuration to persist after a # restart or power failure. -save_changes_to_startup: false \ No newline at end of file +save_changes_to_startup: false diff --git a/network/infiniband_hdr_input.yml b/network/infiniband_hdr_input.yml index 27dc5c12c..41b49ce5c 100644 --- a/network/infiniband_hdr_input.yml +++ b/network/infiniband_hdr_input.yml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # Default configurations written for : # ----Mellanox Quantum(TM) HDR InfiniBand Switch, 40 QSFP56 ports---- @@ -36,7 +36,7 @@ # This variable is used for enabling split port functionality on IB switch # Accepted values are true or false -# Default value is false +# Default value is false enable_split_port: false # This variable is used for taking list of ports to be splitted @@ -238,4 +238,4 @@ mellanox_switch_interface_config: # When set to true, it will save the switch's running configuration to the startup configuration file # after the role applies its configuration. This will allow the configuration to persist after a # restart or power failure. -save_changes_to_startup: false \ No newline at end of file +save_changes_to_startup: false diff --git a/network/infiniband_ndr_input.yml b/network/infiniband_ndr_input.yml index 26f0def8a..0296b323e 100644 --- a/network/infiniband_ndr_input.yml +++ b/network/infiniband_ndr_input.yml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. 
-#*********************************************************************** +# *********************************************************************** # Default configurations written for : # ----Switch-IB(TM) 2 based EDR InfiniBand 1U Switch, 36 QSFP28 ports---- @@ -28,7 +28,7 @@ # This variable is used for enabling split port functionality on IB switch # Accepted values are true or false -# Default value is false +# Default value is false enable_split_port: false # This variable is used for taking list of ports to be splitted @@ -324,4 +324,4 @@ mellanox_switch_interface_config: # When set to true, it will save the switch's running configuration to the startup configuration file # after the role applies its configuration. This will allow the configuration to persist after a # restart or power failure. -save_changes_to_startup: false \ No newline at end of file +save_changes_to_startup: false diff --git a/network/infiniband_switch_config.yml b/network/infiniband_switch_config.yml index 0208d18b1..9f4dfd334 100644 --- a/network/infiniband_switch_config.yml +++ b/network/infiniband_switch_config.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,8 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - --- + +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + - name: Infiniband Configuration hosts: all gather_facts: false diff --git a/network/network.yml b/network/network.yml index 4b2443575..cfb331b0a 100644 --- a/network/network.yml +++ b/network/network.yml @@ -17,7 +17,7 @@ hosts: all gather_facts: true -- name: Validate xcat status on control plane +- name: Validate xcat status on Omnia Infrastructure Manager hosts: localhost gather_facts: false roles: @@ -47,11 +47,11 @@ name: mlnx_ofed tasks_from: pre-requisites.yml -- name: Setup NFS Server on RedHat control plane if Infiniband Support is present on nodes +- name: Setup NFS Server on RedHat Omnia Infrastructure Manager if Infiniband Support is present on nodes hosts: localhost connection: local tasks: - - name: Setup NFS Server on control plane if Rhel OS is installed on nodes + - name: Setup NFS Server on Omnia Infrastructure Manager if Rhel OS is installed on nodes ansible.builtin.include_role: name: mlnx_ofed tasks_from: initiate_nfs_server.yml diff --git a/network/roles/ethernet/tasks/apply_config.yml b/network/roles/ethernet/tasks/apply_config.yml index 7287aa7c5..5f6e71781 100644 --- a/network/roles/ethernet/tasks/apply_config.yml +++ b/network/roles/ethernet/tasks/apply_config.yml @@ -22,15 +22,15 @@ commands: "{{ os10_config }}" when: os10_config is defined vars: - - ansible_ssh_user: "{{ ethernet_switch_username }}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" - name: Apply switch interface configuration ansible.builtin.include_role: name: dellemc.os10.os10_interface vars: - - hostname: "{{ inventory_hostname }}" - os10_cfg_generate: true - ansible_ssh_user: "{{ ethernet_switch_username 
}}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + hostname: "{{ inventory_hostname }}" + os10_cfg_generate: true + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" when: os10_interface is defined diff --git a/network/roles/ethernet/tasks/configure_port_group.yml b/network/roles/ethernet/tasks/configure_port_group.yml index 09319ab6d..41894ad5f 100644 --- a/network/roles/ethernet/tasks/configure_port_group.yml +++ b/network/roles/ethernet/tasks/configure_port_group.yml @@ -34,8 +34,8 @@ - '{{ item.key }} mode Eth {{ item.value.fanout }}' parents: ['port-group {{ port_prefix + port_group }}'] vars: - - ansible_ssh_user: "{{ ethernet_switch_username }}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" register: eth_port_status rescue: - name: Status msg when port is already splitted with different breakout value diff --git a/network/roles/ethernet/tasks/configure_vlan.yml b/network/roles/ethernet/tasks/configure_vlan.yml index 410a5efec..10f454d85 100644 --- a/network/roles/ethernet/tasks/configure_vlan.yml +++ b/network/roles/ethernet/tasks/configure_vlan.yml @@ -19,8 +19,8 @@ - 'no shutdown' parents: ['interface vlan1'] vars: - - ansible_ssh_user: "{{ ethernet_switch_username }}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" when: item.value.admin == 'up' - name: Bring vlan interface down @@ -29,6 +29,6 @@ - 'shutdown' parents: ['interface vlan1'] vars: - - ansible_ssh_user: "{{ ethernet_switch_username }}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" when: item.value.admin == 'down' diff --git a/network/roles/ethernet/tasks/main.yml b/network/roles/ethernet/tasks/main.yml index 9fad912a2..d2c673d18 100644 --- a/network/roles/ethernet/tasks/main.yml +++ b/network/roles/ethernet/tasks/main.yml @@ -28,6 +28,6 @@ ansible.builtin.include_tasks: snmp_config.yml - name: Save current switch configuration to startup-configuration - dellos10_command: + dellemc.os10.os10_command: commands: "copy running-configuration startup-configuration" when: save_changes_to_startup diff --git a/network/roles/ethernet/tasks/pre_requisites.yml b/network/roles/ethernet/tasks/pre_requisites.yml index 17c220188..c6e4394a5 100644 --- a/network/roles/ethernet/tasks/pre_requisites.yml +++ b/network/roles/ethernet/tasks/pre_requisites.yml @@ -20,10 +20,12 @@ executable: pip3 - name: Install paramiko - ansible.builtin.command: pip3 install paramiko -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com + ansible.builtin.pip: + name: paramiko + state: present + executable: pip3 + extra_args: "-i https://pypi.org/simple/" delegate_to: localhost - run_once: true - changed_when: false - name: Check if ethernet_tor_input.yml exists ansible.builtin.stat: @@ -60,8 +62,8 @@ commands: show system | grep "Current Type" register: model_type vars: - - ansible_ssh_user: "{{ ethernet_switch_username }}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" - name: Save switch model name ansible.builtin.set_fact: @@ -112,6 +114,7 @@ when: snmp_trap_destination | length > 1 - name: Validate 
snmp parameters + when: snmp_enabled block: - name: Assert snmp trap destination address ansible.builtin.assert: @@ -126,4 +129,3 @@ - snmp_community_name | length > 1 success_msg: "{{ snmp_community_success_msg }}" fail_msg: "{{ snmp_community_fail_msg }}" - when: snmp_enabled diff --git a/network/roles/ethernet/tasks/setup_ports.yml b/network/roles/ethernet/tasks/setup_ports.yml index 3eb3a6c05..4a2fabe38 100644 --- a/network/roles/ethernet/tasks/setup_ports.yml +++ b/network/roles/ethernet/tasks/setup_ports.yml @@ -22,8 +22,8 @@ commands: "{{ os10_config }}" when: os10_config is defined vars: - - ansible_ssh_user: "{{ ethernet_switch_username }}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" - name: Set empty dictionary if ethernet_ports not defined ansible.builtin.set_fact: diff --git a/network/roles/ethernet/tasks/snmp_config.yml b/network/roles/ethernet/tasks/snmp_config.yml index d14f07bbd..aa56ab8b0 100644 --- a/network/roles/ethernet/tasks/snmp_config.yml +++ b/network/roles/ethernet/tasks/snmp_config.yml @@ -16,29 +16,29 @@ - name: Configure SNMP block: - - name: Set parameters for SNMP configuration - ansible.builtin.set_fact: - os10_snmp: - snmp_community: - - name: "{{ snmp_community_name }}" - access_mode: ro - state: present - snmp_traps: - - name: all - state: present - snmp_host: - - ip: "{{ snmp_trap_destination }}" - communitystring: "{{ snmp_community_name }}" - version: "1" - state: present - when: snmp_trap_destination | length > 1 + - name: Set parameters for SNMP configuration + ansible.builtin.set_fact: + os10_snmp: + snmp_community: + - name: "{{ snmp_community_name }}" + access_mode: ro + state: present + snmp_traps: + - name: all + state: present + snmp_host: + - ip: "{{ snmp_trap_destination }}" + communitystring: "{{ snmp_community_name }}" + version: "1" + state: present + when: snmp_trap_destination | length > 1 - name: Apply SNMP configuration ansible.builtin.include_role: name: dellemc.os10.os10_snmp vars: - - hostname: "{{ inventory_hostname }}" - os10_cfg_generate: true - ansible_ssh_user: "{{ ethernet_switch_username }}" - ansible_ssh_password: "{{ ethernet_switch_password }}" + hostname: "{{ inventory_hostname }}" + os10_cfg_generate: true + ansible_ssh_user: "{{ ethernet_switch_username }}" + ansible_ssh_password: "{{ ethernet_switch_password }}" when: os10_snmp is defined diff --git a/network/roles/ethernet/tasks/validate_ethernet_vars.yml b/network/roles/ethernet/tasks/validate_ethernet_vars.yml index fe3eccb89..5ef11dbe1 100644 --- a/network/roles/ethernet/tasks/validate_ethernet_vars.yml +++ b/network/roles/ethernet/tasks/validate_ethernet_vars.yml @@ -40,11 +40,3 @@ ansible.builtin.command: whereis ansible-galaxy changed_when: false register: ansible_galaxy_path - -- name: Install ansible galaxy collection - ansible.builtin.command: "{{ ansible_galaxy_path.stdout.split(' ')[1] }} collection install {{ item }}" - changed_when: true - register: install_ansible_galaxy - until: install_ansible_galaxy is not failed - retries: "{{ max_retries }}" - with_items: "{{ ansible_collections }}" diff --git a/network/roles/ethernet/vars/main.yml b/network/roles/ethernet/vars/main.yml index 89af0ad9f..f87103dcf 100644 --- a/network/roles/ethernet/vars/main.yml +++ b/network/roles/ethernet/vars/main.yml @@ -25,8 +25,6 @@ fail_msg_save_config: save_changes_to_startup variable can only be set to true o fail_msg_ethernet_credentials: 
'command:ansible-playbook ethernet_switch_config.yml -i inventory -e ethernet_switch_username="" -e ethernet_switch_password=""' max_length: 30 min_username_length: 4 -ansible_collections: - - dellemc.os10:1.1.1 max_retries: 20 success_snmp_trap_dest: "SNMP trap destination IP validated" fail_snmp_trap_dest: "Failed. Incorrect SNMP trap destination IP format provided in ethernet input file" diff --git a/network/roles/infiniband/tasks/apply_split_port.yml b/network/roles/infiniband/tasks/apply_split_port.yml index 67acdd9b0..548a325f9 100644 --- a/network/roles/infiniband/tasks/apply_split_port.yml +++ b/network/roles/infiniband/tasks/apply_split_port.yml @@ -22,6 +22,7 @@ # This block will execute when "Split Ready: no". - name: Configure switch to enable splitting + when: not split_ready_status block: - name: Apply changes for changing IB switch to split ready ansible.builtin.include_tasks: configure_split_ready.yml @@ -36,13 +37,13 @@ ansible.builtin.set_fact: ib_login_password: "{{ ib_admin_password }}" no_log: true - when: not split_ready_status - name: Fetch split status of IB switch # Check for "Split Ready: yes/no". ansible.builtin.include_tasks: ib_split_mode.yml # This block will execute when "Split Ready: yes" - name: Configuration switch for port splitting + when: split_ready_status block: - name: Fetch ports on IB switch before port splitting ansible.builtin.include_tasks: fetch_avail_ports.yml @@ -51,6 +52,7 @@ ansible.builtin.include_tasks: authenticate.yml - name: Split ports + when: ib_ports_list | default("", true) | length >= 1 block: - name: Split ports for ndr switch ansible.builtin.include_tasks: split_port_ndr.yml @@ -61,5 +63,3 @@ ansible.builtin.include_tasks: split_port.yml with_items: "{{ ib_ports_list }}" when: ib_switch_type | lower == 'hdr' or ib_switch_type | lower == 'edr' - when: ib_ports_list | default("", true) | length >= 1 - when: split_ready_status diff --git a/network/roles/infiniband/tasks/configure_interface.yml b/network/roles/infiniband/tasks/configure_interface.yml index 303439290..ea714ebda 100644 --- a/network/roles/infiniband/tasks/configure_interface.yml +++ b/network/roles/infiniband/tasks/configure_interface.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Apply interface configuration for "{{ avail_port }}" port of IB - "{{ inventory_hostname }}" +- name: Apply interface configuration for "{{ avail_port }}" port of IB - "{{ inventory_hostname }}" # noqa: name[template] ansible.builtin.uri: url: http://{{ inventory_hostname }}/admin/launch?script=json method: POST diff --git a/network/roles/infiniband/tasks/configure_split_ready.yml b/network/roles/infiniband/tasks/configure_split_ready.yml index 8a09c3f23..04914e1ba 100644 --- a/network/roles/infiniband/tasks/configure_split_ready.yml +++ b/network/roles/infiniband/tasks/configure_split_ready.yml @@ -13,7 +13,7 @@ # limitations under the License. 
--- -- name: Change switch to split ready on "{{ inventory_hostname }}"- This will take 4 mins +- name: Change switch to split ready on "{{ inventory_hostname }}"- This will take 4 mins # noqa: name[template] ansible.builtin.command: "python3 {{ split_ready_loc }} {{ inventory_hostname }} {{ ib_username }} {{ ib_password }}" register: split_status no_log: true diff --git a/network/roles/infiniband/tasks/global_config.yml b/network/roles/infiniband/tasks/global_config.yml index e67373d5c..000e57fb6 100644 --- a/network/roles/infiniband/tasks/global_config.yml +++ b/network/roles/infiniband/tasks/global_config.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Apply configuration for- "{{ inventory_hostname }}" - "{{ item }}" +- name: Apply configuration for- "{{ inventory_hostname }}" - "{{ item }}" # noqa: name[template] ansible.builtin.uri: url: http://{{ inventory_hostname }}/admin/launch?script=json method: POST @@ -29,7 +29,7 @@ } register: global_conf -- name: Status check for - "{{ inventory_hostname }}" - "{{ item }}" +- name: Status check for - "{{ inventory_hostname }}" - "{{ item }}" # noqa: name[template] ansible.builtin.fail: msg: "{{ global_conf.json }}" when: diff --git a/network/roles/infiniband/tasks/interface_config.yml b/network/roles/infiniband/tasks/interface_config.yml index 751619d1e..ae0994605 100644 --- a/network/roles/infiniband/tasks/interface_config.yml +++ b/network/roles/infiniband/tasks/interface_config.yml @@ -19,6 +19,10 @@ default_desc: "{{ 'port ' + item.key.split(' ')[1] }}" - name: Changes for split ports + when: + - 'ib_port + " " not in ports_avail or + split_port1 in ports_avail or + split_port2 in ports_avail' block: - name: Initialize port and description with "{{ item.key + '/1' }}" ansible.builtin.set_fact: @@ -35,12 +39,9 @@ - name: Apply changes on "{{ avail_port }}" ansible.builtin.include_tasks: configure_interface.yml - when: - - 'ib_port + " " not in ports_avail or - split_port1 in ports_avail or - split_port2 in ports_avail' - name: Changes for unsplit ports + when: 'ib_port + " " in ports_avail' block: - name: Initialize port & description with "{{ item.key }}" ansible.builtin.set_fact: @@ -49,4 +50,3 @@ - name: Apply changes on "{{ avail_port }}" ansible.builtin.include_tasks: configure_interface.yml - when: 'ib_port + " " in ports_avail' diff --git a/network/roles/infiniband/tasks/pre_requisites.yml b/network/roles/infiniband/tasks/pre_requisites.yml index 15244935a..8ca8b1b93 100644 --- a/network/roles/infiniband/tasks/pre_requisites.yml +++ b/network/roles/infiniband/tasks/pre_requisites.yml @@ -109,18 +109,15 @@ when: snmp_trap_destination | length > 1 - name: Install netaddr - ansible.builtin.command: "{{ pip_version }} install netaddr" - changed_when: false + ansible.builtin.pip: + name: netaddr + state: present - name: Install common packages ansible.builtin.package: name: "{{ common_pkg }}" state: present -- name: Install ansible.utils galaxy collection - ansible.builtin.command: ansible-galaxy collection install "{{ utils_collection }}" - changed_when: true - - name: Validate snmp trap destination address ansible.builtin.assert: that: diff --git a/network/roles/infiniband/tasks/pre_requisites_split_ports.yml b/network/roles/infiniband/tasks/pre_requisites_split_ports.yml index 55bc58427..5c22826db 100644 --- a/network/roles/infiniband/tasks/pre_requisites_split_ports.yml +++ b/network/roles/infiniband/tasks/pre_requisites_split_ports.yml @@ -50,6 +50,7 @@ split_ready_status: false - name: Fetch ports needs to be 
splitted + when: ib_split_ports | default("", true) | length > 1 block: # This task is used when user do not specify ports as a range - name: Initialize range_ports variable with user provided ports @@ -66,7 +67,6 @@ ansible.builtin.set_fact: ib_ports_list: "{{ lookup('vars', 'item').split(',') | map('trim') | unique | select | list }}" with_items: "{{ range_ports }}" - when: ib_split_ports | default("", true) | length > 1 - name: Validate admin password is not username ansible.builtin.fail: diff --git a/network/roles/infiniband/tasks/save_config.yml b/network/roles/infiniband/tasks/save_config.yml index 425ff82fa..bea5a9271 100644 --- a/network/roles/infiniband/tasks/save_config.yml +++ b/network/roles/infiniband/tasks/save_config.yml @@ -14,6 +14,7 @@ --- - name: Set parameters to save configuration + when: save_changes_to_startup block: - name: Setting parameters to save configuration ansible.builtin.set_fact: @@ -22,4 +23,3 @@ - name: Save current switch configuration to startup-configuration ansible.builtin.include_tasks: global_config.yml with_items: "{{ save_config }}" - when: save_changes_to_startup diff --git a/network/roles/infiniband/tasks/snmp_config.yml b/network/roles/infiniband/tasks/snmp_config.yml index 48adc7de5..a18705892 100644 --- a/network/roles/infiniband/tasks/snmp_config.yml +++ b/network/roles/infiniband/tasks/snmp_config.yml @@ -14,6 +14,7 @@ --- - name: Set parameters for SNMP configuration + when: snmp_trap_destination | length > 1 block: - name: Setting parameters for SNMP configuration ansible.builtin.set_fact: @@ -26,4 +27,3 @@ - name: Configure SNMP on Switch ansible.builtin.include_tasks: global_config.yml with_items: "{{ snmp_config }}" - when: snmp_trap_destination | length > 1 diff --git a/network/roles/infiniband/tasks/split_port.yml b/network/roles/infiniband/tasks/split_port.yml index 9fbc3f669..c52f0c3dc 100644 --- a/network/roles/infiniband/tasks/split_port.yml +++ b/network/roles/infiniband/tasks/split_port.yml @@ -26,8 +26,9 @@ ib_port_name: "{{ '1/' + item }}" - name: Split ports of IB + when: 'temp_port in ports_avail or "-" in ib_port_name' block: - - name: Split "{{ ib_port_name }}" port of IB - "{{ inventory_hostname }}" + - name: Split "{{ ib_port_name }}" port of IB - "{{ inventory_hostname }}" # noqa: name[template] ansible.builtin.uri: url: http://{{ inventory_hostname }}/admin/launch?script=json method: POST @@ -54,7 +55,6 @@ loop_var: results label: "{{ results.executed_command }}" failed_when: false - when: 'temp_port in ports_avail or "-" in ib_port_name' - name: Status msg when port is not available ansible.builtin.assert: diff --git a/network/roles/infiniband/tasks/split_port_ndr.yml b/network/roles/infiniband/tasks/split_port_ndr.yml index a39827acb..99a48f156 100644 --- a/network/roles/infiniband/tasks/split_port_ndr.yml +++ b/network/roles/infiniband/tasks/split_port_ndr.yml @@ -26,8 +26,9 @@ ib_port_name: "{{ '1/' + item }}" - name: Split ports of IB + when: 'temp_port in ports_avail or "-" in ib_port_name' block: - - name: Split "{{ ib_port_name }}" port of IB - "{{ inventory_hostname }}" + - name: Split "{{ ib_port_name }}" port of IB - "{{ inventory_hostname }}" # noqa: name[template] ansible.builtin.uri: url: http://{{ inventory_hostname }}/admin/launch?script=json method: POST @@ -54,7 +55,6 @@ loop_var: results label: "{{ results.executed_command }}" failed_when: false - when: 'temp_port in ports_avail or "-" in ib_port_name' - name: Status msg when port is not available ansible.builtin.assert: diff --git 
a/network/roles/infiniband/vars/main.yml b/network/roles/infiniband/vars/main.yml index 75bc6d230..2b1bf6991 100644 --- a/network/roles/infiniband/vars/main.yml +++ b/network/roles/infiniband/vars/main.yml @@ -31,8 +31,7 @@ success_snmp_trap_dest: "SNMP trap destination IP validated" fail_snmp_trap_dest: "Failed. Incorrect SNMP trap destination IP format provided in base_address.yml" common_pkg: - python3-netaddr -utils_collection: ansible.utils:2.5.2 -pip_version: pip3.8 +python_version: "{{ ansible_python_interpreter }}" success_valid_msg: "Success, Input parameters validated successfully" fail_valid_msg: - '|____________________________________________________________________________________________________________________________________|' diff --git a/network/roles/mlnx_ofed/tasks/initiate_nfs_server.yml b/network/roles/mlnx_ofed/tasks/initiate_nfs_server.yml index 903dc84b4..a4584f645 100644 --- a/network/roles/mlnx_ofed/tasks/initiate_nfs_server.yml +++ b/network/roles/mlnx_ofed/tasks/initiate_nfs_server.yml @@ -28,11 +28,10 @@ # Setup nfs server when nfs_mlnx_ofed status is set to true - name: Setup NFS Server for Rhel compute nodes + when: nfs_mlnx_ofed is true block: - name: Setup nfs server on localhost ansible.builtin.include_tasks: setup_nfs_server.yml - name: Setup mlnx_repo on localhost ansible.builtin.include_tasks: setup_mlnx_ofed_repo.yml - - when: nfs_mlnx_ofed is true diff --git a/network/roles/mlnx_ofed/tasks/install_mlnx_leap.yml b/network/roles/mlnx_ofed/tasks/install_mlnx_leap.yml index 4c7f46728..8222c01aa 100644 --- a/network/roles/mlnx_ofed/tasks/install_mlnx_leap.yml +++ b/network/roles/mlnx_ofed/tasks/install_mlnx_leap.yml @@ -27,7 +27,7 @@ - name: Check IB nic availability block: - name: Check IB nic availability - ansible.builtin.shell: "ip a | grep -o ' ib.'" + ansible.builtin.shell: "ip a | grep -o ' ib.'" # noqa: risky-shell-pipe changed_when: false register: ib_nics diff --git a/network/roles/mlnx_ofed/tasks/install_mlnx_ofed.yml b/network/roles/mlnx_ofed/tasks/install_mlnx_ofed.yml index 4c18f7072..bba9af2e6 100644 --- a/network/roles/mlnx_ofed/tasks/install_mlnx_ofed.yml +++ b/network/roles/mlnx_ofed/tasks/install_mlnx_ofed.yml @@ -43,6 +43,7 @@ ansible.builtin.command: "{{ item + ' -y' }}" with_items: "{{ missing_packages }}" when: missing_packages | default("", true) | length > 0 + changed_when: false - name: Message to user ansible.builtin.fail: @@ -78,7 +79,7 @@ - name: Check IB nic availability block: - name: Check IB nic status - ansible.builtin.shell: "ip a | grep -o ' ib.'" + ansible.builtin.shell: "ip a | grep -o ' ib.'" # noqa: risky-shell-pipe changed_when: false register: ib_nics diff --git a/network/roles/mlnx_ofed/tasks/main.yml b/network/roles/mlnx_ofed/tasks/main.yml index 66418f17e..9896ed767 100644 --- a/network/roles/mlnx_ofed/tasks/main.yml +++ b/network/roles/mlnx_ofed/tasks/main.yml @@ -17,6 +17,9 @@ ansible.builtin.include_vars: "{{ role_path }}/../../../input/network_config.yml" - name: Install mlnx_ofed if not already installed + when: + - os_supported_rhel in ansible_facts['distribution'] | lower + or os_supported_rocky in ansible_facts['distribution'] | lower block: - name: Check status of openibd service ansible.builtin.command: systemctl status openibd @@ -25,6 +28,7 @@ register: openibd_status - name: Setup nfs-client if not already installed + when: "'Active: active' not in openibd_status.stdout" block: - name: Setup nfs client on nodes with infiniband support ansible.builtin.include_tasks: setup_nfs_client.yml @@ -34,12 
+38,6 @@ - name: Install mlnx_ofed on redhat nodes ansible.builtin.include_tasks: install_mlnx_ofed.yml - when: "'Active: active' not in openibd_status.stdout" - - when: - - os_supported_rhel in ansible_facts['distribution'] | lower - or os_supported_rocky in ansible_facts['distribution'] | lower - - name: Install mlnx_ofed on leap nodes ansible.builtin.include_tasks: install_mlnx_leap.yml when: os_supported_leap in ansible_facts['distribution'] | lower diff --git a/network/roles/mlnx_ofed/tasks/setup_nfs_client.yml b/network/roles/mlnx_ofed/tasks/setup_nfs_client.yml index 2408d8327..69e6d2e03 100644 --- a/network/roles/mlnx_ofed/tasks/setup_nfs_client.yml +++ b/network/roles/mlnx_ofed/tasks/setup_nfs_client.yml @@ -31,6 +31,7 @@ - name: Unmount if mount exists already ansible.builtin.command: umount "{{ mlnx_ofed_nfs_path }}" when: mlnx_ofed_nfs_path in mounted_share.stdout + changed_when: false - name: Check if directory exists ansible.builtin.stat: @@ -45,10 +46,8 @@ when: not nfs_directory_stat.stat.exists - name: Mount NFS client - ansible.builtin.command: "mount -o {{ client_mount_options }} -t nfs {{ server_ip }}:{{ mlnx_ofed_nfs_path }} {{ mlnx_ofed_nfs_path }}" + ansible.builtin.command: "mount -o {{ client_mount_options }} -t nfs {{ server_ip }}:{{ mlnx_ofed_nfs_path }} {{ mlnx_ofed_nfs_path }}" # noqa: command-instead-of-module yaml[line-length] changed_when: true - args: - warn: false - name: Add mount configuration to /etc/fstab file ansible.builtin.lineinfile: diff --git a/network/roles/mlnx_ofed/tasks/setup_nfs_server.yml b/network/roles/mlnx_ofed/tasks/setup_nfs_server.yml index a3f792c2b..c93a03745 100644 --- a/network/roles/mlnx_ofed/tasks/setup_nfs_server.yml +++ b/network/roles/mlnx_ofed/tasks/setup_nfs_server.yml @@ -66,7 +66,7 @@ ansible.builtin.set_fact: ib_inventory_start_octets: "{{ groups['redhat_rocky_inventory'][0].split('.')[0:2] | join('.') }}" -# NFS Server IP (control plane IP) should be in same range as that of NFS Clients(compute nodes IP) +# NFS Server IP (Omnia Infrastructure Manager IP) should be in same range as that of NFS Clients(compute nodes IP) - name: Find server IP in the range matching with node_inventory ansible.builtin.set_fact: server_ip: "{{ item }}" diff --git a/network/roles/mlnx_ofed/tasks/validations.yml b/network/roles/mlnx_ofed/tasks/validations.yml index fae183c7e..606af1c92 100644 --- a/network/roles/mlnx_ofed/tasks/validations.yml +++ b/network/roles/mlnx_ofed/tasks/validations.yml @@ -25,6 +25,7 @@ - "'redhat' in hostvars[item].ansible_distribution | lower or 'rocky' in hostvars[item].ansible_distribution | lower" - name: Perform validations when compute nodes have redhat/rocky os installed + when: redhat_node_ip | default([]) | length > 0 block: - name: Set Redhat distro ansible.builtin.set_fact: @@ -40,6 +41,7 @@ when: mlnx_ofed_offline_path | default("", true) | length > 0 - name: Check if correct mlnx_ofed file exists + when: offline_path block: - name: Check if file is .tar file ansible.builtin.assert: @@ -56,9 +58,8 @@ msg: "{{ fail_mlnx_file + mlnx_ofed_offline_path }}" when: not stat_result.stat.exists - when: offline_path - - name: Check if mlnx_ofed version and distro details are given correctly + when: not offline_path block: - name: Check status of mlnx_ofed repo url ansible.builtin.uri: @@ -73,8 +74,6 @@ ansible.builtin.fail: msg: "{{ fail_mlnx_url }}" - when: not offline_path - - name: Set mlnx_ofed_add_kernel_support value ansible.builtin.set_fact: mlnx_ofed_add_kernel_support: "{{
mlnx_ofed_add_kernel_support | lower }}" @@ -87,5 +86,3 @@ - name: Set mlnx_ofed arguments ansible.builtin.set_fact: mlnx_ofed_args: "{{ '--add-kernel-support' if mlnx_ofed_add_kernel_support else '' }}" - - when: redhat_node_ip | default([]) | length > 0 diff --git a/network/tests/test_ethernet.yml b/network/tests/test_ethernet.yml index 719e59bbf..d8f1950c0 100644 --- a/network/tests/test_ethernet.yml +++ b/network/tests/test_ethernet.yml @@ -22,7 +22,7 @@ vars_files: - test_vars/test_ethernet_vars.yml - ../../input/network_config.yml - - ../../input/control_plane_config.yml + - ../../input/oim_config.yml tasks: - name: Check if ethernet_inventory present at the inventory path. diff --git a/network/tests/test_ofed_beegfs.yml b/network/tests/test_ofed_beegfs.yml index c69eb9b22..5c3adf34c 100644 --- a/network/tests/test_ofed_beegfs.yml +++ b/network/tests/test_ofed_beegfs.yml @@ -36,7 +36,7 @@ - name: Decrpyt variable file ansible.legacy.shell: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ omnia_config_file }} + ansible-vault decrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vault_key }} when: "'$ANSIBLE_VAULT;' in config_content.stdout" changed_when: false @@ -51,7 +51,7 @@ - name: Encrypt variable file ansible.legacy.shell: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ omnia_config_file }} + ansible-vault encrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vault_key }} changed_when: false when: "'$ANSIBLE_VAULT;' in config_content.stdout" @@ -132,7 +132,7 @@ - name: Decrpyt variable file ansible.legacy.shell: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ omnia_config_file }} + ansible-vault decrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vault_key }} when: "'$ANSIBLE_VAULT;' in config_content.stdout" changed_when: false @@ -147,7 +147,7 @@ - name: Encrypt variable file ansible.legacy.shell: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ omnia_config_file }} + ansible-vault encrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vault_key }} changed_when: false when: "'$ANSIBLE_VAULT;' in config_content.stdout" diff --git a/network/tests/test_vars/test_ethernet_vars.yml b/network/tests/test_vars/test_ethernet_vars.yml index 9cb48a857..c910bc22d 100644 --- a/network/tests/test_vars/test_ethernet_vars.yml +++ b/network/tests/test_vars/test_ethernet_vars.yml @@ -16,7 +16,7 @@ # vars file for test_ethernet.yml file input_params_folder: "../../input" -Control_plane_dir: "../" +oim_dir: "../" ethernet_validation_script_path: test_ethernet_validation.yml Ethernet_vars_file_path: test_vars/test_ethernet_vars.yml inventory: ../inventory diff --git a/network/tests/test_vars/test_ib_switch_config_vars.yml b/network/tests/test_vars/test_ib_switch_config_vars.yml index 2c6085e5f..c15624718 100644 --- a/network/tests/test_vars/test_ib_switch_config_vars.yml +++ b/network/tests/test_vars/test_ib_switch_config_vars.yml @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
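Review note: the dominant refactor in the infiniband and mlnx_ofed hunks above is mechanical but worth naming. Each `when:` guard moves above its `block:` (ansible-lint's key-order convention, so the condition is visible before the body), and raw `pip3.8 install` commands become the idempotent `ansible.builtin.pip` module. A minimal sketch of the combined pattern; the wrapper task name is hypothetical, the guard and pip task mirror the hunks above:

```yaml
# when: is declared before block:, so the guard reads first.
- name: Configure optional SNMP forwarding   # hypothetical wrapper name
  when: snmp_trap_destination | length > 1
  block:
    # The pip module is idempotent, unlike a raw `pip install` command.
    - name: Install netaddr into the active interpreter
      ansible.builtin.pip:
        name: netaddr
        state: present
```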
--- -login_vars_file: "{{ playbook_dir }}/../../input/control_plane_config.yml" -login_vault_file: "{{ playbook_dir }}/../../input/.control_plane_config_key.yml" +login_vars_file: "{{ playbook_dir }}/../../input/oim_config.yml" +login_vault_file: "{{ playbook_dir }}/../../input/.oim_config_key.yml" file_perm: '0644' username: "" password: "" diff --git a/omnia.yml b/omnia.yml index 54e155288..0ba2c19b2 100644 --- a/omnia.yml +++ b/omnia.yml @@ -13,6 +13,27 @@ # limitations under the License. --- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: utils/check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + +- name: Check if package manager is not locked + ansible.builtin.import_playbook: utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + +- name: Set flag + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + + - name: Set flag to indicate check_package_lock.yml has been executed + ansible.builtin.set_fact: + apt_lock_status: true + - name: Update Inventory with ansible_host information ansible.builtin.import_playbook: utils/servicetag_host_mapping.yml when: not ( update_inventory_executed | default(false) | bool ) @@ -50,6 +71,12 @@ - name: Import playbook to install ROCm on nodes ansible.builtin.import_playbook: utils/rocm_installation.yml +- name: Verify Gaudi nodes + ansible.builtin.import_playbook: utils/verify_intel_gaudi/verify_intel_gaudi_installation.yml + +- name: Import playbook to set performance profile on nodes with Intel Gaudi accelerator + ansible.builtin.import_playbook: utils/performance_profile/performance_profile.yml + - name: Configure security on nodes ansible.builtin.import_playbook: security/security.yml diff --git a/platforms/ansible.cfg b/platforms/ansible.cfg index 4324a0a54..ce834ecbe 100644 --- a/platforms/ansible.cfg +++ b/platforms/ansible.cfg @@ -3,6 +3,7 @@ log_path = /var/log/omnia/platforms.log host_key_checking = false forks = 5 timeout = 180 +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -10,4 +11,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/prepare_cp/roles/configure_proxy/tasks/configure_proxy_rocky.yml b/prepare_cp/roles/configure_proxy/tasks/configure_proxy_rocky.yml deleted file mode 120000 index dc8eabaa9..000000000 --- a/prepare_cp/roles/configure_proxy/tasks/configure_proxy_rocky.yml +++ /dev/null @@ -1 +0,0 @@ -configure_proxy_redhat.yml \ No newline at end of file diff --git a/prepare_cp/roles/install_xcat/common/tasks/pre_requisites.yml b/prepare_cp/roles/install_xcat/common/tasks/pre_requisites.yml deleted file mode 100644 index a95a19a48..000000000 --- a/prepare_cp/roles/install_xcat/common/tasks/pre_requisites.yml +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
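The new plays at the top of omnia.yml implement a run-once guard for imported sub-playbooks: the import fires only while a completion flag is unset, and a localhost play then records the flag. Reduced to its two moving parts (a sketch, not the full omnia.yml sequence):

```yaml
# Import runs only until the flag is set.
- name: Check if virtual environment is active
  ansible.builtin.import_playbook: utils/check_venv.yml
  when: not (check_venv_executed | default(false) | bool)

# A localhost play records completion so later imports skip the check.
- name: Set flag
  hosts: localhost
  connection: local
  gather_facts: false
  tasks:
    - name: Set flag to indicate check_venv.yml has been executed
      ansible.builtin.set_fact:
        check_venv_executed: true
```

Because `set_fact` lands on localhost, guards evaluated against other hosts read the flag through `hostvars['127.0.0.1']`, which is exactly the form the apt_lock_status check uses above.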
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Check selinux status - when: - - control_plane_os in control_plane_os_redhat or - control_plane_os in control_plane_os_rocky - block: - - name: Fetch selinux mode - ansible.builtin.command: sestatus - register: sestatus_current - changed_when: false - - - name: Disable selinux - ansible.builtin.replace: - path: /etc/sysconfig/selinux - regexp: 'SELINUX=[a-z]+' - replace: 'SELINUX=disabled' - when: '"SELinux status: enabled" in sestatus_current.stdout_lines' - - - name: Status of selinux - ansible.builtin.fail: - msg: "{{ selinux_status_fail_msg }}" - when: '"SELinux status: enabled" in sestatus_current.stdout_lines' - -- name: Initialize variables - ansible.builtin.set_fact: - xcat_installation_status: false - validation_status: true - -- name: Check output of network table - ansible.builtin.command: lsdef -t network - changed_when: false - failed_when: false - register: network_table_check - -- name: Restart postgresql and xcat services if xcat already installed - ansible.builtin.service: - name: "{{ item }}" - state: restarted - enabled: true - failed_when: false - with_items: - - postgresql - - xcatd - when: - - network_table_check.stderr is defined - - xcat_connection_search_key in network_table_check.stderr | lower - -- name: Gathering service facts - ansible.builtin.service_facts: - -- name: Check output of site table - ansible.builtin.command: tabdump site - changed_when: false - failed_when: false - register: site_table_check - -- name: Set xcat_installation_status to true - ansible.builtin.set_fact: - xcat_installation_status: true - when: - - xcatd_service in ansible_facts.services - - postgresql_service in ansible_facts.services - - "'running' in ansible_facts.services[xcatd_service].state" - - "'running' in ansible_facts.services[postgresql_service].state" - - "'#key,value' in site_table_check.stdout" diff --git a/prepare_cp/roles/install_xcat/redhat/tasks/configure_postgres.yml b/prepare_cp/roles/install_xcat/redhat/tasks/configure_postgres.yml deleted file mode 100644 index a0954cb51..000000000 --- a/prepare_cp/roles/install_xcat/redhat/tasks/configure_postgres.yml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
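The pre_requisites.yml deleted here is not dropped outright; a reworked copy lands under prepare_oim later in this diff. Its SELinux guard survives the move essentially unchanged: read `sestatus` once, flip the config to disabled, then fail so the operator reboots before xCAT setup continues. The core of that guard, as it appears in both the removed and re-added files:

```yaml
- name: Fetch selinux mode
  ansible.builtin.command: sestatus
  register: sestatus_current
  changed_when: false

# Rewrites the persistent config; the follow-up fail task forces a reboot
# before provisioning continues with SELinux still enforcing.
- name: Disable selinux
  ansible.builtin.replace:
    path: /etc/sysconfig/selinux
    regexp: 'SELINUX=[a-z]+'
    replace: 'SELINUX=disabled'
  when: '"SELinux status: enabled" in sestatus_current.stdout_lines'
```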
---- - -- name: Setup postgres database (This task may take 10 mins) - ansible.builtin.command: "{{ pgsqlsetup_path }} -i -V" - changed_when: true - no_log: true - environment: - XCATPGPW: "{{ postgresdb_password }}" - -- name: Start and enable xcat services in rhel/rocky - ansible.builtin.service: - name: "{{ item }}" - state: started - enabled: true - with_items: "{{ xcat_services }}" - -- name: Create conf directory for postgresql in rhel/rocky - ansible.builtin.file: - path: "{{ postgresql_conf_dir }}" - state: directory - mode: "{{ file_permission }}" - -- name: Copy postgresql.conf in rhel/rocky - ansible.builtin.copy: - src: "{{ postgresql_conf_src }}" - dest: "{{ postgresql_conf_dest }}" - mode: preserve - -- name: Reload systemd - ansible.builtin.systemd: - daemon-reload: true - -- name: Remove .postgres directory - ansible.builtin.file: - path: "{{ postgres_file_path }}" - state: absent - failed_when: false - -- name: Create .postgres directory - ansible.builtin.file: - path: "{{ postgres_file_path }}" - state: directory - mode: "{{ directory_permissions }}" - -- name: Get encrypted_file status - ansible.builtin.stat: - path: "{{ encrypted_file_path }}" - register: key_status - -- name: Invoke python utility to generate key if not exists - ansible.builtin.command: | - {{ python_version }} {{ utility_path }} {{ postgresdb_password }} - changed_when: false - no_log: true - when: not key_status.stat.exists - -- name: Set default pg_hba_conf_update - ansible.builtin.set_fact: - pg_hba_conf_update: false - -- name: Read pg_hba_conf file - ansible.builtin.slurp: - path: "{{ pg_hba_conf_path }}" - register: pg_hba_conf_content - -- name: Set pg_hba_conf_update status - ansible.builtin.set_fact: - pg_hba_conf_update: "{{ pg_hba_conf_content.content | b64decode | regex_search('^local\\s+all\\s+all\\s+md5', multiline=True) is not none }}" - -- name: Check pg_hba_conf_update status - when: pg_hba_conf_update == false - block: - - name: Set PostgreSQL password for postgres user - become: true - become_user: postgres - ansible.builtin.shell: "echo \"ALTER USER postgres WITH PASSWORD '{{ postgresdb_password }}';\" | psql" - no_log: true - - - name: Replace peer to md5 in pg_hba.conf - ansible.builtin.replace: - path: "{{ pg_hba_conf_path }}" - regexp: "^local(.*)all(.*)all(.*)" - replace: "local all all md5" - register: pg_hba_conf_update - -- name: Restart postgresql service - ansible.builtin.systemd: - name: postgresql - state: restarted - when: pg_hba_conf_update.changed # noqa: no-handler diff --git a/prepare_cp/roles/install_xcat/rocky b/prepare_cp/roles/install_xcat/rocky deleted file mode 120000 index 4d0827986..000000000 --- a/prepare_cp/roles/install_xcat/rocky +++ /dev/null @@ -1 +0,0 @@ -redhat \ No newline at end of file diff --git a/prepare_cp/roles/install_xcat/ubuntu/tasks/config_network.yml b/prepare_cp/roles/install_xcat/ubuntu/tasks/config_network.yml deleted file mode 100644 index 8c70e0c23..000000000 --- a/prepare_cp/roles/install_xcat/ubuntu/tasks/config_network.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
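The removed configure_postgres.yml likewise resurfaces under prepare_oim/roles/install_xcat/redhat, where the hunks further down also restore the `no_log` guards that the upgrade-role copy had commented out. Its key-management idiom is the part worth keeping in mind: the Fernet key and encrypted password are generated only when missing, and `no_log` keeps the plaintext password out of Ansible output. A sketch using the role's `python_version` and `utility_path` vars:

```yaml
- name: Get encrypted_file status
  ansible.builtin.stat:
    path: /opt/omnia/.postgres/.encrypted_pwd
  register: key_status

# Generate-only-when-absent keeps reruns from rotating the key underneath
# consumers such as create_omniadb_tables.py.
- name: Invoke python utility to generate key if not exists
  ansible.builtin.command: "{{ python_version }} {{ utility_path }} {{ postgresdb_password }}"
  changed_when: false
  no_log: true
  when: not key_status.stat.exists
```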
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Check confignetwork script applied - ansible.builtin.shell: > - set -o pipefail && \ - cat {{ config_nw_script_path }} | grep -F '{{ config_nw_check_key }}' - changed_when: false - failed_when: false - register: config_nw_check - -- name: Apply config network patch - when: config_nw_check_key not in config_nw_check.stdout - block: - - name: Copy config_nw patch - ansible.builtin.copy: - src: "{{ item.src }}" - dest: "{{ item.dest }}" - mode: "{{ item.mode }}" - with_items: "{{ config_nw_patch_path }}" - - - name: Patch config network file - ansible.builtin.shell: patch confignetwork < confignetwork.patch # noqa: command-instead-of-module - changed_when: true - args: - chdir: "{{ config_nw_path }}" - - - name: Delete confignetwork script patch - ansible.builtin.file: - path: "{{ item.dest }}" - state: absent - with_items: "{{ config_nw_patch_path }}" diff --git a/prepare_cp/roles/pre_requisite/tasks/prepare_cp_status.yml b/prepare_cp/roles/pre_requisite/tasks/prepare_cp_status.yml deleted file mode 100644 index e709b6eed..000000000 --- a/prepare_cp/roles/pre_requisite/tasks/prepare_cp_status.yml +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
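The deleted config_network.yml guarded its shell `patch` call with a marker probe so reruns would not double-apply the patch; the Ubuntu flow later in this diff replaces the whole grep-then-patch sequence with `ansible.posix.patch`. The probe, as it stood, shows the pipefail convention this repo uses for shell pipelines:

```yaml
# set -o pipefail keeps a failing cat from being masked by grep's exit code;
# failed_when: false turns "marker absent" into data rather than an error.
- name: Check confignetwork script applied
  ansible.builtin.shell: >
    set -o pipefail && \
    cat {{ config_nw_script_path }} | grep -F '{{ config_nw_check_key }}'
  changed_when: false
  failed_when: false
  register: config_nw_check
```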
---- - -- name: Set control_plane_os - ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" - -- name: Initialize variables - ansible.builtin.set_fact: - xcat_installation_status: false - prep_inv_status: false - prepare_cp_status: false - -- name: Gathering service facts - ansible.builtin.service_facts: - -- name: Set xcat_installation_status to true - ansible.builtin.set_fact: - xcat_installation_status: true - when: - - control_plane_os == "redhat" or control_plane_os == "rocky" - - xcatd_service in ansible_facts.services - - postgresql_service_rhel in ansible_facts.services - - "'running' in ansible_facts.services[xcatd_service].state" - - "'running' in ansible_facts.services[postgresql_service_rhel].state" - -- name: Set xcat_installation_status to true - ansible.builtin.set_fact: - xcat_installation_status: true - when: - - control_plane_os == "ubuntu" - - xcatd_service in ansible_facts.services - - postgresql_service_ubuntu in ansible_facts.services - - "'running' in ansible_facts.services[xcatd_service].state" - - "'running' in ansible_facts.services[postgresql_service_ubuntu].state" - -- name: Check if inventory files are present - ansible.builtin.stat: - path: "{{ inv_file_path_list }}" - register: stat_result - -- name: Set fact if inventory is present - ansible.builtin.set_fact: - prep_inv_status: true - when: stat_result.stat.exists - -- name: Set fact for prepare cp - ansible.builtin.set_fact: - prepare_cp_status: true - when: - - xcat_installation_status - - prep_inv_status - -- name: Prepare_cp needs to be executed - ansible.builtin.fail: - msg: "{{ prepare_cp_execution_req }}" - when: not prepare_cp_status diff --git a/prepare_oim/ansible.cfg b/prepare_oim/ansible.cfg new file mode 100644 index 000000000..f19bce5bd --- /dev/null +++ b/prepare_oim/ansible.cfg @@ -0,0 +1,15 @@ +[defaults] +log_path = /var/log/omnia/prepare_oim.log +host_key_checking = false +forks = 5 +timeout = 180 +executable = /bin/bash +collections_path = $VIRTUAL_ENV + +[persistent_connection] +command_timeout = 180 +connect_timeout = 180 + +[ssh_connection] +retries = 3 +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/prepare_cp/prepare_cp.yml b/prepare_oim/prepare_oim.yml similarity index 61% rename from prepare_cp/prepare_cp.yml rename to prepare_oim/prepare_oim.yml index f3e272d67..d242c202c 100644 --- a/prepare_cp/prepare_cp.yml +++ b/prepare_oim/prepare_oim.yml @@ -13,7 +13,15 @@ # limitations under the License. 
--- -- name: Prepare control plane for provisioning +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + +- name: Check if package manager is not locked + ansible.builtin.import_playbook: ../utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + +- name: Prepare Omnia Infrastructure Manager for provisioning hosts: localhost connection: local roles: @@ -21,6 +29,6 @@ - role: configure_proxy - role: install_xcat/common # noqa: role-name[path] - role: db_creation - - role: omnia_telemetry_cp - - role: omnia_appliance_cp + - role: omnia_telemetry_oim + - role: omnia_appliance_oim - role: create_inventory diff --git a/prepare_cp/roles/configure_proxy/tasks/configure_proxy_redhat.yml b/prepare_oim/roles/configure_proxy/tasks/configure_proxy_redhat.yml similarity index 100% rename from prepare_cp/roles/configure_proxy/tasks/configure_proxy_redhat.yml rename to prepare_oim/roles/configure_proxy/tasks/configure_proxy_redhat.yml diff --git a/prepare_oim/roles/configure_proxy/tasks/configure_proxy_rocky.yml b/prepare_oim/roles/configure_proxy/tasks/configure_proxy_rocky.yml new file mode 100644 index 000000000..dc8eabaa9 --- /dev/null +++ b/prepare_oim/roles/configure_proxy/tasks/configure_proxy_rocky.yml @@ -0,0 +1 @@ +configure_proxy_redhat.yml \ No newline at end of file diff --git a/prepare_cp/roles/configure_proxy/tasks/configure_proxy_ubuntu.yml b/prepare_oim/roles/configure_proxy/tasks/configure_proxy_ubuntu.yml similarity index 93% rename from prepare_cp/roles/configure_proxy/tasks/configure_proxy_ubuntu.yml rename to prepare_oim/roles/configure_proxy/tasks/configure_proxy_ubuntu.yml index a6698a197..b824b4213 100644 --- a/prepare_cp/roles/configure_proxy/tasks/configure_proxy_ubuntu.yml +++ b/prepare_oim/roles/configure_proxy/tasks/configure_proxy_ubuntu.yml @@ -18,6 +18,10 @@ - name: Apt update ansible.builtin.apt: update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" rescue: - name: Failed to update repo ansible.builtin.fail: diff --git a/prepare_cp/roles/configure_proxy/tasks/configure_user_registry_port.yml b/prepare_oim/roles/configure_proxy/tasks/configure_user_registry_port.yml similarity index 100% rename from prepare_cp/roles/configure_proxy/tasks/configure_user_registry_port.yml rename to prepare_oim/roles/configure_proxy/tasks/configure_user_registry_port.yml diff --git a/prepare_cp/roles/configure_proxy/tasks/main.yml b/prepare_oim/roles/configure_proxy/tasks/main.yml similarity index 100% rename from prepare_cp/roles/configure_proxy/tasks/main.yml rename to prepare_oim/roles/configure_proxy/tasks/main.yml diff --git a/prepare_cp/roles/configure_proxy/vars/main.yml b/prepare_oim/roles/configure_proxy/vars/main.yml similarity index 97% rename from prepare_cp/roles/configure_proxy/vars/main.yml rename to prepare_oim/roles/configure_proxy/vars/main.yml index b2e8985f9..dce496925 100644 --- a/prepare_cp/roles/configure_proxy/vars/main.yml +++ b/prepare_oim/roles/configure_proxy/vars/main.yml @@ -19,6 +19,8 @@ squid_proxy_conf_regxp: "^#(.*)http_access allow localnet" squid_proxy_conf_replace: "http_access allow localnet" repo_update_fail_msg: "Failed. Unable to run apt update. Please make sure all required repos are reachable. Remove or Update unreachable repos configured and re-run the playbook." 
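The apt cache update in configure_proxy_ubuntu.yml is now wrapped in a bounded retry loop, driven by the `repo_retries` and `repo_delay` vars added just below, so a transient mirror failure no longer aborts the play on the first attempt:

```yaml
# Re-runs the module until it succeeds, up to repo_retries attempts,
# pausing repo_delay seconds between tries; the rescue block still
# reports repo_update_fail_msg if every attempt fails.
- name: Apt update
  ansible.builtin.apt:
    update_cache: true
  register: update_repos
  until: update_repos is not failed
  retries: "{{ repo_retries }}"   # 5 in the new vars
  delay: "{{ repo_delay }}"       # 10 seconds
```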
+repo_retries: 5 +repo_delay: 10 # Usage: configure_proxy_redhat.yml nerdctl_registry_restart_fail_msg: "Unable to restart nerdctl-registry service. Please restart nerdctl-registry manually." diff --git a/prepare_cp/roles/create_inventory/tasks/main.yml b/prepare_oim/roles/create_inventory/tasks/main.yml similarity index 79% rename from prepare_cp/roles/create_inventory/tasks/main.yml rename to prepare_oim/roles/create_inventory/tasks/main.yml index 2d7b6fe16..38a14c79a 100644 --- a/prepare_cp/roles/create_inventory/tasks/main.yml +++ b/prepare_oim/roles/create_inventory/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,6 +38,16 @@ when: item.item not in item.content | b64decode changed_when: true +- name: Add header comment to inventory files where header comment is absent + ansible.builtin.lineinfile: + path: "{{ item.invocation.module_args.path }}" + line: "{{ inventory_header }}" + insertbefore: BOF + create: true + mode: "{{ inventory_file_mode }}" + with_items: "{{ inventory_content.results }}" + when: inventory_header not in item.content | b64decode + - name: Add Group Name to inventory files where group name is absent ansible.builtin.lineinfile: path: "{{ item.invocation.module_args.path }}" diff --git a/prepare_cp/roles/create_inventory/vars/main.yml b/prepare_oim/roles/create_inventory/vars/main.yml similarity index 74% rename from prepare_cp/roles/create_inventory/vars/main.yml rename to prepare_oim/roles/create_inventory/vars/main.yml index 8a7bf85bd..3325180ef 100644 --- a/prepare_cp/roles/create_inventory/vars/main.yml +++ b/prepare_oim/roles/create_inventory/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
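create_inventory now stamps every generated inventory file with a header comment. `lineinfile` with `insertbefore: BOF` writes the marker once at the top and is a no-op on reruns when the marker (the `inventory_header` var defined in the hunk below) is already present. A standalone sketch; the path is a hypothetical example, not a value from the role:

```yaml
- name: Add header comment to inventory file where absent
  ansible.builtin.lineinfile:
    path: /opt/omnia/inventory/compute_gpu_intel   # hypothetical path
    line: "# This file is generated by omnia, and should not be edited"
    insertbefore: BOF   # always lands at the top of the file
    create: true        # also covers the file-does-not-exist-yet case
    mode: "444"
```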
@@ -21,7 +21,8 @@ compute_cpu_intel_filename: "compute_cpu_intel" compute_cpu_amd_filename: "compute_cpu_amd" compute_gpu_amd_filename: "compute_gpu_amd" compute_gpu_nvidia_filename: "compute_gpu_nvidia" -compute_servicetag_ip_filename: "compute_servicetag_ip" +compute_gpu_intel_filename: "compute_gpu_intel" +compute_hostname_ip_filename: "compute_hostname_ip" # List of Filename inventory_file_name_list: @@ -29,7 +30,8 @@ inventory_file_name_list: - "{{ compute_cpu_amd_filename }}" - "{{ compute_gpu_amd_filename }}" - "{{ compute_gpu_nvidia_filename }}" - - "{{ compute_servicetag_ip_filename }}" + - "{{ compute_gpu_intel_filename }}" + - "{{ compute_hostname_ip_filename }}" # Filepaths inventory_file_path_list: @@ -37,8 +39,12 @@ inventory_file_path_list: - "{{ inventory_directory_path }}/{{ compute_cpu_amd_filename }}" - "{{ inventory_directory_path }}/{{ compute_gpu_amd_filename }}" - "{{ inventory_directory_path }}/{{ compute_gpu_nvidia_filename }}" - - "{{ inventory_directory_path }}/{{ compute_servicetag_ip_filename }}" + - "{{ inventory_directory_path }}/{{ compute_gpu_intel_filename }}" + - "{{ inventory_directory_path }}/{{ compute_hostname_ip_filename }}" # Permissions inventory_dir_mode: "644" inventory_file_mode: "444" + +# Inventory Header +inventory_header: "# This file is generated by omnia, and should not be edited" diff --git a/prepare_cp/roles/db_creation/files/create_omniadb_tables.py b/prepare_oim/roles/db_creation/files/create_omniadb_tables.py similarity index 93% rename from prepare_cp/roles/db_creation/files/create_omniadb_tables.py rename to prepare_oim/roles/db_creation/files/create_omniadb_tables.py index 22cf5d56c..8261b28cb 100644 --- a/prepare_cp/roles/db_creation/files/create_omniadb_tables.py +++ b/prepare_oim/roles/db_creation/files/create_omniadb_tables.py @@ -12,20 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import sys + db_path = sys.argv[1] sys.path.insert(0, db_path) import omniadb_connection import psycopg2 from cryptography.fernet import Fernet +key_file_path = '/opt/omnia/.postgres/.postgres_pass.key' +pass_file_path = '/opt/omnia/.postgres/.encrypted_pwd' + def create_db(): - with open('/opt/omnia/.postgres/.postgres_pass.key', 'rb') as passfile: + with open(key_file_path, 'rb') as passfile: key = passfile.read() fernet = Fernet(key) - with open('/opt/omnia/.postgres/.encrypted_pwd', 'rb') as datafile: + with open(pass_file_path, 'rb') as datafile: encrypted_file_data = datafile.read() decrypted_pwd = fernet.decrypt(encrypted_file_data).decode() conn = None diff --git a/prepare_cp/roles/db_creation/tasks/main.yml b/prepare_oim/roles/db_creation/tasks/main.yml similarity index 100% rename from prepare_cp/roles/db_creation/tasks/main.yml rename to prepare_oim/roles/db_creation/tasks/main.yml diff --git a/prepare_cp/roles/db_creation/tasks/omniadb_tables_creation.yml b/prepare_oim/roles/db_creation/tasks/omniadb_tables_creation.yml similarity index 100% rename from prepare_cp/roles/db_creation/tasks/omniadb_tables_creation.yml rename to prepare_oim/roles/db_creation/tasks/omniadb_tables_creation.yml diff --git a/prepare_cp/roles/db_creation/vars/main.yml b/prepare_oim/roles/db_creation/vars/main.yml similarity index 94% rename from prepare_cp/roles/db_creation/vars/main.yml rename to prepare_oim/roles/db_creation/vars/main.yml index c11d57119..625630fc9 100644 --- a/prepare_cp/roles/db_creation/vars/main.yml +++ b/prepare_oim/roles/db_creation/vars/main.yml @@ -14,6 +14,6 @@ --- # Usage: omniadb_tables_creation.yml -python_version: "python3.9" +python_version: "{{ ansible_python_interpreter }}" postgres_utility_path: "{{ role_path }}/files/create_omniadb_tables.py" db_path: "{{ role_path }}/../../../discovery/roles/db_operations/files" diff --git a/prepare_cp/roles/install_xcat/common/files/confignetwork.patch b/prepare_oim/roles/install_xcat/common/files/confignetwork.patch similarity index 100% rename from prepare_cp/roles/install_xcat/common/files/confignetwork.patch rename to prepare_oim/roles/install_xcat/common/files/confignetwork.patch diff --git a/prepare_oim/roles/install_xcat/common/files/ddns.patch b/prepare_oim/roles/install_xcat/common/files/ddns.patch new file mode 100644 index 000000000..8f5446c13 --- /dev/null +++ b/prepare_oim/roles/install_xcat/common/files/ddns.patch @@ -0,0 +1,20 @@ +--- ddns.pm 2024-08-29 03:48:16.270158152 +0000 ++++ updated_ddns.pm 2024-08-29 03:48:42.450154921 +0000 +@@ -1284,11 +1284,15 @@ + push @newnamed, "\t};\n"; + my $bind_version_cmd="/usr/sbin/named -v | cut -d' ' -f2 | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+'"; + my @bind_version =xCAT::Utils->runcmd($bind_version_cmd, 0); +- # Turn off DNSSEC if running with bind vers 9.16.6 or higher +- if ((scalar @bind_version > 0) && (xCAT::Utils::CheckVersion($bind_version[0], "9.16.6") >= 0)) { ++ # Turn off DNSSEC if running with bind version greater than or equal to 9.16.6 and less than 9.18.0 ++ if ((scalar @bind_version > 0) && (xCAT::Utils::CheckVersion($bind_version[0], "9.16.6") >= 0) && (xCAT::Utils::CheckVersion($bind_version[0], "9.18.0") < 0)) { + push @newnamed, "\tdnssec-enable no;\n"; + push @newnamed, "\tdnssec-validation no;\n"; + } ++ # Turn off DNSSEC validation if running with bind version greater than or equal to 9.18.0 ++ if ((scalar @bind_version > 0) && (xCAT::Utils::CheckVersion($bind_version[0], "9.18.0") >= 0)) { ++ push @newnamed, "\tdnssec-validation no;\n"; ++ } + } + + if 
($ctx->{forwardmode}){ diff --git a/prepare_cp/roles/install_xcat/common/files/doxcat.patch b/prepare_oim/roles/install_xcat/common/files/doxcat.patch similarity index 100% rename from prepare_cp/roles/install_xcat/common/files/doxcat.patch rename to prepare_oim/roles/install_xcat/common/files/doxcat.patch diff --git a/prepare_cp/roles/install_xcat/common/files/encrypt_pwd.py b/prepare_oim/roles/install_xcat/common/files/encrypt_pwd.py similarity index 100% rename from prepare_cp/roles/install_xcat/common/files/encrypt_pwd.py rename to prepare_oim/roles/install_xcat/common/files/encrypt_pwd.py diff --git a/prepare_cp/roles/install_xcat/common/files/postgresql.conf b/prepare_oim/roles/install_xcat/common/files/postgresql.conf similarity index 100% rename from prepare_cp/roles/install_xcat/common/files/postgresql.conf rename to prepare_oim/roles/install_xcat/common/files/postgresql.conf diff --git a/prepare_cp/roles/install_xcat/common/tasks/main.yml b/prepare_oim/roles/install_xcat/common/tasks/main.yml similarity index 74% rename from prepare_cp/roles/install_xcat/common/tasks/main.yml rename to prepare_oim/roles/install_xcat/common/tasks/main.yml index 663f2f9d0..36a45284e 100644 --- a/prepare_cp/roles/install_xcat/common/tasks/main.yml +++ b/prepare_oim/roles/install_xcat/common/tasks/main.yml @@ -20,12 +20,12 @@ MANPATH: "{{ xcat_manpath_env }}" PERL_BADLANG: "{{ perl_badlang_env }}" block: - - name: Set control_plane_os + - name: Set oim_os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" - - name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/../{{ control_plane_os }}/vars/main.yml" + - name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/../{{ oim_os }}/vars/main.yml" - name: Check xcat installation pre-requisites ansible.builtin.include_tasks: pre_requisites.yml @@ -34,5 +34,5 @@ ansible.builtin.include_tasks: package_installation.yml when: not xcat_installation_status - - name: Install xCAT on {{ control_plane_os }} - ansible.builtin.include_tasks: "{{ role_path }}/../{{ control_plane_os }}/tasks/main.yml" + - name: Install xCAT on {{ oim_os }} + ansible.builtin.include_tasks: "{{ role_path }}/../{{ oim_os }}/tasks/main.yml" diff --git a/prepare_cp/roles/install_xcat/common/tasks/package_installation.yml b/prepare_oim/roles/install_xcat/common/tasks/package_installation.yml similarity index 78% rename from prepare_cp/roles/install_xcat/common/tasks/package_installation.yml rename to prepare_oim/roles/install_xcat/common/tasks/package_installation.yml index 59191a7d2..ccb4ea8f6 100644 --- a/prepare_cp/roles/install_xcat/common/tasks/package_installation.yml +++ b/prepare_oim/roles/install_xcat/common/tasks/package_installation.yml @@ -18,9 +18,9 @@ name: "{{ common_packages_xcat }}" state: present -- name: Install python snmp - ansible.builtin.command: "{{ python_version }} -m pip install {{ snmp_python_package }}" - changed_when: true +# - name: Install python snmp +# ansible.builtin.command: "{{ python_version }} -m pip install {{ snmp_python_package }}" +# changed_when: true - name: Install python postgres ansible.builtin.command: "{{ python_version }} -m pip install {{ postgres_python_package }}" @@ -43,16 +43,8 @@ changed_when: true - name: Install netaddr and pexpect - ansible.builtin.command: "{{ pip_version }} install {{ item }}" + ansible.builtin.command: "{{ python_version }} -m pip install {{ item }}" changed_when: 
true with_items: - "{{ netaddr_pip_package }}" - "{{ pexpect_pip_package }}" - -- name: Install ansible galaxy collection ansible.utils - ansible.builtin.command: ansible-galaxy collection install "{{ item }}" - changed_when: true - register: ansible_collection_install - until: ansible_collection_install is not failed - retries: "{{ max_retries }}" - with_items: "{{ ansible_galaxy_collection }}" diff --git a/prepare_oim/roles/install_xcat/common/tasks/pre_requisites.yml b/prepare_oim/roles/install_xcat/common/tasks/pre_requisites.yml new file mode 100644 index 000000000..00cc05480 --- /dev/null +++ b/prepare_oim/roles/install_xcat/common/tasks/pre_requisites.yml @@ -0,0 +1,136 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check selinux status + when: + - oim_os in oim_os_redhat or + oim_os in oim_os_rocky + block: + - name: Fetch selinux mode + ansible.builtin.command: sestatus + register: sestatus_current + changed_when: false + + - name: Disable selinux + ansible.builtin.replace: + path: /etc/sysconfig/selinux + regexp: 'SELINUX=[a-z]+' + replace: 'SELINUX=disabled' + when: '"SELinux status: enabled" in sestatus_current.stdout_lines' + + - name: Status of selinux + ansible.builtin.fail: + msg: "{{ selinux_status_fail_msg }}" + when: '"SELinux status: enabled" in sestatus_current.stdout_lines' + +- name: Initialize variables + ansible.builtin.set_fact: + xcat_installation_status: false + validation_status: true + +- name: Gathering service facts + ansible.builtin.service_facts: + +- name: Fetch network table entries + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" + changed_when: false + failed_when: false + register: fetch_network + +- name: Try restart postgresql service if not running in Ubuntu + when: + - oim_os == oim_os_ubuntu + - fetch_network.rc != 0 + block: + - name: Try restart postgresql service if not running in Ubuntu + ansible.builtin.systemd: + name: "{{ postgresql_service_ubuntu }}" + state: restarted + register: postgresql_restart + until: postgresql_restart is not failed + retries: "{{ service_retries }}" + when: + - postgresql_service_ubuntu in ansible_facts.services + rescue: + - name: Unable to start postgresql services + ansible.builtin.debug: + msg: "{{ postgresql_start_fail_msg }}" + +- name: Try restart postgresql service if not running in RHEL/Rocky + when: + - oim_os == oim_os_redhat or oim_os == oim_os_rocky + - fetch_network.rc != 0 + block: + - name: Try restart postgresql service if not running in RHEL/Rocky + ansible.builtin.systemd: + name: "{{ postgresql_service_rhel }}" + state: restarted + register: postgresql_restart + until: postgresql_restart is not failed + retries: "{{ service_retries }}" + when: + - postgresql_service_rhel in ansible_facts.services + rescue: + - name: Unable to start postgresql services + ansible.builtin.debug: + msg: "{{ postgresql_start_fail_msg }}" + +- name: Try restart xcatd service if not running + block: + - name: Try 
restart xcatd service if not running + ansible.builtin.systemd: + name: xcatd + state: restarted + register: xcatd_restart + until: xcatd_restart is not failed + retries: "{{ service_retries }}" + when: + - xcatd_service in ansible_facts.services + - "'running' not in ansible_facts.services[xcatd_service].state" + rescue: + - name: Unable to start xcatd services + ansible.builtin.debug: + msg: "{{ xcat_start_fail_msg }}" + +- name: Gathering service facts + ansible.builtin.service_facts: + +- name: Fetch network table entries + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" + changed_when: false + failed_when: false + register: fetch_network + +- name: Set xcat_installation_status to true for RHEL/Rocky + ansible.builtin.set_fact: + xcat_installation_status: true + when: + - oim_os == oim_os_redhat or oim_os == oim_os_rocky + - xcatd_service in ansible_facts.services + - postgresql_service_rhel in ansible_facts.services + - "'running' in ansible_facts.services[xcatd_service].state" + - "'running' in ansible_facts.services[postgresql_service_rhel].state" + - fetch_network.rc == 0 + +- name: Set xcat_installation_status to true for Ubuntu + ansible.builtin.set_fact: + xcat_installation_status: true + when: + - oim_os == oim_os_ubuntu + - xcatd_service in ansible_facts.services + - postgresql_service_ubuntu in ansible_facts.services + - "'running' in ansible_facts.services[xcatd_service].state" + - "'running' in ansible_facts.services[postgresql_service_ubuntu].state" + - fetch_network.rc == 0 diff --git a/prepare_cp/roles/install_xcat/common/vars/main.yml b/prepare_oim/roles/install_xcat/common/vars/main.yml similarity index 76% rename from prepare_cp/roles/install_xcat/common/vars/main.yml rename to prepare_oim/roles/install_xcat/common/vars/main.yml index a549df5ae..4e6be4b39 100644 --- a/prepare_cp/roles/install_xcat/common/vars/main.yml +++ b/prepare_oim/roles/install_xcat/common/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,21 +24,25 @@ selinux_status_fail_msg: "selinux is not disabled. Disable it in /etc/sysconfig/ xcat_connection_search_key: "connection failure" warning_wait_time: 30 reprovision_warning_msg: "[WARNING] xcatd services are not running in the server. Re-provisioning of nodes will be skipped for the given inventory." 
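Note the shape of the rewritten pre_requisites.yml: it does not trust a successful restart on its own. After the retry-bounded restarts, it re-gathers service facts and re-probes the xCAT database, and only a positive probe sets the installation flag. The verification phase, condensed to the RHEL/Rocky path:

```yaml
- name: Gathering service facts
  ansible.builtin.service_facts:

# failed_when: false makes the probe informational; its rc drives the flag.
- name: Fetch network table entries
  ansible.builtin.command: /opt/xcat/bin/lsdef -t network
  changed_when: false
  failed_when: false
  register: fetch_network

- name: Set xcat_installation_status only on a positive probe
  ansible.builtin.set_fact:
    xcat_installation_status: true
  when:
    - "'xcatd.service' in ansible_facts.services"
    - "'running' in ansible_facts.services['xcatd.service'].state"
    - fetch_network.rc == 0
```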
-control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" +oim_os_ubuntu: "ubuntu" +xcat_path: /opt/xcat/bin +xcat_sbin_path: /opt/xcat/sbin +xcatd_service: "xcatd.service" +postgresql_service_rhel: "postgresql.service" +postgresql_service_ubuntu: "postgresql" +postgresql_start_fail_msg: "Failed to start postgresql services" +xcat_start_fail_msg: "Failed to start xcatd services" +service_retries: 3 +python_version: "{{ ansible_python_interpreter }}" -# Usage: package_installation.yml -ansible_galaxy_collection: - - ansible.utils:2.5.2 - - community.general:4.8.7 -python_version: python3.9 -snmp_python_package: easysnmp +# snmp_python_package: easysnmp postgres_python_package: psycopg2-binary requests_python_package: requests pyarrow_python_package: pyarrow pandas_python_package: pandas passlib_python_package: passlib -pip_version: pip3.9 netaddr_pip_package: netaddr pexpect_pip_package: pexpect diff --git a/prepare_cp/roles/install_xcat/redhat/files/xcat-cmdline.patch b/prepare_oim/roles/install_xcat/redhat/files/xcat-cmdline.patch similarity index 100% rename from prepare_cp/roles/install_xcat/redhat/files/xcat-cmdline.patch rename to prepare_oim/roles/install_xcat/redhat/files/xcat-cmdline.patch diff --git a/prepare_cp/roles/install_xcat/redhat/files/xcat-genesis-base-x86_64.tar.gz b/prepare_oim/roles/install_xcat/redhat/files/xcat-genesis-base-x86_64.tar.gz similarity index 100% rename from prepare_cp/roles/install_xcat/redhat/files/xcat-genesis-base-x86_64.tar.gz rename to prepare_oim/roles/install_xcat/redhat/files/xcat-genesis-base-x86_64.tar.gz diff --git a/prepare_cp/roles/install_xcat/redhat/tasks/config_network.yml b/prepare_oim/roles/install_xcat/redhat/tasks/config_network.yml similarity index 100% rename from prepare_cp/roles/install_xcat/redhat/tasks/config_network.yml rename to prepare_oim/roles/install_xcat/redhat/tasks/config_network.yml diff --git a/upgrade/roles/prepare_cp_for_upgrade/tasks/configure_postgres.yml b/prepare_oim/roles/install_xcat/redhat/tasks/configure_postgres.yml similarity index 82% rename from upgrade/roles/prepare_cp_for_upgrade/tasks/configure_postgres.yml rename to prepare_oim/roles/install_xcat/redhat/tasks/configure_postgres.yml index 66b5b45d1..761307d3a 100644 --- a/upgrade/roles/prepare_cp_for_upgrade/tasks/configure_postgres.yml +++ b/prepare_oim/roles/install_xcat/redhat/tasks/configure_postgres.yml @@ -16,7 +16,7 @@ - name: Setup postgres database (This task may take 10 mins) ansible.builtin.command: "{{ pgsqlsetup_path }} -i -V" changed_when: true -# no_log: true + no_log: true environment: XCATPGPW: "{{ postgresdb_password }}" @@ -64,7 +64,7 @@ ansible.builtin.command: | {{ python_version }} {{ utility_path }} {{ postgresdb_password }} changed_when: false -# no_log: true + no_log: true when: not key_status.stat.exists - name: Set default pg_hba_conf_update @@ -83,21 +83,33 @@ - name: Check pg_hba_conf_update status when: not pg_hba_conf_update block: + # Set PostgreSQL password for postgres user - name: Set PostgreSQL password for postgres user become: true become_user: postgres - ansible.builtin.shell: "echo \"ALTER USER postgres WITH PASSWORD '{{ postgresdb_password }}';\" | psql" # noqa: yaml[line-length] risky-shell-pipe - changed_when: false + ansible.builtin.shell: | + set -o pipefail && \ + echo "ALTER USER postgres WITH PASSWORD '{{ postgresdb_password }}';" | psql no_log: true + changed_when: false + register: postgresdb_config - - name: Replace peer to md5 in pg_hba.conf 
+ - name: Replace peer with md5 in pg_hba.conf ansible.builtin.replace: path: "{{ pg_hba_conf_path }}" regexp: "^local(.*)all(.*)all(.*)" replace: "local all all md5" register: pg_hba_conf_update -- name: Restart postgresql service + rescue: + - name: Failed to set PostgreSQL password for postgres user + when: + - postgresdb_config.module_error is defined + - postgresdb_config.module_error | length > 0 + ansible.builtin.fail: + msg: "{{ postgres_set_password_fail_msg }} Error: {{ postgresdb_config.module_error }}" + +- name: Restart PostgreSQL service ansible.builtin.systemd: name: postgresql state: restarted diff --git a/prepare_cp/roles/install_xcat/redhat/tasks/firewall_settings.yml b/prepare_oim/roles/install_xcat/redhat/tasks/firewall_settings.yml similarity index 99% rename from prepare_cp/roles/install_xcat/redhat/tasks/firewall_settings.yml rename to prepare_oim/roles/install_xcat/redhat/tasks/firewall_settings.yml index b9e350594..12fa85bdd 100644 --- a/prepare_cp/roles/install_xcat/redhat/tasks/firewall_settings.yml +++ b/prepare_oim/roles/install_xcat/redhat/tasks/firewall_settings.yml @@ -42,4 +42,3 @@ - name: Message if nerdctl-registry restart fails ansible.builtin.debug: msg: "{{ nerdctl_registry_restart_fail_msg }}" - diff --git a/prepare_cp/roles/install_xcat/redhat/tasks/install_genesis.yml b/prepare_oim/roles/install_xcat/redhat/tasks/install_genesis.yml similarity index 96% rename from prepare_cp/roles/install_xcat/redhat/tasks/install_genesis.yml rename to prepare_oim/roles/install_xcat/redhat/tasks/install_genesis.yml index b2579b182..454b8cdb6 100644 --- a/prepare_cp/roles/install_xcat/redhat/tasks/install_genesis.yml +++ b/prepare_oim/roles/install_xcat/redhat/tasks/install_genesis.yml @@ -38,7 +38,7 @@ - name: Untar genesis base package ansible.builtin.unarchive: src: "{{ xcat_genesis_tar_file }}" - dest: "{{ role_path }}/../{{ control_plane_os }}/files" + dest: "{{ role_path }}/../{{ oim_os }}/files" changed_when: true register: untar_genesis_base until: untar_genesis_base is not failed @@ -90,7 +90,7 @@ with_items: "{{ doxcat_patch_path }}" - name: Create genesis image - ansible.builtin.command: "mknb {{ control_plane_arch }}" + ansible.builtin.command: "{{ xcat_sbin_path }}/mknb {{ oim_arch }}" changed_when: true when: - genesis_image_creation or diff --git a/prepare_cp/roles/install_xcat/redhat/tasks/install_xcat.yml b/prepare_oim/roles/install_xcat/redhat/tasks/install_xcat.yml similarity index 100% rename from prepare_cp/roles/install_xcat/redhat/tasks/install_xcat.yml rename to prepare_oim/roles/install_xcat/redhat/tasks/install_xcat.yml diff --git a/prepare_cp/roles/install_xcat/redhat/tasks/main.yml b/prepare_oim/roles/install_xcat/redhat/tasks/main.yml similarity index 100% rename from prepare_cp/roles/install_xcat/redhat/tasks/main.yml rename to prepare_oim/roles/install_xcat/redhat/tasks/main.yml diff --git a/prepare_cp/roles/install_xcat/redhat/vars/main.yml b/prepare_oim/roles/install_xcat/redhat/vars/main.yml similarity index 94% rename from prepare_cp/roles/install_xcat/redhat/vars/main.yml rename to prepare_oim/roles/install_xcat/redhat/vars/main.yml index f806fcfac..4a8590baa 100644 --- a/prepare_cp/roles/install_xcat/redhat/vars/main.yml +++ b/prepare_oim/roles/install_xcat/redhat/vars/main.yml @@ -24,6 +24,7 @@ common_packages_xcat: - unzip - bzip2 - tar + - chrony common_packages_provision: - python3-netaddr - openssl @@ -97,7 +98,7 @@ xcat_packages: - perl-DBD-Pg # Usage: install_genesis.yml -control_plane_arch: x86_64 +oim_arch: x86_64 
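Both genesis tasks (redhat and, further down, ubuntu) now invoke `mknb` through an absolute path instead of assuming `$PATH` contains /opt/xcat/sbin, with the prefix and architecture kept in vars. In isolation, with the vars inlined for illustration:

```yaml
# Absolute invocation removes the dependency on the caller's PATH,
# which matters when the playbook runs inside a virtual environment.
- name: Create genesis image
  ansible.builtin.command: "{{ xcat_sbin_path }}/mknb {{ oim_arch }}"
  changed_when: true
  vars:
    xcat_sbin_path: /opt/xcat/sbin
    oim_arch: x86_64
```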
xcat_git_version: 2.16.4 xcat_genesis_tar_file: "{{ role_path }}/../redhat/files/xcat-genesis-base-x86_64.tar.gz" xcat_genesis_rpm_file: "{{ role_path }}/../redhat/files/xCAT-genesis-base-x86_64-2.16.4-snap202308211541.noarch.rpm" @@ -108,7 +109,8 @@ doxcat_patch_path: - { src: "{{ role_path }}/../common/files/doxcat.patch", dest: "{{ xcat_bin_path }}/doxcat.patch", mode: "755" } untar_genesis_fail_msg: "Failed. Unable to untar {{ xcat_genesis_tar_file }}. {{ xcat_genesis_tar_file }} not downloaded successfully during prereq.sh or while cloning the github repository. -Please re-run pre_req.sh again and verify the file is downloaded successfully." +Please re-run prereq.sh again and verify the file is downloaded successfully." +xcat_sbin_path: /opt/xcat/sbin # Usage: configure_postgres.yml xcat_services: @@ -118,6 +120,7 @@ xcat_services: - snmpd postgresql_conf_dir: /etc/systemd/system/postgresql.service.d postgresql_conf_dest: "{{ postgresql_conf_dir }}/postgresql.conf" +postgres_set_password_fail_msg: "Failed to set Postgresql password for user postgres." # Usage: configure_postgres.yml pg_hba_conf_path: /var/lib/pgsql/data/pg_hba.conf diff --git a/prepare_oim/roles/install_xcat/rocky b/prepare_oim/roles/install_xcat/rocky new file mode 100644 index 000000000..4d0827986 --- /dev/null +++ b/prepare_oim/roles/install_xcat/rocky @@ -0,0 +1 @@ +redhat \ No newline at end of file diff --git a/prepare_cp/roles/install_xcat/ubuntu/files/omshell.tar.gz b/prepare_oim/roles/install_xcat/ubuntu/files/omshell.tar.gz similarity index 100% rename from prepare_cp/roles/install_xcat/ubuntu/files/omshell.tar.gz rename to prepare_oim/roles/install_xcat/ubuntu/files/omshell.tar.gz diff --git a/prepare_cp/roles/install_xcat/ubuntu/files/root_apt_permission b/prepare_oim/roles/install_xcat/ubuntu/files/root_apt_permission similarity index 100% rename from prepare_cp/roles/install_xcat/ubuntu/files/root_apt_permission rename to prepare_oim/roles/install_xcat/ubuntu/files/root_apt_permission diff --git a/prepare_cp/roles/install_xcat/ubuntu/files/xcat-genesis-base-deb-x86_64.tar.gz b/prepare_oim/roles/install_xcat/ubuntu/files/xcat-genesis-base-deb-x86_64.tar.gz similarity index 100% rename from prepare_cp/roles/install_xcat/ubuntu/files/xcat-genesis-base-deb-x86_64.tar.gz rename to prepare_oim/roles/install_xcat/ubuntu/files/xcat-genesis-base-deb-x86_64.tar.gz diff --git a/server_spec_update/roles/update_node_object/tasks/main.yml b/prepare_oim/roles/install_xcat/ubuntu/tasks/apply_xcat_patch.yml similarity index 77% rename from server_spec_update/roles/update_node_object/tasks/main.yml rename to prepare_oim/roles/install_xcat/ubuntu/tasks/apply_xcat_patch.yml index 7e27d4b61..7b31b8023 100644 --- a/server_spec_update/roles/update_node_object/tasks/main.yml +++ b/prepare_oim/roles/install_xcat/ubuntu/tasks/apply_xcat_patch.yml @@ -13,8 +13,9 @@ # limitations under the License.
--- -- name: Update node object - when: add_network_status - block: - - name: Updating the node objects based on nodes_nic_info - ansible.builtin.include_tasks: update_nodes.yml +- name: Apply xcat patches + ansible.posix.patch: + src: "{{ item.src }}" + dest: "{{ item.dest }}" + state: present + with_items: "{{ xcat_patch_files }}" diff --git a/prepare_cp/roles/install_xcat/ubuntu/tasks/configure_dhcp.yml b/prepare_oim/roles/install_xcat/ubuntu/tasks/configure_dhcp.yml similarity index 100% rename from prepare_cp/roles/install_xcat/ubuntu/tasks/configure_dhcp.yml rename to prepare_oim/roles/install_xcat/ubuntu/tasks/configure_dhcp.yml diff --git a/prepare_cp/roles/install_xcat/ubuntu/tasks/configure_genesis.yml b/prepare_oim/roles/install_xcat/ubuntu/tasks/configure_genesis.yml similarity index 97% rename from prepare_cp/roles/install_xcat/ubuntu/tasks/configure_genesis.yml rename to prepare_oim/roles/install_xcat/ubuntu/tasks/configure_genesis.yml index 417d8251c..43de6e748 100644 --- a/prepare_cp/roles/install_xcat/ubuntu/tasks/configure_genesis.yml +++ b/prepare_oim/roles/install_xcat/ubuntu/tasks/configure_genesis.yml @@ -85,7 +85,7 @@ with_items: "{{ doxcat_patch_path }}" - name: Create genesis image - ansible.builtin.command: "mknb {{ control_plane_arch }}" + ansible.builtin.command: "{{ xcat_sbin_path }}/mknb {{ oim_arch }}" changed_when: true when: - genesis_image_creation or diff --git a/prepare_cp/roles/install_xcat/ubuntu/tasks/configure_postgres.yml b/prepare_oim/roles/install_xcat/ubuntu/tasks/configure_postgres.yml similarity index 82% rename from prepare_cp/roles/install_xcat/ubuntu/tasks/configure_postgres.yml rename to prepare_oim/roles/install_xcat/ubuntu/tasks/configure_postgres.yml index a8aa1ec92..48016a7a7 100644 --- a/prepare_cp/roles/install_xcat/ubuntu/tasks/configure_postgres.yml +++ b/prepare_oim/roles/install_xcat/ubuntu/tasks/configure_postgres.yml @@ -98,22 +98,35 @@ pg_hba_conf_update: "{{ pg_hba_conf_content.content | b64decode | regex_search('^local\\s+all\\s+postgres\\s+md5', multiline=True) is not none }}" - name: Check pg_hba_conf_update status - when: pg_hba_conf_update == false + when: not pg_hba_conf_update block: + # Set PostgreSQL password for postgres user - name: Set PostgreSQL password for postgres user become: true become_user: postgres - ansible.builtin.shell: "echo \"ALTER USER postgres WITH PASSWORD '{{ postgresdb_password }}';\" | psql" + ansible.builtin.shell: | + set -o pipefail && \ + echo "ALTER USER postgres WITH PASSWORD '{{ postgresdb_password }}';" | psql no_log: true + changed_when: false + register: postgresdb_config - - name: Replace peer to md5 in pg_hba.conf + - name: Replace peer with md5 in pg_hba.conf ansible.builtin.replace: path: "{{ pg_hba_conf_path }}" regexp: "^local(.*)all(.*)postgres(.*)" - replace: "local all postgres md5" + replace: "local all postgres md5" register: pg_hba_conf_update -- name: Restart postgresql service + rescue: + - name: Failed to set PostgreSQL password for postgres user + when: + - postgresdb_config.module_error is defined + - postgresdb_config.module_error | length > 0 + ansible.builtin.fail: + msg: "{{ postgres_set_password_fail_msg }} Error: {{ postgresdb_config.module_error }}" + +- name: Restart PostgreSQL service ansible.builtin.systemd: name: "postgresql@{{ psql_version }}-main.service" state: restarted diff --git a/prepare_cp/roles/install_xcat/ubuntu/tasks/configure_rsyslog.yml b/prepare_oim/roles/install_xcat/ubuntu/tasks/configure_rsyslog.yml similarity index 100% rename from 
prepare_cp/roles/install_xcat/ubuntu/tasks/configure_rsyslog.yml rename to prepare_oim/roles/install_xcat/ubuntu/tasks/configure_rsyslog.yml diff --git a/prepare_cp/roles/install_xcat/ubuntu/tasks/install_xcat.yml b/prepare_oim/roles/install_xcat/ubuntu/tasks/install_xcat.yml similarity index 100% rename from prepare_cp/roles/install_xcat/ubuntu/tasks/install_xcat.yml rename to prepare_oim/roles/install_xcat/ubuntu/tasks/install_xcat.yml diff --git a/prepare_cp/roles/install_xcat/ubuntu/tasks/main.yml b/prepare_oim/roles/install_xcat/ubuntu/tasks/main.yml similarity index 87% rename from prepare_cp/roles/install_xcat/ubuntu/tasks/main.yml rename to prepare_oim/roles/install_xcat/ubuntu/tasks/main.yml index 488f7411b..b30991576 100644 --- a/prepare_cp/roles/install_xcat/ubuntu/tasks/main.yml +++ b/prepare_oim/roles/install_xcat/ubuntu/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,12 +24,11 @@ - name: Configure genesis ansible.builtin.include_tasks: configure_genesis.yml -- name: Config network - ansible.builtin.include_tasks: config_network.yml +- name: Apply xcat patch + ansible.builtin.include_tasks: apply_xcat_patch.yml - name: Configure dhcp ansible.builtin.include_tasks: configure_dhcp.yml - name: Configure postgres ansible.builtin.include_tasks: configure_postgres.yml - diff --git a/prepare_cp/roles/install_xcat/ubuntu/vars/main.yml b/prepare_oim/roles/install_xcat/ubuntu/vars/main.yml similarity index 88% rename from prepare_cp/roles/install_xcat/ubuntu/vars/main.yml rename to prepare_oim/roles/install_xcat/ubuntu/vars/main.yml index 989e17782..98e357944 100644 --- a/prepare_cp/roles/install_xcat/ubuntu/vars/main.yml +++ b/prepare_oim/roles/install_xcat/ubuntu/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -61,6 +61,7 @@ xcat_services_ubuntu: postgresql_conf_dir_ubuntu: /etc/systemd/system/postgresql@.service.d postgresql_conf_dest_ubuntu: "{{ postgresql_conf_dir_ubuntu }}/postgresql.conf" pg_hba_conf_path: /etc/postgresql/{{ psql_version }}/main/pg_hba.conf +postgres_set_password_fail_msg: "Failed to set Postgresql password for user postgres." # Usage: configure_rsyslog.yml rsyslog_dest: /etc/rsyslog.conf @@ -80,10 +81,10 @@ omshell_script_dest: /usr/bin/omshell omshell_untar_dest: "{{ role_path }}/../ubuntu/files" untar_omshell_fail_msg: "Failed. Unable to untar {{ omshell_tar_src }}. {{ omshell_tar_src }} not downloaded successfully during prereq.sh or while cloning the github repository. -Please re-run pre_req.sh again and verify the file is downloaded successfully." +Please re-run prereq.sh again and verify the file is downloaded successfully." 
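These Ubuntu vars feed the configure_postgres.yml flow shown earlier, whose gate is a regex_search over pg_hba.conf for an existing md5 rule. A minimal Python rendering of that check (the sample pg_hba line is hypothetical):

```python
import re

def pg_hba_needs_md5(pg_hba_text: str) -> bool:
    """Mirrors the playbook's regex_search('^local\\s+all\\s+postgres\\s+md5',
    multiline=True): True means the peer-to-md5 rewrite and the ALTER USER
    password step still need to run."""
    return re.search(r"^local\s+all\s+postgres\s+md5", pg_hba_text, re.MULTILINE) is None

# A pg_hba.conf fragment still using peer authentication:
assert pg_hba_needs_md5("local   all   postgres   peer\n")
```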
# Usage: configure_genesis.yml -control_plane_arch: x86_64 +oim_arch: x86_64 xcat_bin_path: /opt/xcat/share/xcat/netboot/genesis/x86_64/fs/usr/bin doxcat_script_path: "{{ xcat_bin_path }}/doxcat" doxcat_check_key: 'ipmitool raw 0x00 0x08 0x03 0x08' @@ -96,7 +97,8 @@ xcat_genesis_tar_deb_file: "{{ xcat_genesis_deb_dir }}/xcat-genesis-base-deb-x86 xcat_genesis_deb_file: "{{ xcat_genesis_deb_dir }}/xcat-genesis-base-amd64_2.16.4-snap202308211541_all.deb" untar_genesis_deb_fail_msg: "Failed. Unable to untar {{ xcat_genesis_tar_deb_file }}. {{ xcat_genesis_tar_deb_file }} not downloaded successfully during prereq.sh or while cloning the github repository. -Please re-run pre_req.sh again and verify the file is downloaded successfully." +Please re-run prereq.sh again and verify the file is downloaded successfully." +xcat_sbin_path: /opt/xcat/sbin # Usage: configure_postgres.yml postgres_file_path: "/opt/omnia/.postgres/" @@ -104,9 +106,7 @@ directory_permissions: "0600" utility_path: "{{ role_path }}/files/encrypt_pwd.py" encrypted_file_path: "/opt/omnia/.postgres/.encrypted_pwd" -# Usage: config_network.yml -config_nw_path: "/install/postscripts" -config_nw_script_path: "/install/postscripts/confignetwork" -config_nw_check_key: if [ $is_redhat -eq 1 ] || [ $is_debian -eq 1 ] -config_nw_patch_path: - - { src: "{{ role_path }}/../common/files/confignetwork.patch", dest: "{{ config_nw_path }}/confignetwork.patch", mode: "755" } +# Usage: apply_xcat_patch.yml +xcat_patch_files: + - { src: "{{ role_path }}/../common/files/confignetwork.patch", dest: "/install/postscripts/confignetwork" } + - { src: "{{ role_path }}/../common/files/ddns.patch", dest: "/opt/xcat/lib/perl/xCAT_plugin/ddns.pm" } diff --git a/prepare_cp/roles/omnia_appliance_cp/files/requirements_collections.yml b/prepare_oim/roles/omnia_appliance_oim/files/requirements_collections.yml similarity index 97% rename from prepare_cp/roles/omnia_appliance_cp/files/requirements_collections.yml rename to prepare_oim/roles/omnia_appliance_oim/files/requirements_collections.yml index fbb7b6d91..74f91e340 100644 --- a/prepare_cp/roles/omnia_appliance_cp/files/requirements_collections.yml +++ b/prepare_oim/roles/omnia_appliance_oim/files/requirements_collections.yml @@ -16,4 +16,4 @@ collections: - name: https://github.com/kubernetes-sigs/kubespray type: git - version: v2.23.2 \ No newline at end of file + version: v2.25.0 diff --git a/prepare_oim/roles/omnia_appliance_oim/files/requirements_pip.txt b/prepare_oim/roles/omnia_appliance_oim/files/requirements_pip.txt new file mode 100644 index 000000000..1b141c8d3 --- /dev/null +++ b/prepare_oim/roles/omnia_appliance_oim/files/requirements_pip.txt @@ -0,0 +1,10 @@ +ansible==9.5.1 +cryptography==44.0.0 +jinja2==3.1.4 +jmespath==1.0.1 +jsonschema==4.22.0 +MarkupSafe==2.1.5 +netaddr==1.2.1 +pbr==6.0.0 +ruamel.yaml==0.18.6 +ruamel.yaml.clib==0.2.8 diff --git a/prepare_cp/roles/omnia_appliance_cp/files/requirements_pip.txt b/prepare_oim/roles/omnia_appliance_oim/files/requirements_pip_161.txt similarity index 64% rename from prepare_cp/roles/omnia_appliance_cp/files/requirements_pip.txt rename to prepare_oim/roles/omnia_appliance_oim/files/requirements_pip_161.txt index d94e982db..94ccbf1b8 100644 --- a/prepare_cp/roles/omnia_appliance_cp/files/requirements_pip.txt +++ b/prepare_oim/roles/omnia_appliance_oim/files/requirements_pip_161.txt @@ -1,7 +1,7 @@ -cryptography==41.0.1 +cryptography==44.0.0 jmespath==1.0.1 MarkupSafe==2.1.3 netaddr==0.8.0 pbr==5.11.1 ruamel.yaml==0.17.31 -ruamel.yaml.clib==0.2.7 \ 
No newline at end of file +ruamel.yaml.clib==0.2.7 diff --git a/prepare_cp/roles/omnia_appliance_cp/tasks/main.yml b/prepare_oim/roles/omnia_appliance_oim/tasks/main.yml similarity index 63% rename from prepare_cp/roles/omnia_appliance_cp/tasks/main.yml rename to prepare_oim/roles/omnia_appliance_oim/tasks/main.yml index 855aaab60..7f2c31866 100644 --- a/prepare_cp/roles/omnia_appliance_cp/tasks/main.yml +++ b/prepare_oim/roles/omnia_appliance_oim/tasks/main.yml @@ -13,17 +13,16 @@ # limitations under the License. --- -- name: Install kubespray collection - block: - - name: Install kubespray ansible-collection - ansible.builtin.command: "ansible-galaxy install -r {{ ansible_collection_req_file }}" - retries: "{{ retry_count }}" - changed_when: false - rescue: - - name: Failed to install kubespray ansible-collection - ansible.builtin.debug: - msg: "{{ ansible_collection_fail_msg }}" +- name: Check virtual ENV + ansible.builtin.set_fact: + venv_path: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}" -- name: Install required pip modules +- name: Install required pip modules for omnia1.6.1 venv + when: "'omnia161_venv' in venv_path" + ansible.builtin.command: "{{ python_version }} -m pip install -r {{ pip_modules_req_file_161 }}" + changed_when: false + +- name: Install required pip modules for omnia1.7 venv + when: "'omnia17_venv' in venv_path" ansible.builtin.command: "{{ python_version }} -m pip install -r {{ pip_modules_req_file }}" changed_when: false diff --git a/prepare_cp/roles/omnia_appliance_cp/vars/main.yml b/prepare_oim/roles/omnia_appliance_oim/vars/main.yml similarity index 88% rename from prepare_cp/roles/omnia_appliance_cp/vars/main.yml rename to prepare_oim/roles/omnia_appliance_oim/vars/main.yml index 42985a388..8cb025596 100644 --- a/prepare_cp/roles/omnia_appliance_cp/vars/main.yml +++ b/prepare_oim/roles/omnia_appliance_oim/vars/main.yml @@ -15,7 +15,8 @@ # Usage: main.yml ansible_collection_req_file: "{{ role_path }}/files/requirements_collections.yml" -python_version: python3.9 +python_version: "{{ ansible_python_interpreter }}" pip_modules_req_file: "{{ role_path }}/files/requirements_pip.txt" +pip_modules_req_file_161: "{{ role_path }}/files/requirements_pip_161.txt" retry_count: 5 ansible_collection_fail_msg: "Failed to install kubespray ansible-collection. Please try installing it manually or re-run the playbook again." 
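The tasks above pick a requirements file by inspecting VIRTUAL_ENV, and the vars now take the interpreter from ansible_python_interpreter instead of hardcoding python3.9. A sketch of the same selection logic in plain Python (file names as in the role; their locations are assumed):

```python
import os
import subprocess
import sys

# Requirements files as named in the role; real paths resolve under role_path/files.
PIP_REQS = {
    "omnia161_venv": "requirements_pip_161.txt",  # Omnia 1.6.1 venv
    "omnia17_venv": "requirements_pip.txt",       # Omnia 1.7 venv
}

def install_matching_requirements():
    """Install the pip requirements matching the active Omnia venv,
    using that venv's own interpreter."""
    venv_path = os.environ.get("VIRTUAL_ENV", "")
    for marker, req_file in PIP_REQS.items():
        if marker in venv_path:
            subprocess.run([sys.executable, "-m", "pip", "install", "-r", req_file],
                           check=True)
            return
    raise RuntimeError("no supported Omnia virtual environment is active")
```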
diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/collector.py b/prepare_oim/roles/omnia_telemetry_oim/files/collector.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/collector.py rename to prepare_oim/roles/omnia_telemetry_oim/files/collector.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/common_logging.py b/prepare_oim/roles/omnia_telemetry_oim/files/common_logging.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/common_logging.py rename to prepare_oim/roles/omnia_telemetry_oim/files/common_logging.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/common_parser.py b/prepare_oim/roles/omnia_telemetry_oim/files/common_parser.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/common_parser.py rename to prepare_oim/roles/omnia_telemetry_oim/files/common_parser.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/common_security.py b/prepare_oim/roles/omnia_telemetry_oim/files/common_security.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/common_security.py rename to prepare_oim/roles/omnia_telemetry_oim/files/common_security.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_amd_gpu.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_amd_gpu.py similarity index 94% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_amd_gpu.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_amd_gpu.py index 3a93d69f5..3afa07e74 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_amd_gpu.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_amd_gpu.py @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
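The hunks that follow prefix every rocm-smi invocation with /opt/rocm/bin/, presumably because the telemetry binary runs as a systemd service where /opt/rocm/bin is not on PATH. The pattern, reduced to a helper (a sketch, not the collector's code):

```python
import subprocess

ROCM_BIN_PATH = "/opt/rocm/bin/"  # same constant the collectors introduce

def run_rocm_smi(flags):
    """Invoke rocm-smi by absolute path and return its CSV output (or None),
    so queries also work when PATH lacks /opt/rocm/bin."""
    cmd = [ROCM_BIN_PATH + "rocm-smi", *flags, "--csv"]
    result = subprocess.run(cmd, capture_output=True, text=True, check=False)
    return result.stdout.strip() or None

# e.g. run_rocm_smi(["-t"]) for temperatures, run_rocm_smi(["-u"]) for utilization
```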
@@ -22,12 +22,13 @@ # --------------------------------AMD GPU metric collection--------------------------------- +rocm_bin_path = "/opt/rocm/bin/" def get_amd_gpu_temp(): ''' This method collects amd gpu temp from rocm query output and stores it in gpu metric dictionary ''' - amd_metrics_query = "rocm-smi -t --csv" + amd_metrics_query = rocm_bin_path + "rocm-smi -t --csv" command_result = invoke_commands.run_command(amd_metrics_query) if command_result is not None: gpu_temp = {} @@ -92,7 +93,7 @@ def get_amd_gpu_utilization(): This method collects amd gpu utilization from rocm query output and stores it in gpu metric dictionary ''' - amd_metrics_query = "rocm-smi -u --csv" + amd_metrics_query = rocm_bin_path + "rocm-smi -u --csv" command_result = invoke_commands.run_command(amd_metrics_query) if command_result is not None: try: @@ -127,16 +128,16 @@ def get_gpu_health_driver(): ''' This method collects amd gpu driver health from rocm query output ''' - amd_metrics_query = "rocm-smi --showdriverversion --csv" + amd_metrics_query = "/opt/rocm/bin/rocm-smi --showdriverversion --csv -t" command_result = invoke_commands.run_command(amd_metrics_query) - list_info = invoke_commands.run_command("rocm-smi -i --csv") + list_info = invoke_commands.run_command("/opt/rocm/bin/rocm-smi -i --csv") gpu_driver = {} if command_result is not None and list_info is not None: try: command_result_df = common_parser.get_df_format(command_result) gpu_util_list = common_parser.get_col_from_df(command_result_df, 'Driver version') list_info_df = common_parser.get_df_format(list_info) - gpu_list = common_parser.get_col_from_df(list_info_df, 'GPU ID') + gpu_list = common_parser.get_col_from_df(list_info_df, 'Device ID') for index,item in enumerate(gpu_list): gpu_driver[index] = gpu_util_list[0] return gpu_driver @@ -157,7 +158,7 @@ def get_gpu_health_pcie(): ''' This method collects amd gpu pcie health from rocm query output ''' - amd_metrics_query = "rocm-smi --showbus --csv" + amd_metrics_query = rocm_bin_path + "rocm-smi --showbus --csv" command_result = invoke_commands.run_command(amd_metrics_query) if command_result is not None: try: @@ -181,7 +182,7 @@ def get_gpu_health_power(): ''' This method collects amd gpu power health from rocm query output ''' - amd_metrics_query = "rocm-smi -P -M --csv" + amd_metrics_query = rocm_bin_path + "rocm-smi -P -M --csv" command_result = invoke_commands.run_command(amd_metrics_query) if command_result is not None: try: @@ -201,7 +202,7 @@ def get_gpu_health_thermal(): ''' This method collects amd gpu thermal health from rocm query output ''' - amd_metrics_query = "rocm-smi -t --csv" + amd_metrics_query = rocm_bin_path + "rocm-smi -t --csv" command_result = invoke_commands.run_command(amd_metrics_query) if command_result is not None: command_result_df = common_parser.get_df_format(command_result) diff --git a/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_amd_proc_acc.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_amd_proc_acc.py new file mode 100644 index 000000000..a3336927a --- /dev/null +++ b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_amd_proc_acc.py @@ -0,0 +1,95 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +Module to gather amd gpu metrics +''' + +import common_parser +import invoke_commands +import common_logging + +# --------------------------------AMD GPU metric collection--------------------------------- + +rocm_bin_path = "/opt/rocm/bin/" + +def get_amd_gpu_temp(): + ''' + This method collects amd gpu temp from rocm query output + and stores it in gpu metric dictionary + ''' + amd_metrics_query = rocm_bin_path + "rocm-smi -t --csv" + command_result = invoke_commands.run_command(amd_metrics_query) + if command_result is not None: + gpu_temp = {} + command_result_df = common_parser.get_df_format(command_result) + try: + gpu_temp['sensor_junction'] = common_parser.get_col_from_df(command_result_df, + 'Temperature (Sensor junction) (C)') + except Exception as err: + gpu_temp['sensor_junction'] = None + common_logging.log_error("data_collector_amd_proc_acc:get_amd_gpu_temp", + "could not parse sensor_junction temp from rocm-smi. " + str(err)) + try: + gpu_temp['sensor_memory'] = common_parser.get_col_from_df(command_result_df, + 'Temperature (Sensor memory) (C)') + except Exception as err: + gpu_temp['sensor_memory'] = None + common_logging.log_error("data_collector_amd_proc_acc:get_amd_gpu_temp", + "could not parse sensor_memory temp from rocm-smi. " + str(err)) + return gpu_temp + + common_logging.log_error("data_collector_amd_proc_acc:get_amd_gpu_temp", + "rocm-smi command did not give output for gpu temperature metrics.") + return None + +# -------------------------------AMD GPU health metric collection------------------------------- + +def get_gpu_health_power(): + ''' + This method collects amd gpu power health from rocm query output + ''' + amd_metrics_query = rocm_bin_path + "rocm-smi -P -M --csv" + command_result = invoke_commands.run_command(amd_metrics_query) + if command_result is not None: + try: + command_result_df = common_parser.get_df_format(command_result) + gpu_util_list_max = common_parser.get_col_from_df(command_result_df, + 'Max Graphics Package Power (W)') + gpu_util_list_avg = common_parser.get_col_from_df(command_result_df, + 'Current Socket Graphics Package Power (W)') + return gpu_util_list_max,gpu_util_list_avg + except Exception as err: + common_logging.log_error("data_collector_amd_proc_acc:get_gpu_health_power", + "could not parse gpu power health from rocm-smi. "+str(err)) + return None,None + return None,None + +def get_gpu_health_thermal(): + ''' + This method collects amd gpu thermal health from rocm query output + ''' + amd_metrics_query = rocm_bin_path + "rocm-smi -t --csv" + command_result = invoke_commands.run_command(amd_metrics_query) + if command_result is not None: + command_result_df = common_parser.get_df_format(command_result) + try: + gpu_temp = common_parser.get_col_from_df(command_result_df, + 'Temperature (Sensor junction) (C)') + return gpu_temp + except Exception as err: + common_logging.log_error("data_collector_amd_proc_acc:get_gpu_health_thermal", + "could not parse sensor_junction temp from rocm-smi. 
" + str(err)) + return None + return None diff --git a/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_gaudi.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_gaudi.py new file mode 100644 index 000000000..692e55c73 --- /dev/null +++ b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_gaudi.py @@ -0,0 +1,136 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +''' +Module to gather gaudi metrics +''' +import shlex +import common_parser +import invoke_commands +import common_logging + +def get_col_data(cmd_output, column): + if cmd_output is None: + return None + return common_parser.get_col_from_df(cmd_output, column) + +# --------------------------------Gaudi metric collection--------------------------------- +def get_gaudi_metrics_output(): + ''' + This method collects command output for hl-smi command for gaudi metrics + :return: gaudi query output + ''' + gaudi_metrics_query = "hl-smi --query-aip=Name,driver_version,bus_id,power.draw,temperature.aip,utilization.aip --format=csv,nounits" + command_result = invoke_commands.call_command(gaudi_metrics_query) + if command_result is None: + common_logging.log_error("data_collector_gaudi:get_gaudi_metrics_output", + "hl-smi command did not give output for gaudi metrics.") + return None + return common_parser.get_df_format(command_result) + +def get_gaudi_temp(gaudi_metrics_cmd_output): + ''' + This method collects gaudi temp from gaudi query output + :param gaudi_metrics_cmd_output: gaudi query output + ''' + return get_col_data(gaudi_metrics_cmd_output, 'temperature.aip [C]') + +def get_gaudi_utilization(gaudi_metrics_cmd_output): + ''' + This method collects gaudi utilization from gaudi query output + :param gaudi_metrics_cmd_output: gaudi query output + ''' + return get_col_data(gaudi_metrics_cmd_output, 'utilization.aip [%]') + +def get_gaudi_avg_utilization(gaudi_metrics_cmd_output): + ''' + This method calculates average gaudi utilization on the node + :param gaudi_metrics_cmd_output: gaudi query output + ''' + gaudi_util_list = get_col_data(gaudi_metrics_cmd_output, 'utilization.aip [%]') + if gaudi_util_list is not None and len(gaudi_util_list) != 0: + return sum(gaudi_util_list)/len(gaudi_util_list) + return None + + +# ------------------------------- Gaudi health metric collection------------------------------- +def get_gpu_health_driver(gaudi_metrics_cmd_output): + ''' + This method collects gaudi driver health from gaudi query output + ''' + return get_col_data(gaudi_metrics_cmd_output, 'driver_version') + +def get_gpu_health_pcie(gaudi_metrics_cmd_output): + ''' + This method collects gaudi pcie from gaudi query output + ''' + return get_col_data(gaudi_metrics_cmd_output, 'bus_id') + +def get_gpu_health_power(gaudi_metrics_cmd_output): + ''' + This method collects gaudi power from gaudi query output + ''' + gaudi_power_draw_list = get_col_data(gaudi_metrics_cmd_output, 'power.draw [W]') + gaudi_pci_list = get_col_data(gaudi_metrics_cmd_output, 'bus_id') + if 
gaudi_power_draw_list is None: + return None,None + if gaudi_pci_list is None: + return None,None + gaudi_power_limit_list = [] + for pci in gaudi_pci_list: + pci = shlex.quote(pci).strip("'\"") + cmd = ["hl-smi", "-q", "-d", "POWER", "-i", pci] + ''' + grep "Power Limit" cannot work in the systemd process + so just find the first "Power Limit" and use the substring + Expected output should be like: + ================ HL-SMI LOG ================ + ... + " Power Limit : 550 W\n" + ... + find the line and capture the value 550 + ''' + power_limit_output = invoke_commands.run_command(cmd) + if power_limit_output is None: + return None,None + prefix = "Power Limit" + num_idx = power_limit_output.find(prefix) + if num_idx == -1: + return None,None + power_limit_output = power_limit_output[num_idx:] + prefix = ": " + num_idx = power_limit_output.find(prefix) + if num_idx == -1: + return None,None + num_idx += len(prefix) + sub_str = power_limit_output[num_idx:] + unit_idx = sub_str.find(" W") + if unit_idx == -1: + return None,None + power_limit = sub_str[:unit_idx] + try: + float(power_limit) + except ValueError: + return None,None + gaudi_power_limit_list.append(power_limit) + if len(gaudi_power_limit_list) > 0: + return (gaudi_power_limit_list, gaudi_power_draw_list) + return None,None + +def get_gpu_health_thermal(gaudi_metrics_cmd_output): + ''' + This method collects gaudi thermal from gaudi query output + ''' + return get_col_data(gaudi_metrics_cmd_output, 'temperature.aip [C]') diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_kubernetes.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_kubernetes.py similarity index 93% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_kubernetes.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_kubernetes.py index 4c515de1a..ea5eaf102 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_kubernetes.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_kubernetes.py @@ -96,16 +96,20 @@ def get_kubectl_get_nodes(): total_nodes=len(nodes_json["items"]) if total_nodes>0: for index in range(total_nodes): + scheduling_disabled = "False" + # Check if node is unschedulable + if "unschedulable" in nodes_json["items"][index]["spec"].keys(): + scheduling_disabled = nodes_json["items"][index]["spec"]["unschedulable"] #Get the status and check if it is "Ready" or not kubelet_status = next( key for key in nodes_json["items"][index]["status"]["conditions"] if key["type"] == "Ready") - if kubelet_status["status"] != "True": + if (scheduling_disabled != "False") or (kubelet_status["status"] != "True"): flag_all_nodes_up = "False" # In case single node is present, then that is both master and child node if total_nodes==1: flag_child_nodes_up = "False" else: #Check if non Ready node is a child node - if "node-role.kubernetes.io/master" not in nodes_json["items"][index]["metadata"]["labels"].keys(): + if "node-role.kubernetes.io/control-plane" not in nodes_json["items"][index]["metadata"]["labels"].keys(): flag_child_nodes_up = "False" #break since we found non ready status in child nodes break diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_nvidia_gpu.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_nvidia_gpu.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_nvidia_gpu.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_nvidia_gpu.py diff --git 
a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_os.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_os.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_os.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_os.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_psutil.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_psutil.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_psutil.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_psutil.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_slurm.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_slurm.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_slurm.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_slurm.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_smart.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_smart.py similarity index 99% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_smart.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_smart.py index cbda1fb18..22a1ea110 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_smart.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_smart.py @@ -86,4 +86,4 @@ def get_using_smartctl(parameter): common_logging.log_error("data_collector_smart:get_using_smartctl",command+ " output is None") else: common_logging.log_error("data_collector_smart:get_using_smartctl","smartctl scan output is None") - return dict_smartctl \ No newline at end of file + return dict_smartctl diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_storage.py b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_storage.py similarity index 99% rename from prepare_cp/roles/omnia_telemetry_cp/files/data_collector_storage.py rename to prepare_oim/roles/omnia_telemetry_oim/files/data_collector_storage.py index 87d3131a4..7343c97a3 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/data_collector_storage.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/data_collector_storage.py @@ -61,4 +61,4 @@ def get_beegfs_details(): common_logging.log_error("data_collector_storage:get_beegfs_details", "beegfs gave error" + error_str) - return beegfs_op_dict \ No newline at end of file + return beegfs_op_dict diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/dbupdate.py b/prepare_oim/roles/omnia_telemetry_oim/files/dbupdate.py similarity index 98% rename from prepare_cp/roles/omnia_telemetry_cp/files/dbupdate.py rename to prepare_oim/roles/omnia_telemetry_oim/files/dbupdate.py index c65500be5..f7f1afda2 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/dbupdate.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/dbupdate.py @@ -110,7 +110,7 @@ def db_insert(self, db_query): def update_db(self, combined_result_dict,combined_unit_dict, service_tag, hostname): ''' - This module updates the Timescaledb on the control plane with telemetry data + This module updates the Timescaledb on the Omnia Infrastructure Manager with telemetry data Args: Combined metric dictionary {dict} diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/gpu_metric_collector.py b/prepare_oim/roles/omnia_telemetry_oim/files/gpu_metric_collector.py similarity index 55% rename from 
prepare_cp/roles/omnia_telemetry_cp/files/gpu_metric_collector.py rename to prepare_oim/roles/omnia_telemetry_oim/files/gpu_metric_collector.py index c9432549b..1437b4dbd 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/gpu_metric_collector.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/gpu_metric_collector.py @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ import math import data_collector_nvidia_gpu import data_collector_amd_gpu +import data_collector_amd_proc_acc +import data_collector_gaudi import utility import prerequisite @@ -98,6 +100,75 @@ def get_amd_metrics(self): else: self.gpu_metric_output_dict["gpu_utilization:average"] = utility.Result.NO_DATA.value + def get_amd_proc_acc_metrics(self): + ''' + This method collects all the AMD gpu metrics + ''' + # get temperature details for AMD GPU + gpu_temp = data_collector_amd_proc_acc.get_amd_gpu_temp() + if gpu_temp is not None: + for keys, values in gpu_temp.items(): + for index, value in enumerate(values): + if not math.isnan(float(value)): + self.gpu_metric_output_dict['gpu_temperature:' + keys + ':gpu' + str(index)] = str(value) + self.gpu_unit['gpu_temperature:' + keys] = "C" + else: + self.gpu_metric_output_dict[ + 'gpu_temperature:' + keys + ':gpu' + str(index)] = utility.Result.NO_DATA.value + else: + self.gpu_metric_output_dict["gpu_temperature:gpu"] = utility.Result.NO_DATA.value + + # This function is same as in data_collector_amd_gpu + # get utilization details for AMD GPU + gpu_util = data_collector_amd_gpu.get_amd_gpu_utilization() + if gpu_util is not None: + for index, item in enumerate(gpu_util): + self.gpu_metric_output_dict["gpu_utilization:gpu" + str(index)] = str(item) + self.gpu_unit["gpu_utilization:gpu"] = "percent" + else: + self.gpu_metric_output_dict["gpu_utilization:gpu"] = utility.Result.NO_DATA.value + + # This function is same as in data_collector_amd_gpu + # get average of utilization of all GPUs in the system + gpu_avg_util = data_collector_amd_gpu.get_amd_gpu_avg_utilization(gpu_util) + if gpu_avg_util is not None: + self.gpu_metric_output_dict["gpu_utilization:average"] = str(gpu_avg_util) + self.gpu_unit["gpu_utilization:average"] = "percent" + else: + self.gpu_metric_output_dict["gpu_utilization:average"] = utility.Result.NO_DATA.value + + def get_gaudi_metrics(self): + ''' + This method collects all the gaudi metrics + ''' + # run hl-smi command and store output in a variable + gaudi_metrics_cmd_output = data_collector_gaudi.get_gaudi_metrics_output() + + # get temperature details for Gaudi + gpu_temp = data_collector_gaudi.get_gaudi_temp(gaudi_metrics_cmd_output) + if gpu_temp is not None: + for index, item in enumerate(gpu_temp): + self.gpu_metric_output_dict["gpu_temperature:gpu" + str(index)] = str(item) + self.gpu_unit["gpu_temperature"] = "C" + else: + self.gpu_metric_output_dict["gpu_temperature:gpu"] = utility.Result.NO_DATA.value + + # get utilization details for Gaudi + gpu_util = data_collector_gaudi.get_gaudi_utilization(gaudi_metrics_cmd_output) + if gpu_util is not None: + for index, item in enumerate(gpu_util): + self.gpu_metric_output_dict["gpu_utilization:gpu" + str(index)] = str(item) + self.gpu_unit["gpu_utilization"] = "percent" + else: + self.gpu_metric_output_dict["gpu_utilization:gpu"] = 
utility.Result.NO_DATA.value + + # get average of utilization of all Gaudi in the system + gpu_avg_util = data_collector_gaudi.get_gaudi_avg_utilization(gaudi_metrics_cmd_output) + if gpu_avg_util is not None: + self.gpu_metric_output_dict["gpu_utilization:average"] = str(gpu_avg_util) + self.gpu_unit["gpu_utilization"] = "percent" + else: + self.gpu_metric_output_dict["gpu_utilization:average"] = utility.Result.NO_DATA.value def metric_collector(self, aggregation_level): ''' This method collects all the gpu metric parameters. @@ -108,9 +179,15 @@ def metric_collector(self, aggregation_level): # Run only when amd gpu present if prerequisite.dict_component_existence['amdgpu']: self.get_amd_metrics() + # Run only when amd processing accelerator present + if prerequisite.dict_component_existence['amd_proc_acc']: + self.get_amd_proc_acc_metrics() + # Run only when gaudi present + if prerequisite.dict_component_existence['intelgaudi']: + self.get_gaudi_metrics() if prerequisite.dict_component_existence['nvidiagpu'] is False and prerequisite.dict_component_existence[ - 'amdgpu'] is False: + 'amdgpu'] is False and prerequisite.dict_component_existence['amd_proc_acc'] is False and prerequisite.dict_component_existence['intelgaudi'] is False: self.gpu_metric_output_dict["gpu_temperature"] = utility.Result.NO_DATA.value self.gpu_metric_output_dict["gpu_utilization"] = utility.Result.NO_DATA.value - self.gpu_metric_output_dict["gpu_utilization:average"] = utility.Result.NO_DATA.value \ No newline at end of file + self.gpu_metric_output_dict["gpu_utilization:average"] = utility.Result.NO_DATA.value diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/health_check_metric_collector.py b/prepare_oim/roles/omnia_telemetry_oim/files/health_check_metric_collector.py similarity index 82% rename from prepare_cp/roles/omnia_telemetry_cp/files/health_check_metric_collector.py rename to prepare_oim/roles/omnia_telemetry_oim/files/health_check_metric_collector.py index ce1c48826..1eefa1e84 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/health_check_metric_collector.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/health_check_metric_collector.py @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
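data_collector_gaudi.get_gpu_health_power() earlier in this diff scans `hl-smi -q -d POWER` output with repeated find() calls because, as its comment notes, grep cannot be used from the systemd process. A more compact regex-based extraction, assuming the 'Power Limit : 550 W' line format quoted in the collector's own comment:

```python
import re

POWER_LIMIT_RE = re.compile(r"Power Limit\s*:\s*([0-9.]+)\s*W")

def parse_power_limit(hl_smi_power_output: str):
    """Pull the wattage out of `hl-smi -q -d POWER -i <bus_id>` text;
    returns None when no 'Power Limit : <n> W' line is present, matching
    the collector's None fallbacks."""
    match = POWER_LIMIT_RE.search(hl_smi_power_output)
    return match.group(1) if match else None

# The line format quoted in the collector's comment:
assert parse_power_limit(" Power Limit : 550 W\n") == "550"
```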
@@ -24,6 +24,8 @@ import utility import data_collector_nvidia_gpu import data_collector_amd_gpu +import data_collector_amd_proc_acc +import data_collector_gaudi import prerequisite class HealthCheckMetricCollector: @@ -119,7 +121,7 @@ def get_amd_metrics(self): health_metrics['gpu_pcie'] = data_collector_amd_gpu.get_gpu_health_pcie() # get pmu health details for AMD GPU - health_metrics['gpu_pmu'] = data_collector_amd_gpu.get_gpu_health_nvlink() + health_metrics['gpu_pmu'] = data_collector_amd_gpu.get_gpu_health_pmu() # get power health details for AMD GPU gpu_power_max,gpu_power_avg = data_collector_amd_gpu.get_gpu_health_power() @@ -131,6 +133,66 @@ def get_amd_metrics(self): self.gpu_health_metrics(health_metrics) + def get_amd_proc_acc_metrics(self): + ''' + This method collects all the amd gpu health metrics + ''' + health_metrics = defaultdict(list) + # This function is same as in data_collector_amd_gpu + # get driver health details for AMD GPU + health_metrics['gpu_driver'] = data_collector_amd_gpu.get_gpu_health_driver() + + # This function is same as in data_collector_amd_gpu + # get nvlink health details for AMD GPU + health_metrics['gpu_nvlink'] = data_collector_amd_gpu.get_gpu_health_nvlink() + + # This function is same as in data_collector_amd_gpu + # get pcie health details for AMD GPU + health_metrics['gpu_pcie'] = data_collector_amd_gpu.get_gpu_health_pcie() + + # This function is same as in data_collector_amd_gpu + # get pmu health details for AMD GPU + health_metrics['gpu_pmu'] = data_collector_amd_gpu.get_gpu_health_pmu() + + # get power health details for AMD GPU + gpu_power_max,gpu_power_avg = data_collector_amd_proc_acc.get_gpu_health_power() + health_metrics['gpu_power_max'] = gpu_power_max + health_metrics['gpu_power_avg'] = gpu_power_avg + + # get thermal health details for AMD GPU + health_metrics['gpu_thermal'] = data_collector_amd_proc_acc.get_gpu_health_thermal() + + self.gpu_health_metrics(health_metrics) + + def get_gaudi_metrics(self): + ''' + This method collects all the gaudi health metrics + ''' + health_metrics = defaultdict(list) + # run hl-smi command and store output in a variable + gaudi_metrics_cmd_output = data_collector_gaudi.get_gaudi_metrics_output() + + # get driver health details for Gaudi + health_metrics['gpu_driver'] = data_collector_gaudi.get_gpu_health_driver(gaudi_metrics_cmd_output) + # get nvlink health details for Gaudi + health_metrics['gpu_nvlink'] = None + + # get pcie health details for Gaudi + health_metrics['gpu_pcie'] = data_collector_gaudi.get_gpu_health_pcie(gaudi_metrics_cmd_output) + + # get pmu health details for Gaudi + health_metrics['gpu_pmu'] = None + + # get power health details for Gaudi + gpu_power_max,gpu_power_avg = data_collector_gaudi.get_gpu_health_power(gaudi_metrics_cmd_output) + health_metrics['gpu_power_max'] = gpu_power_max + health_metrics['gpu_power_avg'] = gpu_power_avg + + # get thermal health details for Gaudi + health_metrics['gpu_thermal'] = data_collector_gaudi.get_gpu_health_thermal(gaudi_metrics_cmd_output) + + self.gpu_health_metrics(health_metrics) + def gpu_health_metrics(self,health_metrics): ''' This method calls the gpu health metric methods for storing metric data @@ -287,7 +349,12 @@ def metric_collector(self, aggregation_level="compute"): # Run only when amd gpu present if prerequisite.dict_component_existence['amdgpu']: self.get_amd_metrics() - if prerequisite.dict_component_existence['nvidiagpu'] is False and prerequisite.dict_component_existence['amdgpu'] is False: + if 
prerequisite.dict_component_existence['amd_proc_acc']: + self.get_amd_proc_acc_metrics() + # Run only when gaudi present + if prerequisite.dict_component_existence['intelgaudi']: + self.get_gaudi_metrics() + if prerequisite.dict_component_existence['nvidiagpu'] is False and prerequisite.dict_component_existence['amdgpu'] is False and prerequisite.dict_component_existence['amd_proc_acc'] is False and prerequisite.dict_component_existence['intelgaudi'] is False: self.health_check_metric_output_dict["gpu_health_driver"] = \ utility.Result.UNKNOWN.value self.health_check_metric_output_dict["gpu_health_nvlink"] = \ diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/invoke_commands.py b/prepare_oim/roles/omnia_telemetry_oim/files/invoke_commands.py similarity index 87% rename from prepare_cp/roles/omnia_telemetry_cp/files/invoke_commands.py rename to prepare_oim/roles/omnia_telemetry_oim/files/invoke_commands.py index 9c53c7369..36ea6093a 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/invoke_commands.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/invoke_commands.py @@ -49,7 +49,7 @@ def call_command(command, pipe = False, output=''): common_logging.log_error('invoke_commands:call_command', f"Error : {output.stderr} Command : {command} ") else: common_logging.log_error('invoke_commands:call_command', f"Error output in: {command}") - + except subprocess.TimeoutExpired: common_logging.log_error('invoke_commands:call_command', f"Command invocation timeout: {command}") @@ -83,7 +83,7 @@ def call_command_with_pipe(command): return None return output.stdout.strip() if output else None -def run_command(command): +def run_command(command, output=''): """ Call a command using subprocess and return the output or log errors using syslog. Args: @@ -92,9 +92,13 @@ str or None: The output of the command or None if an error occurred. """ try: - command = command.split() - output = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - universal_newlines=True, check=False) + # Split the command by space into a list of tokens + list_command_split_by_space_quote = common_parser.split_by_space_and_quote(command) + output = subprocess.run(list_command_split_by_space_quote,input=output, \ + stdout=subprocess.PIPE, stderr=subprocess.PIPE, \ + timeout=float(utility.dict_telemetry_ini \ + ["metric_collection_timeout"]), \ + universal_newlines=True, check=True) return output.stdout.strip() if output.stdout else None except Exception as exc: diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/prerequisite.py b/prepare_oim/roles/omnia_telemetry_oim/files/prerequisite.py similarity index 74% rename from prepare_cp/roles/omnia_telemetry_cp/files/prerequisite.py rename to prepare_oim/roles/omnia_telemetry_oim/files/prerequisite.py index ab2029d02..14a319b1f 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/files/prerequisite.py +++ b/prepare_oim/roles/omnia_telemetry_oim/files/prerequisite.py @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -60,13 +60,31 @@ def check_amd_gpu_existence(): ''' Method to check whether AMD GPU is present ''' - amd_output = invoke_commands.call_command_with_pipe\ ("lspci|grep \"Display controller: Advanced Micro Devices, Inc. 
\[AMD/ATI\]\"") - if (amd_output is not None) and len(amd_output)>0: + # Check for Display Controller AMD GPU + amd_output = invoke_commands.call_command_with_pipe("lspci|grep \"Display controller: Advanced Micro Devices, Inc. \[AMD/ATI\]\"") + if (amd_output is not None) and len(amd_output) > 0: dict_component_existence["amdgpu"] = True else: dict_component_existence["amdgpu"] = False + # Check for Processing Accelerators AMD GPU + amd_proc_acc_output = invoke_commands.call_command_with_pipe("lspci|grep \"Processing accelerators: Advanced Micro Devices, Inc. \[AMD/ATI\]\"") + if (amd_proc_acc_output is not None) and len(amd_proc_acc_output) > 0: + dict_component_existence["amd_proc_acc"] = True + else: + dict_component_existence["amd_proc_acc"] = False + +def check_gaudi_existence(): + ''' + Method to check whether Gaudi is present + ''' + gaudi_output = invoke_commands.call_command_with_pipe\ + ("lspci|grep \"Processing accelerators: Habana Labs Ltd.\"") + if (gaudi_output is not None) and len(gaudi_output)>0: + dict_component_existence["intelgaudi"] = True + else: + dict_component_existence["intelgaudi"] = False + def check_beegfs_existence(): ''' Method to check whether beegfs is present @@ -101,5 +119,6 @@ def check_component_existence(): check_nvidia_gpu_existence() check_amd_gpu_existence() + check_gaudi_existence() check_beegfs_existence() check_smartctl_existence() diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/regular_metric_collector.py b/prepare_oim/roles/omnia_telemetry_oim/files/regular_metric_collector.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/regular_metric_collector.py rename to prepare_oim/roles/omnia_telemetry_oim/files/regular_metric_collector.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/files/utility.py b/prepare_oim/roles/omnia_telemetry_oim/files/utility.py similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/files/utility.py rename to prepare_oim/roles/omnia_telemetry_oim/files/utility.py diff --git a/prepare_cp/roles/omnia_telemetry_cp/tasks/binary_creation.yml b/prepare_oim/roles/omnia_telemetry_oim/tasks/binary_creation.yml similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/tasks/binary_creation.yml rename to prepare_oim/roles/omnia_telemetry_oim/tasks/binary_creation.yml diff --git a/prepare_cp/roles/omnia_telemetry_cp/tasks/main.yml b/prepare_oim/roles/omnia_telemetry_oim/tasks/main.yml similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/tasks/main.yml rename to prepare_oim/roles/omnia_telemetry_oim/tasks/main.yml diff --git a/prepare_cp/roles/omnia_telemetry_cp/tasks/python_package_installation.yml b/prepare_oim/roles/omnia_telemetry_oim/tasks/python_package_installation.yml similarity index 100% rename from prepare_cp/roles/omnia_telemetry_cp/tasks/python_package_installation.yml rename to prepare_oim/roles/omnia_telemetry_oim/tasks/python_package_installation.yml diff --git a/prepare_cp/roles/omnia_telemetry_cp/vars/main.yml b/prepare_oim/roles/omnia_telemetry_oim/vars/main.yml similarity index 94% rename from prepare_cp/roles/omnia_telemetry_cp/vars/main.yml rename to prepare_oim/roles/omnia_telemetry_oim/vars/main.yml index eba1f2c74..62e0ada2c 100644 --- a/prepare_cp/roles/omnia_telemetry_cp/vars/main.yml +++ b/prepare_oim/roles/omnia_telemetry_oim/vars/main.yml @@ -16,7 +16,7 @@ # Usage: main.yml # Usage: python_package_installation.yml -python_version: python3.9 +python_version: "{{ ansible_python_interpreter }}" pyinstaller_python_package: 
pyinstaller psutil_python_package: psutil diff --git a/prepare_cp/roles/pre_requisite/tasks/main.yml b/prepare_oim/roles/pre_requisite/tasks/main.yml similarity index 91% rename from prepare_cp/roles/pre_requisite/tasks/main.yml rename to prepare_oim/roles/pre_requisite/tasks/main.yml index 1bd55c7bb..55059b847 100644 --- a/prepare_cp/roles/pre_requisite/tasks/main.yml +++ b/prepare_oim/roles/pre_requisite/tasks/main.yml @@ -13,7 +13,7 @@ # limitations under the License. --- -- name: Pre requisite for prepare cp +- name: Pre requisite for prepare oim environment: XCATROOT: "{{ xcat_root_env }}" PATH: "{{ ansible_env.PATH }}:{{ xcat_path_env }}" @@ -32,7 +32,7 @@ block: - name: Encrypt provision_config_credentials.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ provision_credentials_config_filename }} + ansible-vault encrypt {{ provision_credentials_config_filename }} --vault-password-file {{ provision_credentials_vault_path }} changed_when: false tags: init diff --git a/prepare_cp/roles/pre_requisite/tasks/pre_requisite.yml b/prepare_oim/roles/pre_requisite/tasks/pre_requisite.yml similarity index 92% rename from prepare_cp/roles/pre_requisite/tasks/pre_requisite.yml rename to prepare_oim/roles/pre_requisite/tasks/pre_requisite.yml index 17639b24e..f25ac2dcb 100644 --- a/prepare_cp/roles/pre_requisite/tasks/pre_requisite.yml +++ b/prepare_oim/roles/pre_requisite/tasks/pre_requisite.yml @@ -36,12 +36,6 @@ ansible.builtin.fail: msg: "{{ provision_config_syntax_fail_msg }} Error: {{ include_provision_config.message }}" -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - tags: init - - name: Check provision_credentials_config.yml file is encrypted ansible.builtin.command: cat {{ provision_credentials_config_filename }} changed_when: false @@ -51,7 +45,7 @@ - name: Decrpyt provision_credentials_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ provision_credentials_config_filename }} + ansible-vault decrypt {{ provision_credentials_config_filename }} --vault-password-file {{ provision_credentials_vault_path }} changed_when: false when: ansible_vault_search_key in provision_credentials_config_content.stdout diff --git a/prepare_oim/roles/pre_requisite/tasks/prepare_oim_status.yml b/prepare_oim/roles/pre_requisite/tasks/prepare_oim_status.yml new file mode 100644 index 000000000..0a4687953 --- /dev/null +++ b/prepare_oim/roles/pre_requisite/tasks/prepare_oim_status.yml @@ -0,0 +1,141 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
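The new prepare_oim_status.yml that follows gates later playbooks on the Omnia Infrastructure Manager being fully prepared: xcatd and postgres must be running and `lsdef -t network` must succeed against the xCAT database. In outline (a Python sketch of the gate, not the playbook's implementation):

```python
import subprocess

XCAT_PATH = "/opt/xcat/bin"  # matches xcat_path in the role vars

def unit_active(name: str) -> bool:
    """True when systemd reports the unit as active."""
    return subprocess.run(["systemctl", "is-active", "--quiet", name]).returncode == 0

def xcat_installation_ok(postgres_unit: str) -> bool:
    """Mirror of xcat_installation_status: both services up and the xCAT
    network table readable (fetch_network.rc == 0)."""
    if not (unit_active("xcatd") and unit_active(postgres_unit)):
        return False
    lsdef = subprocess.run([XCAT_PATH + "/lsdef", "-t", "network"], capture_output=True)
    return lsdef.returncode == 0

# postgres_unit: "postgresql" on Ubuntu, "postgresql.service" on RHEL/Rocky.
```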
+--- + +- name: Set oim_os + ansible.builtin.set_fact: + oim_os: "{{ ansible_distribution | lower }}" + +- name: Initialize variables + ansible.builtin.set_fact: + xcat_installation_status: false + prep_inv_status: false + prepare_oim_status: false + +- name: Gathering service facts + ansible.builtin.service_facts: + +- name: Fetch network table entries + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" + changed_when: false + failed_when: false + register: fetch_network + +- name: Try restart postgresql service if not running in Ubuntu + when: + - oim_os == oim_os_ubuntu + - fetch_network.rc != 0 + block: + - name: Try restart postgresql service if not running in Ubuntu + ansible.builtin.systemd: + name: "{{ postgresql_service_ubuntu }}" + state: restarted + register: postgresql_restart + until: postgresql_restart is not failed + retries: "{{ service_retries }}" + when: + - postgresql_service_ubuntu in ansible_facts.services + rescue: + - name: Unable to start postgresql services + ansible.builtin.debug: + msg: "{{ postgresql_start_fail_msg }}" + +- name: Try restart postgresql service if not running in RHEL/Rocky + when: + - oim_os == oim_os_redhat or oim_os == oim_os_rocky + - fetch_network.rc != 0 + block: + - name: Try restart postgresql service if not running in RHEL/Rocky + ansible.builtin.systemd: + name: "{{ postgresql_service_rhel }}" + state: restarted + register: postgresql_restart + until: postgresql_restart is not failed + retries: "{{ service_retries }}" + when: + - postgresql_service_rhel in ansible_facts.services + rescue: + - name: Unable to start postgresql services + ansible.builtin.debug: + msg: "{{ postgresql_start_fail_msg }}" + +- name: Try restart xcatd service if not running + block: + - name: Try restart xcatd service if not running + ansible.builtin.systemd: + name: xcatd + state: restarted + register: xcatd_restart + until: xcatd_restart is not failed + retries: "{{ service_retries }}" + when: + - xcatd_service in ansible_facts.services + - "'running' not in ansible_facts.services[xcatd_service].state" + rescue: + - name: Unable to start xcatd services + ansible.builtin.debug: + msg: "{{ xcat_start_fail_msg }}" + +- name: Gathering service facts + ansible.builtin.service_facts: + +- name: Fetch network table entries + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" + changed_when: false + failed_when: false + register: fetch_network + +- name: Set xcat_installation_status to true for RHEL/Rocky + ansible.builtin.set_fact: + xcat_installation_status: true + when: + - oim_os == oim_os_redhat or oim_os == oim_os_rocky + - xcatd_service in ansible_facts.services + - postgresql_service_rhel in ansible_facts.services + - "'running' in ansible_facts.services[xcatd_service].state" + - "'running' in ansible_facts.services[postgresql_service_rhel].state" + - fetch_network.rc == 0 + +- name: Set xcat_installation_status to true for Ubuntu + ansible.builtin.set_fact: + xcat_installation_status: true + when: + - oim_os == oim_os_ubuntu + - xcatd_service in ansible_facts.services + - postgresql_service_ubuntu in ansible_facts.services + - "'running' in ansible_facts.services[xcatd_service].state" + - "'running' in ansible_facts.services[postgresql_service_ubuntu].state" + - fetch_network.rc == 0 + +- name: Check if inventory files are present + ansible.builtin.stat: + path: "{{ inv_file_path_list }}" + register: stat_result + +- name: Set fact if inventory is present + ansible.builtin.set_fact: + prep_inv_status: true + when: stat_result.stat.exists + 
+- name: Set fact for prepare oim + ansible.builtin.set_fact: + prepare_oim_status: true + when: + - xcat_installation_status + - prep_inv_status + +- name: Prepare_oim needs to be executed + ansible.builtin.fail: + msg: "{{ prepare_oim_execution_req }}" + when: not prepare_oim_status diff --git a/prepare_cp/roles/pre_requisite/tasks/validate_provision_credentials.yml b/prepare_oim/roles/pre_requisite/tasks/validate_provision_credentials.yml similarity index 100% rename from prepare_cp/roles/pre_requisite/tasks/validate_provision_credentials.yml rename to prepare_oim/roles/pre_requisite/tasks/validate_provision_credentials.yml diff --git a/prepare_cp/roles/pre_requisite/vars/main.yml b/prepare_oim/roles/pre_requisite/vars/main.yml similarity index 69% rename from prepare_cp/roles/pre_requisite/vars/main.yml rename to prepare_oim/roles/pre_requisite/vars/main.yml index 0ae20d257..3d1da2309 100644 --- a/prepare_cp/roles/pre_requisite/vars/main.yml +++ b/prepare_oim/roles/pre_requisite/vars/main.yml @@ -13,14 +13,24 @@ # limitations under the License. --- -# Usage: prepare_cp_status +# Usage: prepare_oim_status inv_directory_path: "/opt/omnia/omnia_inventory" -compute_servicetag_ip_filename: "compute_servicetag_ip" -inv_file_path_list: "{{ inv_directory_path }}/{{ compute_servicetag_ip_filename }}" +compute_hostname_ip_filename: "compute_hostname_ip" +inv_file_path_list: "{{ inv_directory_path }}/{{ compute_hostname_ip_filename }}" xcatd_service: "xcatd.service" postgresql_service_rhel: "postgresql.service" postgresql_service_ubuntu: "postgresql" -prepare_cp_execution_req: "Failed! Please run prepare_cp/prepare_cp.yml and then re-execute discovery/discovery.yml" +prepare_oim_execution_req: "Failed. If you are running the discovery/discovery.yml playbook, please ensure that you have run the prepare_oim/prepare_oim.yml playbook +before executing discovery/discovery.yml. If you're encountering this issue while running the discovery_provision.yml playbook, +please clean up provision tasks by running 'ansible-playbook utils/oim_cleanup.yml --tags provision'. +After verifying your input files and the Omnia Infrastructure Manager admin NIC configuration, re-run the discovery_provision.yml playbook." +xcat_path: /opt/xcat/bin +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" +oim_os_ubuntu: "ubuntu" +postgresql_start_fail_msg: "Failed to start postgresql services" +xcat_start_fail_msg: "Failed to start xcatd services" +service_retries: 3 # Usage: pre_requisite.yml provision_config_filename: "{{ role_path }}/../../../input/provision_config.yml" @@ -34,7 +44,6 @@ conf_file_mode: "0644" local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml" local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again." - # Usage: Validate_provision_credentials.yml postgresdb_password_fail_msg: "Failed. postgresdb_password should contain only alphanumeric characters and minimum length 8 in provision_config.yml" min_length: 8 diff --git a/prereq.sh b/prereq.sh index 40168b2d6..8a4787853 100755 --- a/prereq.sh +++ b/prereq.sh @@ -1,148 +1,388 @@ #!/bin/bash -[ -d /opt/omnia ] || mkdir /opt/omnia -[ -d /var/log/omnia ] || mkdir /var/log/omnia +# Copyright © 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -default_py_version="3.9" -validate_rocky_os="$(cat /etc/os-release | grep 'ID="rocky"' | wc -l)" -validate_ubuntu_os="$(cat /etc/os-release | grep 'ID=ubuntu' | wc -l)" -sys_py_version="$(python3 --version)" -echo "System Python version: $sys_py_version" +# Define color codes +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[34m' +YELLOW='\033[1;33m' +MAGENTA='\033[0;35m' +NC='\033[0m' # No Color -if [[ "$validate_rocky_os" == "1" ]]; -then - echo "------------------------" - echo "INSTALLING EPEL RELEASE:" - echo "------------------------" - dnf install epel-release -y -fi +# Function to get OS information +get_os_info() { + if [ -f $os_release_data ]; then + . $os_release_data + OS_ID=$ID + # OS_VERSION=$(awk -F= '/VERSION_ID/ {print $2}' $os_release_data) + OS_VERSION=$VERSION_ID + echo "Operating System is $OS_ID version $OS_VERSION" + else + OS_ID="Unknown" + OS_VERSION="Unknown" + echo "Unable to determine OS version." + return + fi +} + +get_installed_ansible_version() { + $venv_py -m pip show ansible 2>/dev/null | grep Version | awk '{print $2}' +} -if [[ "$validate_ubuntu_os" == "1" ]]; -then - check_ubuntu22="$(cat /etc/os-release | grep 'VERSION_ID="22.04"' | wc -l)" - check_ubuntu20="$(cat /etc/os-release | grep 'VERSION_ID="20.04"' | wc -l)" - if [[ "$check_ubuntu22" == "1" ]] - then - echo "deb [trusted=yes] http://ppa.launchpad.net/deadsnakes/ppa/ubuntu jammy main" > /etc/apt/sources.list.d/deadsnakes-ppa.list - elif [[ "$check_ubuntu20" == "1" ]] - then - echo "deb [trusted=yes] http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" > /etc/apt/sources.list.d/deadsnakes-ppa.list +install_ansible() { + echo "----------------------------------------------------------" + echo "INSTALLING ANSIBLE $ansible_version IN THE OMNIA VIRTUAL ENVIRONMENT:" + echo "----------------------------------------------------------" + $venv_py -m pip install ansible=="$ansible_version" ansible-core=="$ansible_core_version" #--force-reinstall or --ignore-installed is not required +} + +disable_selinux() { + selinux_count="$(grep "^SELINUX=disabled" /etc/selinux/config | wc -l)" + if [[ $selinux_count == 0 ]]; then + echo "DISABLING SELINUX:" + sed -i 's/^SELINUX=.*/SELINUX=disabled/g' /etc/selinux/config + echo -e "${RED}Reboot Required to take effect${NC}" + # Move reboot message to the end + SELINUX_REBOOT_REQUIRED=true + fi +} + +check_python_version_venv() { + local venv_py_version=$(python --version 2>&1 | awk '{print $2}') + local venv_major_version=$(echo "$venv_py_version" | cut -d '.' -f 1) + local venv_minor_version=$(echo "$venv_py_version" | cut -d '.' -f 2) + echo "" + if [ "$venv_major_version" == "$py_major_version" ] && [ "$venv_minor_version" == "$py_minor_version" ]; then + echo "Python version $venv_py_version matches the required version $py_major_version.$py_minor_version." else - apt-add-repository ppa:deadsnakes/ppa -y + echo "Python version $venv_py_version does not match the required version $py_major_version.$py_minor_version." 
+        exit 1
     fi
-    apt update
-    echo "----------------------"
-    echo "INSTALLING PYTHON 3.9:"
-    echo "----------------------"
-    apt install python3.9* python3-pip -y
-else
-    if [[ $(echo $sys_py_version | grep "3.9" | wc -l) != "1" || $(echo $sys_py_version | grep "Python" | wc -l) != "1" ]];
-    then
-        echo "----------------------"
-        echo "INSTALLING PYTHON 3.9:"
-        echo "----------------------"
-        dnf install python39 -y
+}
+
+compare_and_copy_config() {
+    local input_config="$1"
+    local example_config="$2"
+
+    # Try to extract cluster_os_type from the input file
+    local input_os_type
+    local jq_output
+    jq_output=$(jq -r '.cluster_os_type' "$input_config" 2>&1)
+
+    if [[ $? -ne 0 ]]; then
+        echo -e "${RED}Error: Failed to parse ${YELLOW}${input_config}${NC}"
+        echo -e "Below are the error details:"
+        echo -e "${RED}${jq_output}${NC}"
+        echo ""
+        echo -e "${RED}Please check software_config.json for syntax errors and correct them.${NC}"
+        echo -e "${RED}After correcting, please rerun ${YELLOW}prereq.sh${NC}"
+        exit 1
+    else
+        input_os_type="$jq_output"
+    fi
+
+    # Check against OS_ID
+    if [[ "$input_os_type" == "$OS_ID" ]]; then
+        echo -e "${GREEN}Existing software_config.json matches the current OS type. No changes made.${NC}"
+    else
+        echo -e "${RED}Updating software_config.json to match the current OS type and version.${NC}"
+        copy_config "$input_config" "$example_config"
+    fi
+}
+
+# Separate function for copying from examples
+copy_config() {
+    local input_config="$1"
+    local example_config="$2"
+
+    if [[ -f "$example_config" ]]; then
+        copy_output=$(cp -v "$example_config" "$input_config")
+        echo -e "${GREEN}${copy_output}${NC}"
+        echo ""
+        echo -e "${RED}Updating cluster_os_version: $OS_VERSION${NC}"
+        sed -i "s/\"cluster_os_version\": .*/\"cluster_os_version\": \"$OS_VERSION\",/" "$input_config"
+
+    else
+        echo -e "${RED}Error: Example configuration file for $OS_ID not found.${NC}"
+        exit 1
+    fi
+}
+
+# Default settings
+ansible_version="9.5.1"
+ansible_core_version="2.16.13"
+python_version="3.11"
+py_major_version="3"
+py_minor_version="11"
+venv_py=python$python_version
+os_release_data="/etc/os-release"
+venv_location="/opt/omnia/omnia17_venv" # Do not give a trailing slash
+unsupported_os=false
+os_type="rhel"
+SELINUX_REBOOT_REQUIRED=false
+
+# Start
+get_os_info
+
+if [[ "$OS_ID" == "rhel" || "$OS_ID" == "rocky" ]]; then
+    os_type="rhel"
+    max_val=8.8
+    if awk "BEGIN { exit !($OS_VERSION < $max_val) }"; then
+        unsupported_os=true
+    fi
+fi
+
+if [[ "$OS_ID" == "ubuntu" ]]; then
+    os_type="ubuntu"
+    max_val=20.04
+    if awk "BEGIN { exit !($OS_VERSION < $max_val) }"; then
+        unsupported_os=true
+    fi
+fi
+
+if [ "$unsupported_os" = true ]; then
+    echo "Unsupported OS for Omnia v1.7 software stack. Creating venv for Omnia v1.6.1 software stack."
+    ansible_version="7.7.0"
+    ansible_core_version="2.14.12"
+    python_version="39" # RHEL-8.8 onwards and ubuntu-20.04 onwards this is '3.9'
+    py_major_version="3"
+    py_minor_version="9"
+    venv_py=python3.9
+    venv_location="/opt/omnia/omnia161_venv" # Do not give a trailing slash
 fi
-echo "--------------"
-echo "UPGRADING PIP:"
-echo "--------------"
-python3.9 -m pip install --upgrade pip
-echo "-------------------"
-echo "INSTALLING ANSIBLE:"
-echo "-------------------"
-executable_path=$(ansible --version 2>/dev/null | awk -F'= ' '/executable location/ {print $2}')
-get_os=$(awk -F= '/^ID/{print $2}' /etc/os-release | head -n1)
-ansible_status=0
-if [[ ! -z "$executable_path" && "$executable_path" = "/usr/bin/ansible" ]]; then
-    if [[ "$get_os" == 'ubuntu' ]]; then
-        echo "Removing pre-installed ansible && ansible-core packages using apt…"
-        ansible_status=1
-        sudo apt remove ansible -y
-        sudo apt remove ansible-core -y
-    elif [[ "$get_os" == '"rhel"' || "$get_os" == '"rocky"' ]]; then
-        echo "Removing pre-installed ansible && ansible-core packages using dnf…"
-        ansible_status=1
-        sudo dnf remove ansible -y
-        sudo dnf remove ansible-core -y
+
+# Check if the OS version is unsupported and print a warning message
+install_omnia_version=$(grep "omnia_version:" ".metadata/omnia_version" | cut -d ':' -f 2 | tr -d ' ')
+if [[ "$OS_ID" == "rhel" || "$OS_ID" == "rocky" ]]; then
+    if [[ "$VERSION_ID" != "8.8" ]]; then
+        echo -e "Warning: Running Omnia $install_omnia_version on an unsupported OS ${OS_ID} ${VERSION_ID} may lead to failures in subsequent playbooks. To prevent such issues, please use a supported OS: ${OS_ID} 8.8."
     fi
+elif [[ "$OS_ID" == "ubuntu" ]]; then
+    if [ -e /var/log/installer/media-info ]; then
+        media_info=$(cat /var/log/installer/media-info)
+        if [[ ! $media_info == *"Ubuntu-Server"* ]]; then
+            echo -e "${YELLOW}Warning: Omnia supports only the server edition of Ubuntu. Running Omnia on a non-server edition of Ubuntu may lead to failures in subsequent playbooks.
+To prevent such issues, please use the Server edition of Ubuntu.${NC}"
+        fi
+    fi
+    if [[ "$VERSION_ID" != "22.04" ]]; then
+        echo -e "Warning: Running Omnia $install_omnia_version on an unsupported OS ${OS_ID} ${VERSION_ID} may lead to failures in subsequent playbooks. To prevent such issues, please use a supported OS: ${OS_ID} 22.04."
+    fi
+else
+    echo "WARNING: Unsupported OS ${OS_ID}"
+fi

-    if ! grep -qxF 'export PATH="/usr/local/bin:$PATH"' ~/.bashrc; then
-        echo 'export PATH="/usr/local/bin:$PATH"' >> ~/.bashrc
+# Check if a different virtual environment is already activated
+if [ -n "$VIRTUAL_ENV" ]; then
+    if [ "$VIRTUAL_ENV" != "$venv_location" ]; then
+        echo "Currently activated virtual environment: $VIRTUAL_ENV is not the Omnia virtual environment"
+        echo "Please deactivate this virtual environment, then run './prereq.sh'."
+        exit 1
     fi
 fi
-installed_ansible_version=$( ansible --version 2>/dev/null | grep -oP 'ansible \[core \K\d+\.\d+' | sed 's/]//')
-target_ansible_version="2.14"
+[ -d /opt/omnia ] || mkdir /opt/omnia
+[ -d $venv_location ] || mkdir $venv_location
+[ -d /var/log/omnia ] || mkdir /var/log/omnia

-if [[ ! -z "$installed_ansible_version" && "$(echo -e "$installed_ansible_version\n$target_ansible_version" | sort -V | tail -n1)" != "$target_ansible_version" ]];
-then
-    echo "Error: Higher version of Ansible-core ($installed_ansible_version) is already installed. Please uninstall the existing ansible and re-run the prereq.sh again to install $target_ansible_version"
-    exit 1
+if [[ "$OS_ID" == "rocky" ]]; then
+    echo "------------------------"
+    echo "INSTALLING EPEL RELEASE:"
+    echo "------------------------"
+    dnf install epel-release -y
 fi
-if [[ !
-z "$installed_ansible_version" && "$(echo -e "$installed_ansible_version\n$target_ansible_version" | sort -V | head -n1)" != "$target_ansible_version" ]]; -then - echo "Warning: prereq.sh is uninstalling the existing Ansible-core ($installed_ansible_version) and installing the $target_ansible_version" +allow_unauth_apt="" +echo "" +if command -v $venv_py >/dev/null 2>&1; then + echo "Python $python_version is already installed" +else + echo "Python $python_version is not installed" + echo "----------------------" + echo "INSTALLING PYTHON $python_version:" + echo "----------------------" + if [[ "$OS_ID" == "ubuntu" ]]; then + echo "Operating System: $OS_ID" + apt install software-properties-common -y + apt-add-repository ppa:deadsnakes/ppa -y + if [ $? -eq 0 ]; then + echo "Added repo successfully with GPG key" + else + check_ubuntu22="$(cat $os_release_data | grep 'VERSION_ID="22.04"' | wc -l)" + check_ubuntu20="$(cat $os_release_data | grep 'VERSION_ID="20.04"' | wc -l)" + allow_unauth_apt="--allow-unauthenticated" + if [[ "$check_ubuntu22" == "1" ]]; then + echo "Adding repo for jammy $OS_ID $OS_VERSION" + echo "deb [trusted=yes] http://ppa.launchpad.net/deadsnakes/ppa/ubuntu jammy main" > /etc/apt/sources.list.d/deadsnakes-ppa.list + elif [[ "$check_ubuntu20" == "1" ]]; then + echo "Adding repo for focal $OS_ID $OS_VERSION" + echo "deb [trusted=yes] http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main" > /etc/apt/sources.list.d/deadsnakes-ppa.list + fi + fi + apt update + apt install python$python_version -y $allow_unauth_apt + else + echo "Operating System: $OS_ID" + dnf install python$python_version -y + fi fi -python3.9 -m pip install ansible==7.7.0 cryptography==41.0.7 -python3.9 -m pip install jinja2==3.1.2 +if ! command -v $venv_py >/dev/null 2>&1; then + echo "$venv_py installation failed !!" + exit 1 +fi -if [[ "$validate_ubuntu_os" == "1" ]]; -then - apt install git git-lfs -y +echo "" +# install the other packages +if [[ "$OS_ID" == "ubuntu" ]]; then + echo "Installing apt packages for - $OS_ID" + apt update + apt install python$python_version-dev python$python_version-venv -y $allow_unauth_apt + apt install git git-lfs jq -y $allow_unauth_apt git lfs pull else - dnf install git-lfs -y + echo "Installing dnf packages for - $OS_ID" + dnf install python$python_version-pip python$python_version-devel -y + dnf install git-lfs jq -y git lfs pull + disable_selinux +fi - selinux_count="$(grep "^SELINUX=disabled" /etc/selinux/config | wc -l)" - if [[ $selinux_count == 0 ]]; - then - echo "------------------" - echo "DISABLING SELINUX:" - echo "------------------" - sed -i 's/^SELINUX=.*/SELINUX=disabled/g' /etc/selinux/config - echo "SELinux is disabled. Reboot system to notice the change in status before executing playbooks in control plane!!" +echo "" +# Check if activated venv location equal to the venv_location +if [ "$VIRTUAL_ENV" != "$venv_location" ]; then + echo "Omnia virtual environment not activated in $venv_location" + if [ ! -f "$venv_location/bin/activate" ]; then + $venv_py -m venv $venv_location --prompt omnia17 + fi + echo "Activating the Omnia virtual environment .." + source $venv_location/bin/activate + + if [ "$VIRTUAL_ENV" == "$venv_location" ]; then + echo "Omnia virtual environment activated successfully at $venv_location" + else + echo "Failed to activate virtual environment." 
+ echo "Please manually activate the virtual environment at $venv_location +and install the required package ansible-$ansible_version via pip, +before executing playbooks in Omnia Infrastructure Manager" + exit 1 fi +else + echo "Virtual environment already activated at $venv_location" fi +echo "" +echo "Making required changes to virtual environment at $VIRTUAL_ENV" + +check_python_version_venv + +# Upgrade pip +echo "" +echo "Upgrading pip in Omnia virtual environment:" +$venv_py -m ensurepip --upgrade +$venv_py -m pip install --upgrade pip + +INSTALLED_VERSION=$(get_installed_ansible_version) +echo "" +if [ "$INSTALLED_VERSION" == "$ansible_version" ]; then + echo -e "${GREEN}Ansible $ansible_version is already installed.${NC}" +else + echo -e "${RED}Ansible $ansible_version is not installed.${NC}" + install_ansible +fi + +echo "----------------------------------------------------" +echo "Installing collections in Omnia virtual environment:" +echo "----------------------------------------------------" + +max_retries=3 +retry_count=0 +venv_collection_req_file="requirements_collections.yml" +if [ "$unsupported_os" = true ]; then + echo "Unsupported OS: Installing collections in omnia v1.6.1 venv" + venv_collection_req_file="upgrade/roles/upgrade_oim/files/requirements_venv161.yml" +fi +while [ $retry_count -lt $max_retries ]; do + ansible-galaxy collection install -r $venv_collection_req_file + if [ $? -eq 0 ]; then + echo "Ansible collections installed successfully" + break + else + echo "Ansible collections installation failed. Retrying in 5 seconds..." + sleep 5 + retry_count=$((retry_count + 1)) + fi +done + +if [ $retry_count -eq $max_retries ]; then + echo "Ansible collections installation failed after $max_retries retries" + exit 1 +fi + echo "------------------------------" echo "UPDATING SOFTWARE_CONFIG.JSON:" echo "------------------------------" -os_version=$(awk -F= '/VERSION_ID/ {print $2}' /etc/os-release) -dir_path=$(dirname "$(realpath "$0")") -echo "system_os: $get_os" -echo "os_version: $os_version" - -if [[ "$get_os" == 'ubuntu' ]]; - then - cp "$dir_path/examples/ubuntu_software_config.json" "$dir_path/input/software_config.json" - elif [[ "$get_os" == '"rhel"' ]]; - then - cp "$dir_path/examples/rhel_software_config.json" "$dir_path/input/software_config.json" - elif [[ "$get_os" == '"rocky"' ]]; - then - cp "$dir_path/examples/rocky_software_config.json" "$dir_path/input/software_config.json" +echo "system_os: $OS_ID" +echo "os_version: $OS_VERSION" +dir_path=$(dirname "$(realpath "$BASH_SOURCE")") +echo "Omnia Directory Path: $dir_path" + +# Input software_config.json path +input_file="$dir_path/input/software_config.json" + +# Determine which example file to use based on OS +example_file="$dir_path/examples/${OS_ID}_software_config.json" + +# Ensure the input file exists +if [[ ! -f "$input_file" ]]; then + echo -e "${RED}No existing software_config.json found. Copying from example file.${NC}" + copy_config "$input_file" "$example_file" +else + compare_and_copy_config "$input_file" "$example_file" fi -sed -i "s/\"cluster_os_version\": .*/\"cluster_os_version\": $os_version,/" "$dir_path/input/software_config.json" +echo "--------------------------------------" +echo "INSTALLING OMNIA VIRTUAL ENVIRONMENT:" +echo "--------------------------------------" -echo "" -echo "" -if [[ "$ansible_status" -eq 1 ]]; then - echo "IMPORTANT: The pre-installed ansible packages were removed and installed ansible 2.14, user needs to refresh the session to apply changes." 
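# A quick sanity check of the environment this script produces (a sketch, assuming
# the default v1.7 venv location used above):
#   source /opt/omnia/omnia17_venv/bin/activate
#   python --version                  # expect Python 3.11.x
#   ansible --version                 # expect ansible core 2.16.x
#   ansible-galaxy collection list    # expect the collections pinned above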
+touch $venv_location/.omnia
+
+echo -e "${GREEN}"
+echo -e "The Omnia virtual environment has been set up now.
+This virtual environment can be activated using the command
+${YELLOW}'source $venv_location/bin/activate'${GREEN}
+All the Omnia playbooks should be run from the activated Omnia virtual environment"
+echo -e "${NC}"
+
+# Show SELinux reboot message if necessary
+if [[ "$SELINUX_REBOOT_REQUIRED" == "true" ]]; then
+    echo -e "${RED}"
+    echo "SELinux has been successfully disabled. Please reboot the system before proceeding. However, if you are upgrading or restoring the Omnia Infrastructure Manager, avoid rebooting to prevent the loss of telemetry data."
+    echo -e "${NC}"
 fi
+
+echo -e "${BLUE}"
+echo "Download the ISO file required to provision in the Omnia Infrastructure Manager."
 echo ""
-echo ""
-echo "Download the ISO file required to provision in the control plane."
-echo ""
-echo "Please configure all the NICs and set the hostname for the control plane in the format hostname.domain_name. Eg: controlplane.omnia.test"
+echo "Please configure all the NICs and set the hostname for the Omnia Infrastructure Manager in the format hostname.domain_name. Eg: oimnode.omnia.test"
 echo ""
 echo "Once the IP and hostname are set, provide inputs in input/local_repo_config.yml & input/software_config.json and execute the playbook local_repo/local_repo.yml to create offline repositories."
 echo ""
 echo "After local_repo.yml execution, to provision the nodes, provide inputs in input/network_spec.yml, input/provision_config.yml & input/provision_config_credentials.yml and execute the playbook discovery_provision.yml"
 echo ""
-echo "For more information: https://omnia-doc.readthedocs.io/en/latest/InstallationGuides/InstallingProvisionTool/index.html"
\ No newline at end of file
+echo -e "For more information: ${MAGENTA}https://omnia-doc.readthedocs.io/en/latest/"
+echo -e "${NC}"
diff --git a/provision/ansible.cfg b/provision/ansible.cfg
index eedfccc6d..d742f55f7 100644
--- a/provision/ansible.cfg
+++ b/provision/ansible.cfg
@@ -4,6 +4,7 @@ host_key_checking = false
 forks = 5
 timeout = 180
 executable = /bin/bash
+collections_path = $VIRTUAL_ENV

 [persistent_connection]
 command_timeout = 180
@@ -11,4 +12,4 @@ connect_timeout = 180

 [ssh_connection]
 retries = 3
-ssh_args = -o ControlMaster=auto -o ControlPersist=180
\ No newline at end of file
+ssh_args = -o ControlMaster=auto -o ControlPersist=180
diff --git a/provision/provision.yml b/provision/provision.yml
index 95061f119..c7fba5130 100644
--- a/provision/provision.yml
+++ b/provision/provision.yml
@@ -13,11 +13,19 @@
 # limitations under the License.
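# The imports below use a run-once guard of the form
#   when: not ( hostvars['127.0.0.1']['<flag>'] | default(false) | bool )
# so a shared playbook executes at most once per run. A minimal sketch of the
# imported playbook's side (the flag name mirrors the real one; illustrative):
#   - name: Check virtual environment
#     hosts: localhost
#     tasks:
#       - name: Mark check_venv as executed for later imports
#         ansible.builtin.set_fact:
#           check_venv_executed: true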
--- -- name: Prepare control plane for provisioning nodes +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + +- name: Check if package manager is not locked + ansible.builtin.import_playbook: ../utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + +- name: Prepare Omnia Infrastructure Manager for provisioning nodes hosts: localhost connection: local roles: - role: provision_validation - - role: ../prepare_cp/roles/pre_requisite # noqa: role-name[path] + - role: ../prepare_oim/roles/pre_requisite # noqa: role-name[path] - role: mapping_provision - role: mtms_provision diff --git a/provision/roles/mapping_provision/files/rinstall_nodes.py b/provision/roles/mapping_provision/files/rinstall_nodes.py index ab754e4a5..a670fa91d 100644 --- a/provision/roles/mapping_provision/files/rinstall_nodes.py +++ b/provision/roles/mapping_provision/files/rinstall_nodes.py @@ -44,7 +44,7 @@ def provision_map_nodes_bmc(): op = cursor_x.fetchone()[0] if op: mapping_bmc_nodes.append(node[0]) - command = f"rinstall {node[0]}" + command = f"/opt/xcat/bin/rinstall {node[0]}" command_list = command.split() node_objs = subprocess.run(command_list, capture_output=True) print(mapping_bmc_nodes) diff --git a/provision/roles/mapping_provision/vars/main.yml b/provision/roles/mapping_provision/vars/main.yml index bbd3a7619..a3bafcd69 100644 --- a/provision/roles/mapping_provision/vars/main.yml +++ b/provision/roles/mapping_provision/vars/main.yml @@ -14,7 +14,7 @@ --- # Usage: mapping_provision.yml -python_version: "python3.9" +python_version: "{{ ansible_python_interpreter }}" rinstall_nodes_py: "{{ role_path }}/files/rinstall_nodes.py" prov_db_path: "{{ role_path }}/../../../discovery/roles/db_operations/files" mapping_provision_msg: " nodes will be booted automatically. Ensure that IPMI is enabled on them. 
Remaining nodes, diff --git a/provision/roles/mtms_provision/files/check_discover_nodes_status.py b/provision/roles/mtms_provision/files/check_discover_nodes_status.py index 646b87df9..fabfd5148 100644 --- a/provision/roles/mtms_provision/files/check_discover_nodes_status.py +++ b/provision/roles/mtms_provision/files/check_discover_nodes_status.py @@ -9,7 +9,7 @@ def run_cmd(cmd): def get_discover_nodes(): - cmd = f'lsdef -t group -o bmc_discover | grep members | sed -n "/members=/s/ members=//p"' + cmd = f'/opt/xcat/bin/lsdef -t group -o bmc_discover | grep members | sed -n "/members=/s/ members=//p"' status, err, out = run_cmd(cmd) if status: return out.split(',') @@ -21,7 +21,7 @@ def check_discover_nodes(nodelist): bmc_list = list() for node in nodelist: node = node.strip() - cmd = f'lsdef {node} -i status -c | sed -n "/{node}: status=/s/{node}: status=//p"' + cmd = f'/opt/xcat/bin/lsdef {node} -i status -c | sed -n "/{node}: status=/s/{node}: status=//p"' status, err, out = run_cmd(cmd) if status: if len(out) == 1: diff --git a/provision/roles/mtms_provision/files/check_static_nodes_status.py b/provision/roles/mtms_provision/files/check_static_nodes_status.py index 0ec6b52f5..9c5301254 100644 --- a/provision/roles/mtms_provision/files/check_static_nodes_status.py +++ b/provision/roles/mtms_provision/files/check_static_nodes_status.py @@ -10,7 +10,7 @@ def run_cmd(cmd): def get_static_nodes(): - cmd = f'lsdef -t group -o bmc_static | grep members | sed -n "/members=/s/ members=//p"' + cmd = f'/opt/xcat/bin/lsdef -t group -o bmc_static | grep members | sed -n "/members=/s/ members=//p"' status, err, out = run_cmd(cmd) if status: return out.split(',') @@ -22,7 +22,7 @@ def check_static_nodes(nodelist): bmc_list = list() for node in nodelist: node = node.strip() - cmd = f'lsdef {node} -i status -c | sed -n "/{node}: status=/s/{node}: status=//p"' + cmd = f'/opt/xcat/bin/lsdef {node} -i status -c | sed -n "/{node}: status=/s/{node}: status=//p"' status, err, out = run_cmd(cmd) if status: if len(out) == 1: diff --git a/provision/roles/mtms_provision/tasks/fetch_dynamic_nodes.yml b/provision/roles/mtms_provision/tasks/fetch_dynamic_nodes.yml index 97ab35273..5ffcebc76 100644 --- a/provision/roles/mtms_provision/tasks/fetch_dynamic_nodes.yml +++ b/provision/roles/mtms_provision/tasks/fetch_dynamic_nodes.yml @@ -16,7 +16,7 @@ - name: Fetch nodes with dynamic IP's ansible.builtin.shell: | set -o pipefail && \ - lsdef | grep "node-" | sed -e 's/\s.*$//' + {{ xcat_path }}/lsdef | grep "node-" | sed -e 's/\s.*$//' changed_when: false failed_when: false register: dynamic_nodes diff --git a/provision/roles/mtms_provision/tasks/fetch_static_discover_nodes.yml b/provision/roles/mtms_provision/tasks/fetch_static_discover_nodes.yml index 54c5a500e..e4776c48f 100644 --- a/provision/roles/mtms_provision/tasks/fetch_static_discover_nodes.yml +++ b/provision/roles/mtms_provision/tasks/fetch_static_discover_nodes.yml @@ -19,7 +19,7 @@ bmc_discover_node_status: false - name: Fetch nodes with group {{ bmc_static_node_group }} - ansible.builtin.command: lsdef {{ bmc_static_node_group }} + ansible.builtin.command: "{{ xcat_path }}/lsdef {{ bmc_static_node_group }}" changed_when: false register: check_static_nodes failed_when: false @@ -32,7 +32,7 @@ - '"error" not in check_static_nodes.stderr | lower' - name: Fetch nodes with group {{ bmc_discover_node_group }} - ansible.builtin.command: lsdef {{ bmc_discover_node_group }} + ansible.builtin.command: "{{ xcat_path }}/lsdef {{ bmc_discover_node_group }}" 
changed_when: false register: check_discover_nodes failed_when: false diff --git a/provision/roles/mtms_provision/tasks/power_off_nodes.yml b/provision/roles/mtms_provision/tasks/power_off_nodes.yml index c842e4613..62c8cf6c4 100644 --- a/provision/roles/mtms_provision/tasks/power_off_nodes.yml +++ b/provision/roles/mtms_provision/tasks/power_off_nodes.yml @@ -16,7 +16,7 @@ - name: Task to power off the nodes for BMC block: - name: Power off the node - {{ item }} - ansible.builtin.command: rpower {{ item }} off + ansible.builtin.command: "{{ xcat_path }}/rpower {{ item }} off" changed_when: true register: power_off_status rescue: diff --git a/provision/roles/mtms_provision/tasks/set_provision_image_mtms.yml b/provision/roles/mtms_provision/tasks/set_provision_image_mtms.yml index ff15ca137..5b8c1aaa2 100644 --- a/provision/roles/mtms_provision/tasks/set_provision_image_mtms.yml +++ b/provision/roles/mtms_provision/tasks/set_provision_image_mtms.yml @@ -16,7 +16,7 @@ - name: Task for set PXE to nodes for bmc block: - name: Set PXE to node object for bmc - {{ item }} - ansible.builtin.command: rsetboot {{ item }} net + ansible.builtin.command: "{{ xcat_path }}/rsetboot {{ item }} net" changed_when: true register: set_pxe_bmc rescue: @@ -29,7 +29,7 @@ - name: Task to power on the nodes for bmc block: - name: Power on the node - {{ item }} - ansible.builtin.command: rpower {{ item }} on + ansible.builtin.command: "{{ xcat_path }}/rpower {{ item }} on" changed_when: true register: power_on_status diff --git a/provision/roles/mtms_provision/vars/main.yml b/provision/roles/mtms_provision/vars/main.yml index 95c4de587..c53d1fda6 100644 --- a/provision/roles/mtms_provision/vars/main.yml +++ b/provision/roles/mtms_provision/vars/main.yml @@ -35,3 +35,4 @@ xcat_manpath_env: "/opt/xcat/share/man:$MANPATH" perl_badlang_env: 0 on_timeout: "300" on_timeout_msg: " Wait for 5 mins before powering on the nodes, as iDRACs take time to power off properly." +xcat_path: /opt/xcat/bin diff --git a/provision/tests/test_mac_gathering.yml b/provision/tests/test_mac_gathering.yml index b9cc8d3d7..67e9ae4ae 100644 --- a/provision/tests/test_mac_gathering.yml +++ b/provision/tests/test_mac_gathering.yml @@ -19,7 +19,7 @@ - /root/omnia/provision/tests/test_vars/test_mac_gathering_vars.yml - /root/omnia/input/provision_config.yml tasks: - - name: Validate SNMP installation on control plane + - name: Validate SNMP installation on Omnia Infrastructure Manager block: - name: Identify the presence of SNMP ansible.legacy.shell: 'ps -C snmpd' diff --git a/provision/tests/test_vars/test_xcat_vars.yml b/provision/tests/test_vars/test_xcat_vars.yml index 84400d2ca..f17fef41a 100644 --- a/provision/tests/test_vars/test_xcat_vars.yml +++ b/provision/tests/test_vars/test_xcat_vars.yml @@ -25,4 +25,6 @@ postgres_enabled_fail_msg: "POstgres service is not enabled. Please check!!" domain_present_success_msg: "User defined domain name is present in xCAT" no_domain_fail_msg: "User defined domain name is not configured to xCAT. Please check!!" 
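# The absolute xcat_path matters because /opt/xcat/bin is added to PATH by
# /etc/profile.d/xcat.sh, which the non-login shells Ansible spawns may never
# source. Illustrative xCAT calls with the absolute paths (node name assumed):
#   /opt/xcat/bin/rsetboot node001 net      # set one-time PXE boot
#   /opt/xcat/bin/rpower node001 on         # power on via BMC
#   /opt/xcat/bin/lsdef node001 -i status   # query provisioning status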
os_image_present_success_msg: "OS image creation verified successfully" -no_os_image_fail_msg: "OS image is not found" \ No newline at end of file +no_os_image_fail_msg: "OS image is not found" +xcat_sbin_path: /opt/xcat/sbin +xcat_path: /opt/xcat/bin \ No newline at end of file diff --git a/provision/tests/test_xcat.yml b/provision/tests/test_xcat.yml index b4b05f96a..0656d370f 100644 --- a/provision/tests/test_xcat.yml +++ b/provision/tests/test_xcat.yml @@ -22,7 +22,7 @@ - name: Validate xCAT installation block: - name: Identify xCAT is installed or not - ansible.builtin.shell: /etc/profile.d/xcat.sh | tabdump site + ansible.builtin.shell: /etc/profile.d/xcat.sh | {{ xcat_sbin_path }}/tabdump site register: xcat_installation_msg no_log: true @@ -110,12 +110,12 @@ - name: Validate user defined domain name configuration in xcat block: - name: Identify xCAT is installed or not - ansible.builtin.shell: /etc/profile.d/xcat.sh | tabdump site + ansible.builtin.shell: /etc/profile.d/xcat.sh | {{ xcat_sbin_path }}/tabdump site register: xcat_installation_msg no_log: true - name: Check domain name in xcat - ansible.builtin.shell: source /etc/profile.d/xcat.sh | tabdump site | grep domain + ansible.builtin.shell: source /etc/profile.d/xcat.sh | {{ xcat_sbin_path }}/tabdump site | grep domain register: site_table_details when: (xcat_installation_msg.stdout != "") @@ -137,12 +137,12 @@ - name: Validate OS image creation block: - name: Identify xcat installation - ansible.builtin.shell: /etc/profile.d/xcat.sh | tabdump site + ansible.builtin.shell: /etc/profile.d/xcat.sh | {{ xcat_sbin_path }}/tabdump site register: xcat_installation_msg no_log: true - name: Get the OS image details - ansible.builtin.shell: "lsdef -t osimage | grep install-compute" + ansible.builtin.shell: "{{ xcat_path }}/lsdef -t osimage | grep install-compute" register: xcat_os_image_details when: (xcat_installation_msg.stdout != "") diff --git a/requirements_collections.yml b/requirements_collections.yml new file mode 100644 index 000000000..a2a15f7b3 --- /dev/null +++ b/requirements_collections.yml @@ -0,0 +1,25 @@ +--- +collections: + - name: ansible.utils + version: 5.1.1 + - name: community.crypto + version: 2.22.0 + - name: community.docker + version: 3.12.1 + - name: community.general + version: 9.4.0 + - name: community.grafana + version: 2.1.0 + - name: community.mysql + version: 3.10.3 + - name: community.postgresql + version: 3.5.0 + - name: dellemc.os10 + version: 1.1.1 + - name: dellemc.openmanage + version: 9.6.0 + - name: kubernetes.core + version: 5.0.0 + - name: https://github.com/kubernetes-sigs/kubespray + type: git + version: v2.25.0 diff --git a/scheduler/ansible.cfg b/scheduler/ansible.cfg index 45cd66c51..8c58204b1 100644 --- a/scheduler/ansible.cfg +++ b/scheduler/ansible.cfg @@ -5,6 +5,7 @@ forks = 5 timeout = 180 executable = /bin/bash display_skipped_hosts = false +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -12,4 +13,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/scheduler/delete_roce_plugin.yml b/scheduler/delete_roce_plugin.yml index 41cd42cb3..1e703f01c 100644 --- a/scheduler/delete_roce_plugin.yml +++ b/scheduler/delete_roce_plugin.yml @@ -13,6 +13,10 @@ # limitations under the License. 
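# Because ansible.cfg now sets collections_path = $VIRTUAL_ENV, the pinned
# collections live inside the Omnia venv. If they ever need to be reinstalled
# by hand (a sketch; prereq.sh normally handles this), with the venv active:
#   ansible-galaxy collection install -r requirements_collections.yml -p "$VIRTUAL_ENV"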
--- +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Delete ROCE pod and its dependencies from k8s control plane hosts: kube_control_plane gather_facts: true diff --git a/scheduler/deploy_roce_plugin.yml b/scheduler/deploy_roce_plugin.yml index c3a1dc25e..bade4b88a 100644 --- a/scheduler/deploy_roce_plugin.yml +++ b/scheduler/deploy_roce_plugin.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Validate roce_plugin input parameters hosts: localhost connection: local diff --git a/scheduler/job_based_user_access.yml b/scheduler/job_based_user_access.yml index f82fdd33c..f5719feed 100644 --- a/scheduler/job_based_user_access.yml +++ b/scheduler/job_based_user_access.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,6 +17,10 @@ # The inventory queried in the below command is to be created by the user prior to running `omnia.yml`. # Command to execute: ansible-playbook job_based_user_access.yml -i inventory +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Include omnia_config.yml variables hosts: localhost connection: local diff --git a/scheduler/k8s_access.yml b/scheduler/k8s_access.yml index bd55060cd..261be916a 100644 --- a/scheduler/k8s_access.yml +++ b/scheduler/k8s_access.yml @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
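# The kubelet_custom_flags expression used in the playbooks below expands, when
# topology_manager_policy is not 'none', to standard kubelet flags, e.g.:
#   --topology-manager-policy=best-effort --topology-manager-scope=container
# kubelet accepts none/best-effort/restricted/single-numa-node for the policy
# and container/pod for the scope.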
--- + +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Run Kubernetes Access Configuration hosts: kube_control_plane gather_facts: true diff --git a/scheduler/playbooks/k8s_add_node.yml b/scheduler/playbooks/k8s_add_node.yml index 441f551ef..836b011df 100644 --- a/scheduler/playbooks/k8s_add_node.yml +++ b/scheduler/playbooks/k8s_add_node.yml @@ -27,10 +27,15 @@ kube_network_plugin: "{{ hostvars['localhost']['k8s_cni'] }}" kube_service_addresses: "{{ hostvars['localhost']['k8s_service_addresses'] }}" kube_pods_subnet: "{{ hostvars['localhost']['k8s_pod_network_cidr'] }}" - metallb_enabled: true - metallb_speaker_enabled: true kube_proxy_strict_arp: true kube_proxy_mode: 'iptables' + kubelet_custom_flags: "{% if hostvars['localhost']['topology_manager_policy'] != 'none' %} + --topology-manager-policy={{ hostvars['localhost']['topology_manager_policy'] }} + --topology-manager-scope={{ hostvars['localhost']['topology_manager_scope'] }} + {% endif %}" + metallb_enabled: true + metallb_speaker_enabled: true + metallb_namespace: "metallb-system" metallb_config: address_pools: primary: @@ -54,22 +59,24 @@ - host: https://mirror.gcr.io capabilities: ["pull", "resolve"] skip_verify: false - kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubeadm.tar.gz" - kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubectl.tar.gz" - kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubelet.tar.gz" - crictl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cri-tools-v1.26.1.tar.gz" - calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicoctl-v3.25.2.tar.gz" - calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicocrds-v3.25.2.tar.gz" - cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cni-plugins-v1.3.0.tar.gz" - nerdctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/nerdctl-v1.5.0.tar.gz" - runc_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/runc.amd64.tar.gz" + kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubeadm_package'] }}.tar.gz" + kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubectl_package'] }}.tar.gz" + kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubelet_package'] }}.tar.gz" + crictl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['crictl_package'] }}.tar.gz" + calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['calicoctl_package'] }}.tar.gz" + calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['calico_crds_package'] }}.tar.gz" + cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['cni_package'] }}.tar.gz" + nerdctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['nerdctl_package'] }}.tar.gz" + runc_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['runc_package'] }}.tar.gz" docker_rh_repo_base_url: "" docker_rh_repo_gpgkey: "" - etcd_download_url: "{{ 
hostvars['localhost']['offline_tarball_path'] }}/etcd-v3.5.10.tar.gz" - containerd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/containerd-1.7.5.tar.gz" - helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/helm-v3.12.3.tar.gz" + etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['etcd_package'] }}.tar.gz" + containerd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['containerd_package'] }}.tar.gz" + helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['helm_package'] }}.tar.gz" http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "localhost,127.0.0.1,{{ hostvars['localhost']['admin_nic_ip'] }},{{ hostvars['localhost']['k8s_service_addresses'] }},{{ hostvars['localhost']['k8s_pod_network_cidr'] }},{{ hostvars['localhost']['k8s_cni'] }},{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['domain_name'] }}" # noqa: yaml[line-length] + additional_no_proxy: "{{ hostvars['localhost']['user_no_proxy'] if hostvars['localhost']['no_proxy_input_status'] else '' }}" # noqa: yaml[line-length] - name: Initiate add node when: @@ -85,9 +92,14 @@ kube_network_plugin: "{{ hostvars['localhost']['k8s_cni'] }}" kube_service_addresses: "{{ hostvars['localhost']['k8s_service_addresses'] }}" kube_pods_subnet: "{{ hostvars['localhost']['k8s_pod_network_cidr'] }}" + kubelet_custom_flags: "{% if hostvars['localhost']['topology_manager_policy'] != 'none' %} + --topology-manager-policy={{ hostvars['localhost']['topology_manager_policy'] }} + --topology-manager-scope={{ hostvars['localhost']['topology_manager_scope'] }} + {% endif %}" + kube_proxy_strict_arp: true metallb_enabled: true metallb_speaker_enabled: true - kube_proxy_strict_arp: true + metallb_namespace: "metallb-system" metallb_config: address_pools: primary: @@ -111,19 +123,21 @@ - host: https://mirror.gcr.io capabilities: ["pull", "resolve"] skip_verify: false - kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubeadm.tar.gz" - kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubectl.tar.gz" - kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubelet.tar.gz" - crictl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cri-tools-v1.26.1.tar.gz" - calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicoctl-v3.25.2.tar.gz" - calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicocrds-v3.25.2.tar.gz" - cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cni-plugins-v1.3.0.tar.gz" - nerdctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/nerdctl-v1.5.0.tar.gz" - runc_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/runc.amd64.tar.gz" + kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubeadm_package'] }}.tar.gz" + kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubectl_package'] }}.tar.gz" + kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubelet_package'] }}.tar.gz" + crictl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['crictl_package'] }}.tar.gz" + calicoctl_download_url: "{{ 
hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['calicoctl_package'] }}.tar.gz" + calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['calico_crds_package'] }}.tar.gz" + cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['cni_package'] }}.tar.gz" + nerdctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['nerdctl_package'] }}.tar.gz" + runc_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['runc_package'] }}.tar.gz" docker_rh_repo_base_url: "" docker_rh_repo_gpgkey: "" - etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/etcd-v3.5.10.tar.gz" - containerd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/containerd-1.7.5.tar.gz" - helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/helm-v3.12.3.tar.gz" + etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['etcd_package'] }}.tar.gz" + containerd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['containerd_package'] }}.tar.gz" + helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['helm_package'] }}.tar.gz" http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "localhost,127.0.0.1,{{ hostvars['localhost']['admin_nic_ip'] }},{{ hostvars['localhost']['k8s_service_addresses'] }},{{ hostvars['localhost']['k8s_pod_network_cidr'] }},{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['domain_name'] }}" # noqa: yaml[line-length] + additional_no_proxy: "{{ hostvars['localhost']['user_no_proxy'] if hostvars['localhost']['no_proxy_input_status'] else '' }}" # noqa: yaml[line-length] diff --git a/scheduler/playbooks/k8s_start_setup.yml b/scheduler/playbooks/k8s_start_setup.yml index 77c869e78..30481a1f2 100644 --- a/scheduler/playbooks/k8s_start_setup.yml +++ b/scheduler/playbooks/k8s_start_setup.yml @@ -26,10 +26,15 @@ kube_network_plugin: "{{ hostvars['localhost']['k8s_cni'] }}" kube_service_addresses: "{{ hostvars['localhost']['k8s_service_addresses'] }}" kube_pods_subnet: "{{ hostvars['localhost']['k8s_pod_network_cidr'] }}" - metallb_enabled: true - metallb_speaker_enabled: true + kubelet_custom_flags: "{% if hostvars['localhost']['topology_manager_policy'] != 'none' %} + --topology-manager-policy={{ hostvars['localhost']['topology_manager_policy'] }} + --topology-manager-scope={{ hostvars['localhost']['topology_manager_scope'] }} + {% endif %}" kube_proxy_strict_arp: true kube_proxy_mode: 'iptables' + metallb_enabled: true + metallb_speaker_enabled: true + metallb_namespace: "metallb-system" metallb_config: address_pools: primary: @@ -53,19 +58,21 @@ - host: https://mirror.gcr.io capabilities: ["pull", "resolve"] skip_verify: false - kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubeadm.tar.gz" - kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubectl.tar.gz" - kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubelet.tar.gz" - crictl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cri-tools-v1.26.1.tar.gz" - calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicoctl-v3.25.2.tar.gz" - calico_crds_download_url: "{{ 
hostvars['localhost']['offline_tarball_path'] }}/calicocrds-v3.25.2.tar.gz" - cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cni-plugins-v1.3.0.tar.gz" - nerdctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/nerdctl-v1.5.0.tar.gz" - runc_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/runc.amd64.tar.gz" + kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubeadm_package'] }}.tar.gz" + kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubectl_package'] }}.tar.gz" + kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['kubelet_package'] }}.tar.gz" + crictl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['crictl_package'] }}.tar.gz" + calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['calicoctl_package'] }}.tar.gz" + calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['calico_crds_package'] }}.tar.gz" + cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['cni_package'] }}.tar.gz" + nerdctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['nerdctl_package'] }}.tar.gz" + runc_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['runc_package'] }}.tar.gz" docker_rh_repo_base_url: "" docker_rh_repo_gpgkey: "" - etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/etcd-v3.5.10.tar.gz" - containerd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/containerd-1.7.5.tar.gz" - helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/helm-v3.12.3.tar.gz" + etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['etcd_package'] }}.tar.gz" + containerd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['containerd_package'] }}.tar.gz" + helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ hostvars['localhost']['helm_package'] }}.tar.gz" http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "localhost,127.0.0.1,{{ hostvars['localhost']['admin_nic_ip'] }},{{ hostvars['localhost']['k8s_service_addresses'] }},{{ hostvars['localhost']['k8s_pod_network_cidr'] }},{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['domain_name'] }}" # noqa: yaml[line-length] + additional_no_proxy: "{{ hostvars['localhost']['user_no_proxy'] if hostvars['localhost']['no_proxy_input_status'] else '' }}" # noqa: yaml[line-length] diff --git a/scheduler/readme.rst b/scheduler/readme.rst index 1599fe4ba..545aac3f9 100644 --- a/scheduler/readme.rst +++ b/scheduler/readme.rst @@ -1,18 +1,186 @@ Scheduler ========== +Before you build clusters +-------------------------- + +* Verify that all inventory files are updated. + +* If the target cluster requires more than 10 kubernetes nodes, use a docker enterprise account to avoid docker pull limits. + +* Verify that all nodes are assigned a group. Use the inventory as a reference. + + * The manager group should have exactly 1 manager node. + + * The compute group should have at least 1 node. + + * The login_node group is optional. If present, it should have exactly 1 node. 
+
+* Users should also ensure that all repos are available on the target nodes running RHEL.
+
+.. note:: The inventory file accepts both IPs and FQDNs as long as they can be resolved by DNS.
+
+* Nodes provisioned using the Omnia provision tool do not require a RedHat subscription to run ``scheduler.yml`` on RHEL target nodes.
+
+* For RHEL target nodes not provisioned by Omnia, ensure that the RedHat subscription is enabled on all target nodes. Every target node will require a RedHat subscription.
+
 **Features enabled by omnia.yml**
-    * Centralized authentication: Once all the required parameters in `security_config.yml `_ are filled in, ``omnia.yml`` can be used to set up FreeIPA/OpenLDAP.
+* Slurm: Once all the required parameters in ``omnia_config.yml`` are filled in, ``omnia.yml`` can be used to set up Slurm.
+
+* Login Node (Additionally secure login node)
+
+* Kubernetes: Once all the required parameters in ``omnia_config.yml`` are filled in, ``omnia.yml`` can be used to set up Kubernetes.
+
+* BeeGFS bolt on installation
+
+* NFS bolt on support
+
+
+Input parameters for the cluster
+-------------------------------------
+
+These parameters are located in ``input/omnia_config.yml``.
+
+.. note::
+
+    The ``input/omnia_config.yml`` file is encrypted on the first run of the provision tool.
+    To view the encrypted parameters: ::
+
+        ansible-vault view omnia_config.yml --vault-password-file .omnia_vault_key
+
+    To edit the encrypted parameters: ::
+
+        ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key
+
+
+Building clusters
+------------------
+
+1. In the ``input/omnia_config.yml`` file, provide the `required details `_.
+
+.. note:: Without the login node, Slurm jobs can be scheduled only through the manager node.
+
+2. Create an inventory file in the *omnia* folder. Add the manager node IP address under the *[manager]* group, compute node IP addresses under the *[compute]* group, and the login node IP address under the *[login_node]* group. Check out the `sample inventory for more information <../samplefiles.html>`_.
+
+.. note::
+    * RedHat nodes that are not configured by Omnia need to have a valid subscription. To set up a subscription, `click here `_.
+    * Omnia creates a log file which is available at: ``/var/log/omnia.log``.
+    * If only Slurm is being installed on the cluster, docker credentials are not required.
+
+3. To run ``omnia.yml``: ::
+
+        ansible-playbook omnia.yml -i inventory
+
+
+.. note::
+    * To visualize the cluster (Slurm/Kubernetes) metrics on Grafana (on the Omnia Infrastructure Manager) during the run of ``omnia.yml``, add the parameters ``grafana_username`` and ``grafana_password`` (that is, ``ansible-playbook omnia.yml -i inventory -e grafana_username="" -e grafana_password=""``). Note that Grafana is not installed by ``omnia.yml`` if it's not available on the Omnia Infrastructure Manager.
+    * Omnia does not recommend placing the same node in both the manager and login_node groups in the inventory.
+
+**Using Skip Tags**
+
+Using skip tags, the scheduler running on the cluster can be set to Slurm or Kubernetes while running the ``omnia.yml`` playbook. This choice can be made depending on the expected HPC/AI workloads.
+
+    * Slurm: ``ansible-playbook omnia.yml -i inventory --skip-tags "kubernetes"`` (skips Kubernetes, setting Slurm as the scheduler)
+
+    * Kubernetes: ``ansible-playbook omnia.yml -i inventory --skip-tags "slurm"`` (skips Slurm, setting Kubernetes as the scheduler)
+
+.. note::
+    * If you want to view or edit the ``omnia_config.yml`` file, run the following commands:
+
+        - ``ansible-vault view omnia_config.yml --vault-password-file .omnia_vault_key`` -- To view the file.
+
+        - ``ansible-vault edit omnia_config.yml --vault-password-file .omnia_vault_key`` -- To edit the file.
+
+    * It is suggested that you use the ansible-vault view or edit commands rather than the ansible-vault decrypt or encrypt commands. If you have used the ansible-vault decrypt or encrypt commands, provide 644 permissions to ``omnia_config.yml``.
+
+**Kubernetes Roles**
+
+As part of setting up Kubernetes roles, ``omnia.yml`` handles the following tasks on the manager and compute nodes:
+
+    * Docker is installed.
+    * Kubernetes is installed.
+    * Helm package manager is installed.
+    * All required services are started (such as kubelet).
+    * Different operators are configured via Helm.
+    * Prometheus is installed.
+
+**Slurm Roles**
+
+As part of setting up Slurm roles, ``omnia.yml`` handles the following tasks on the manager and compute nodes:
+
+    * Slurm is installed.
+    * All required services are started (such as slurmd, slurmctld, slurmdbd).
+    * Prometheus is installed to visualize Slurm metrics.
+    * Lua and Lmod are installed as Slurm modules.
+    * Slurm restd is set up.
+
+**Login node**
+
+If a login node is available and mentioned in the inventory file, the following tasks are executed:
+
+    * Slurmd is installed.
+    * All required configurations are made to the ``slurm.conf`` file to enable a Slurm login node.
+
+.. include:: ../../Appendices/hostnamereqs.rst
+
+.. note::
+
+    * To enable the login node, ensure that ``login_node_required`` in ``input/omnia_config.yml`` is set to true.
+
+**Slurm job based user access**
+
+To ensure security while running jobs on the cluster, users can be assigned permissions to access compute nodes only while their jobs are running. To enable the feature: ::
+
+    cd scheduler
+    ansible-playbook job_based_user_access.yml -i inventory
+
+
+.. note::
+
+    * The inventory queried in the above command is to be created by the user prior to running ``omnia.yml``, as ``scheduler.yml`` is invoked by ``omnia.yml``.
+
+    * Only users added to the 'slurm' group can execute Slurm jobs. To add users to the group, use the command: ``usermod -a -G slurm ``.
+
+**Running Slurm MPI jobs on clusters**
+
+To enhance the productivity of the cluster, Slurm allows users to run jobs in a parallel-computing architecture. This is used to efficiently utilize all available computing resources.
+
+.. note::
+
+    * Omnia does not install MPI packages by default. Users hoping to leverage the Slurm-based MPI execution feature are required to install the relevant packages from a source of their choosing.
+
+    * Running jobs as individual users (and not as root) requires that passwordless SSH be enabled between compute nodes for the user.
+
+**For Intel**
+
+To run an MPI job on an Intel processor, set the following environment variables on the head nodes or within the job script:
+
+    - ``I_MPI_PMI_LIBRARY`` = ``/usr/lib64/pmix/``
+    - ``FI_PROVIDER`` = ``sockets`` (When an InfiniBand network is not available, this variable needs to be set)
+    - ``LD_LIBRARY_PATH`` (Use this variable to point to the location of the Intel/Python library folder. For example: ``$LD_LIBRARY_PATH:/mnt/jobs/intelpython/python3.9/envs/2022.2.1/lib/``)
+
+**For AMD**
+
+To run an MPI job on an AMD processor, set the following environment variables on the head nodes or within the job script:
+
+    - ``PATH`` (Use this variable to point to the location of the OpenMPI binary folder. For example: ``PATH=$PATH:/appshare/openmpi/bin``)
+    - ``LD_LIBRARY_PATH`` (Use this variable to point to the location of the OpenMPI library folder. For example: ``$LD_LIBRARY_PATH:/appshare/openmpi/lib``)
+    - ``OMPI_ALLOW_RUN_AS_ROOT`` = ``1`` (To run jobs as a root user, set this variable to ``1``)
+    - ``OMPI_ALLOW_RUN_AS_ROOT_CONFIRM`` = ``1`` (To run jobs as a root user, set this variable to ``1``)
+
-    * Slurm: Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up slurm.
-    * Login Node (Additionally secure login node)
-    * Kubernetes: Once all the required parameters in `omnia_config.yml `_ are filled in, ``omnia.yml`` can be used to set up kubernetes.
-    * BeeGFS bolt on installation: Once all the required parameters in `storage_config.yml `_ are filled in, ``omnia.yml`` can be used to set up NFS.
-    * NFS bolt on support: Once all the required parameters in `storage_config.yml `_ are filled in, ``omnia.yml`` can be used to set up BeeGFS.
-    * Telemetry: Once all the required parameters in `telemetry_config.yml `_ are filled in, ``omnia.yml`` sets up `Omnia telemetry and/or iDRAC telemetry `_. It also installs `Grafana `_ and `Loki `_ as Kubernetes pods.
\ No newline at end of file
diff --git a/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_api_validation.yml b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_api_validation.yml
new file mode 100644
index 000000000..54909d418
--- /dev/null
+++ b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_api_validation.yml
@@ -0,0 +1,46 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
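# The tasks below amount to the following credential probe (illustrative shell
# equivalent; -k mirrors validate_certs: false):
#   curl -k -u "<username>:<password>" "https://<endpoint>:<port>/platform/1/auth/id"
# A 200 response confirms that the PowerScale endpoint and credentials are valid.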
+--- + +- name: Generate Base64 authentication token + ansible.builtin.shell: > + set -o pipefail && \ + echo -n "{{ item.username }}:{{ item.password }}" | base64 + register: auth_token + changed_when: false + no_log: true + +- name: Set the URL for the API request + ansible.builtin.set_fact: + api_url: >- + {{ + 'https://' + item.endpoint if 'https' not in item.endpoint else item.endpoint + }}:{{ item.endpointPort if item.endpointPort is defined else csi_powerscale_values_file.endpointPort }}/platform/1/auth/id + no_log: true + +- name: Make GET request to verify powerscale endpoint and credential + ansible.builtin.uri: + url: "{{ api_url }}" + method: GET + headers: + Authorization: "Basic {{ auth_token.stdout }}" + validate_certs: false + register: response + ignore_errors: true + no_log: true + +- name: Fail if API call to powerscale was not successful + ansible.builtin.fail: + msg: "{{ fail_msg_api_call }}: {{ response.msg }}" + when: response.status != 200 diff --git a/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_input_validation.yml b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_input_validation.yml new file mode 100644 index 000000000..b681fa20b --- /dev/null +++ b/scheduler/roles/cluster_validation/tasks/csi_powerscale_driver_input_validation.yml @@ -0,0 +1,287 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
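# The decrypt/load/re-encrypt cycle below keeps the secret file encrypted at
# rest. The manual equivalents (key file path illustrative) would be:
#   ansible-vault decrypt secret.yaml --vault-password-file <vault_key_file>
#   ansible-vault view    secret.yaml --vault-password-file <vault_key_file>
#   ansible-vault encrypt secret.yaml --vault-password-file <vault_key_file>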
+---
+
+# Encryption for secret file
+- name: Check if csi_powerscale_secret_vault exists
+  ansible.builtin.stat:
+    path: "{{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }}"
+  register: vault_key_result
+
+- name: Create ansible vault key if it does not exist
+  ansible.builtin.set_fact:
+    vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}"
+  when: not vault_key_result.stat.exists
+
+- name: Save vault key
+  ansible.builtin.lineinfile:
+    path: "{{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }}"
+    line: "{{ vault_key }}"
+    mode: "{{ vault_key_permission }}"
+    owner: root
+    create: true
+  when: not vault_key_result.stat.exists
+
+- name: Check if secret file is encrypted
+  ansible.builtin.command: cat "{{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}"
+  changed_when: false
+  register: config_content
+
+- name: Decrypt secret file
+  ansible.builtin.command: >-
+    ansible-vault decrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}
+    --vault-password-file {{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }}
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+  changed_when: false
+
+- name: Load secret file for input validation
+  ansible.builtin.include_vars:
+    file: "{{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}"
+    name: clusters
+  no_log: true
+
+- name: Encrypt secret file
+  ansible.builtin.command: >-
+    ansible-vault encrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}
+    --vault-password-file {{ role_path }}/../k8s_csi_powerscale_plugin/files/{{ csi_powerscale_secret_vaultname }}
+  changed_when: false
+
+# Validate secret file
+- name: Validate isilonClusters configuration
+  block:
+    - name: Ensure isilonClusters is a list
+      ansible.builtin.assert:
+        that:
+          - clusters.isilonClusters is defined
+          - clusters.isilonClusters is iterable
+          - clusters.isilonClusters | length > 0
+        msg: "{{ fail_msg_isilon_clusters }}"
+
+    - name: Validate each cluster entry
+      block:
+        - name: Validate clusterName in secret.yaml
+          block:
+            - name: Validate clusterName is a non-empty string
+              ansible.builtin.assert:
+                that:
+                  - item.clusterName is defined
+                  - item.clusterName | length > 0
+              loop: "{{ clusters.isilonClusters }}"
+              no_log: true
+          rescue:
+            - name: Invalid clusterName
+              ansible.builtin.fail:
+                msg: "{{ fail_msg_cluster_name }}"
+
+        - name: Validate username in secret.yaml
+          block:
+            - name: Validate username is a non-empty string
+              ansible.builtin.assert:
+                that:
+                  - item.username is defined
+                  - item.username | length > 0
+              loop: "{{ clusters.isilonClusters }}"
+              no_log: true
+          rescue:
+            - name: Invalid username
+              ansible.builtin.fail:
+                msg: "{{ fail_msg_user_name }}"
+
+        - name: Validate password in secret.yaml
+          block:
+            - name: Validate password is a non-empty string
+              ansible.builtin.assert:
+                that:
+                  - item.password is defined
+                  - item.password | length > 0
+              loop: "{{ clusters.isilonClusters }}"
+              no_log: true
+          rescue:
+            - name: Invalid password
+              ansible.builtin.fail:
+                msg: "{{ fail_msg_password }}"
+
+        - name: Validate endpoint in secret.yaml
+          block:
+            - name: Validate endpoint is a non-empty string
+              ansible.builtin.assert:
+                that:
+                  - item.endpoint is defined
+                  - item.endpoint | length > 0
+              loop: "{{ clusters.isilonClusters }}"
+              no_log: true
+          rescue:
+            - name: Invalid endpoint
+              ansible.builtin.fail:
+                msg: "{{ fail_msg_endpoint }}"
+
+        - name: Validate endpointPort in secret.yaml
endpointPort in secret.yaml + block: + - name: Validate endpointPort is a non-empty string + when: item.endpointPort is defined + ansible.builtin.assert: + that: + - item.endpointPort is integer + - item.endpointPort > 0 and item.endpointPort < 65536 + loop: "{{ clusters.isilonClusters }}" + no_log: true + rescue: + - name: Invalid endpointPort + ansible.builtin.fail: + msg: "{{ fail_msg_endpoint_port }}" + + - name: Validate isDefault in secret.yaml + block: + - name: Validate isDefault is boolean + ansible.builtin.assert: + that: + - item.isDefault is defined + - item.isDefault is boolean + loop: "{{ clusters.isilonClusters }}" + no_log: true + rescue: + - name: Invalid isDefault + ansible.builtin.fail: + msg: "{{ fail_msg_isdefault }}" + + - name: Validate skipCertificateValidation in secret.yaml + block: + - name: Validate skipCertificateValidation is true + when: item.skipCertificateValidation is defined + ansible.builtin.assert: + that: + - item.skipCertificateValidation in [true] + loop: "{{ clusters.isilonClusters }}" + no_log: true + rescue: + - name: Invalid skipCertificateValidation + ansible.builtin.fail: + msg: "{{ fail_msg_skip_certificate_validation }}" + + - name: Validate isiPath in secret.yaml + block: + - name: Validate isiPath is a valid Unix absolute path + when: item.isiPath is defined + ansible.builtin.assert: + that: + - item.isiPath is match('^/[^/].*') + loop: "{{ clusters.isilonClusters }}" + no_log: true + rescue: + - name: Invalid isiPath + ansible.builtin.fail: + msg: "{{ fail_msg_isipath }}" + + - name: Validate isiVolumePathPermissions in secret.yaml + block: + - name: Validate isiVolumePathPermissions is a valid octal mode number + when: item.isiVolumePathPermissions is defined + ansible.builtin.assert: + that: + - item.isiVolumePathPermissions is string + - item.isiVolumePathPermissions | length > 0 + loop: "{{ clusters.isilonClusters }}" + no_log: true + rescue: + - name: Invalid isiVolumePathPermissions + ansible.builtin.fail: + msg: "{{ fail_msg_isi_volume_path_permissions }}" + +# Validate mandate user input in values file for csi driver +- name: Load values.yaml file + ansible.builtin.include_vars: + file: "{{ hostvars['localhost']['csi_powerscale_driver_values_file_path'] }}" + name: csi_powerscale_values_file + +- name: Validate controller count + ansible.builtin.assert: + that: + - csi_powerscale_values_file.controller.controllerCount == 1 + msg: | + "Invalid controllerCount value: {{ csi_powerscale_values_file.controller.controllerCount }}. It must be 1 in values.yaml file." + +- name: Validate replication enabled + ansible.builtin.assert: + that: + - csi_powerscale_values_file.controller.replication.enabled is defined + - csi_powerscale_values_file.controller.replication.enabled in [false] + msg: | + "Invalid replication enabled value: {{ csi_powerscale_values_file.controller.replication.enabled }}. It must be false in values.yaml file." + +- name: Validate resizer enabled + ansible.builtin.assert: + that: + - csi_powerscale_values_file.controller.resizer.enabled is defined + - csi_powerscale_values_file.controller.resizer.enabled in [false, true] + msg: "Invalid resizer enabled value: {{ csi_powerscale_values_file.controller.resizer.enabled }}. It must be true or false in values.yaml file." 
+
+- name: Validate snapshot enabled
+  ansible.builtin.assert:
+    that:
+      - csi_powerscale_values_file.controller.snapshot.enabled is defined
+      - csi_powerscale_values_file.controller.snapshot.enabled in [true]
+    msg: "Invalid snapshot enabled value: {{ csi_powerscale_values_file.controller.snapshot.enabled }}. It must be true in values.yaml file."
+
+- name: Validate endpointPort
+  ansible.builtin.assert:
+    that:
+      - csi_powerscale_values_file.endpointPort is defined
+      - csi_powerscale_values_file.endpointPort | int >= 1
+      - csi_powerscale_values_file.endpointPort | int <= 65535
+    msg: "Invalid endpointPort: {{ csi_powerscale_values_file.endpointPort }}. It must be between 1 and 65535 in values.yaml file."
+
+- name: Validate skipCertificateValidation
+  ansible.builtin.assert:
+    that:
+      - csi_powerscale_values_file.skipCertificateValidation is defined
+      - csi_powerscale_values_file.skipCertificateValidation in [true]
+    msg: "Invalid skipCertificateValidation value: {{ csi_powerscale_values_file.skipCertificateValidation }}. It must be true in values.yaml file."
+
+- name: Set skipCertificateValidation to be used later
+  ansible.builtin.set_fact:
+    skip_certificate_validation_value: "{{ csi_powerscale_values_file.skipCertificateValidation }}"
+
+- name: Validate isiAuthType
+  ansible.builtin.assert:
+    that:
+      - csi_powerscale_values_file.isiAuthType is defined
+      - csi_powerscale_values_file.isiAuthType in [0, 1]
+    msg: |
+      "Invalid isiAuthType: {{ csi_powerscale_values_file.isiAuthType }}.
+      It must be 0 (basic authentication) or 1 (session-based authentication) in values.yaml file."
+
+- name: Validate isiAccessZone
+  ansible.builtin.assert:
+    that:
+      - csi_powerscale_values_file.isiAccessZone is defined
+      - csi_powerscale_values_file.isiAccessZone | length > 0
+    msg: "Invalid isiAccessZone: {{ csi_powerscale_values_file.isiAccessZone }}. It must be a non-empty string in values.yaml file."
+
+- name: Validate isiPath
+  ansible.builtin.assert:
+    that:
+      - csi_powerscale_values_file.isiPath is defined
+      - csi_powerscale_values_file.isiPath | regex_search('^/[^/].*') # Basic validation for Unix absolute path
+    msg: "Invalid isiPath: {{ csi_powerscale_values_file.isiPath }}. It must be a valid Unix absolute path in values.yaml file."
+
+- name: Validate isiVolumePathPermissions
+  ansible.builtin.assert:
+    that:
+      - csi_powerscale_values_file.isiVolumePathPermissions is defined
+      - csi_powerscale_values_file.isiVolumePathPermissions | length > 0
+    msg: "Invalid isiVolumePathPermissions: {{ csi_powerscale_values_file.isiVolumePathPermissions }}. It must be valid octal mode in values.yaml file."
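
For reference, a minimal sketch of a secret.yaml cluster entry that would satisfy the isilonClusters assertions above, alongside the values.yaml controller fields checked here; every concrete value (cluster name, user, address, path) is illustrative only:

    # secret.yaml (hypothetical entry)
    isilonClusters:
      - clusterName: "ps-cluster-1"          # non-empty string
        username: "csiadmin"                 # non-empty string
        password: "csi-password"             # non-empty string
        endpoint: "10.20.30.40"              # IP or hostname; https:// is prepended by the API check when missing
        endpointPort: 8080                   # optional; integer in 1-65535
        isDefault: true                      # boolean
        skipCertificateValidation: true      # must be true
        isiPath: "/ifs/data/csi"             # Unix absolute path
        isiVolumePathPermissions: "0777"     # quoted octal mode string

    # values.yaml (controller fields validated above)
    controller:
      controllerCount: 1                     # must be exactly 1
      replication:
        enabled: false                       # must be false
      resizer:
        enabled: true                        # true or false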
+ +- name: Validate powerscale ip and credential in secret.yaml file using API call to powerscale + ansible.builtin.include_tasks: csi_powerscale_driver_api_validation.yml + loop: "{{ clusters.isilonClusters }}" diff --git a/scheduler/roles/cluster_validation/tasks/fetch_omnia_inputs.yml b/scheduler/roles/cluster_validation/tasks/fetch_omnia_inputs.yml index 520dfea90..137d04280 100644 --- a/scheduler/roles/cluster_validation/tasks/fetch_omnia_inputs.yml +++ b/scheduler/roles/cluster_validation/tasks/fetch_omnia_inputs.yml @@ -99,14 +99,26 @@ fail_msg: "{{ restart_services_failure_msg }}" when: slurm_support +- name: Check virtual ENV + ansible.builtin.set_fact: + venv_path: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}" + - name: Validate k8s_support variable when: k8s_support block: - - name: Assert kubernetes version + - name: Fail if kubernetes version is other than 1.26.12 or omnia161_venv is not activated for RHEL/Rocky 8.6 or 8.7 + ansible.builtin.fail: + msg: "{{ kube_version_on_unsupported_os }}" + when: + - cluster_os_type in ['rhel', 'rocky'] + - cluster_os_version in ['8.6', '8.7'] + - (omnia161_k8s_version not in k8s_version) or ('omnia161_venv' not in venv_path) + + - name: Assert supported kubernetes version ansible.builtin.assert: that: - - k8s_version | default("", true) | length > 1 - - "'1.26.12' in k8s_version" + - k8s_version | default("", false) | length > 1 + - "('omnia161_venv' in venv_path and '{{ omnia161_k8s_version }}' in k8s_version) or ( 'omnia17_venv' in venv_path and '{{ omnia17_k8s_version }}' in k8s_version)" # noqa: yaml[line-length] success_msg: "{{ success_msg_k8s_version }}" fail_msg: "{{ fail_msg_k8s_version }}" @@ -142,6 +154,52 @@ success_msg: "{{ success_msg_k8s_pod_network_cidr }}" fail_msg: "{{ fail_msg_k8s_pod_network_cidr }}" + - name: Assert kubernetes topology manager policy + ansible.builtin.assert: + that: + - topology_manager_policy | default("", true) | length > 1 + - "topology_manager_policy in supported_topology_manager_policy" + success_msg: "{{ success_msg_k8s_toplogy_manager_policy }}" + fail_msg: "{{ fail_msg_k8s_toplogy_manager_policy }}" + + - name: Assert kubernetes topology manager scope + ansible.builtin.assert: + that: + - topology_manager_scope | default("", true) | length > 1 + - "topology_manager_scope in supported_topology_manager_scope" + success_msg: "{{ success_msg_k8s_toplogy_manager_scope }}" + fail_msg: "{{ fail_msg_k8s_toplogy_manager_scope }}" + +- name: Initialize flag for csi powerscale driver installation + ansible.builtin.set_fact: + csi_driver_powerscale_precheck_pass: false + +- name: Validate csi powerscale variables + when: csi_driver_powerscale_support + block: + - name: Validate that csi_powerscale_driver_secret_file_path contains a valid path + ansible.builtin.assert: + that: + - csi_powerscale_driver_secret_file_path is defined + - csi_powerscale_driver_secret_file_path | length > 0 + success_msg: "{{ csi_driver_secret_file_path_success_msg }}" + fail_msg: "{{ csi_driver_secret_file_path_fail_msg }}" + + - name: Validate that csi_powerscale_driver_values_file_path contains a valid path + ansible.builtin.assert: + that: + - csi_powerscale_driver_values_file_path is defined + - csi_powerscale_driver_values_file_path | length > 0 + success_msg: "{{ csi_driver_values_file_path_success_msg }}" + fail_msg: "{{ csi_driver_values_file_path_fail_msg }}" + + - name: Validate user input for csi powerscale driver + ansible.builtin.include_tasks: csi_powerscale_driver_input_validation.yml + + - name: Set flag 
for csi powerscale driver installation to true after assertion pass + ansible.builtin.set_fact: + csi_driver_powerscale_precheck_pass: true + - name: Save input variables from file ansible.builtin.set_fact: db_password: "{{ mariadb_password }}" diff --git a/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml b/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml index 057d68557..81055d8aa 100644 --- a/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml +++ b/scheduler/roles/cluster_validation/tasks/fetch_software_config.yml @@ -17,6 +17,9 @@ ansible.builtin.set_fact: k8s_support: false slurm_support: false + set_intel_config_status: false + set_intel_input_status: false + set_intelgaudi_input_status: false - name: Load software_config.json ansible.builtin.include_vars: @@ -35,6 +38,7 @@ - name: Set facts for cluster ansible.builtin.set_fact: cluster_os_type: "{{ software_config.cluster_os_type }}" + cluster_os_version: "{{ software_config.cluster_os_version }}" - name: Check if slurm support is required ansible.builtin.fail: @@ -43,6 +47,15 @@ - slurm_support is true - cluster_os_type == compute_os_ubuntu +- name: Check if csi powerscale driver installation is required + ansible.builtin.set_fact: + csi_driver_powerscale_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'csi_driver_powerscale') | list | length > 0 }}" + +- name: Load csi_driver_powerscale.json + ansible.builtin.set_fact: + csi_driver_powerscale_packages_json: "{{ lookup('file', csi_driver_powerscale_packages_file) | from_json }}" + when: csi_driver_powerscale_support + - name: Check if ucx is true ansible.builtin.set_fact: ucx_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'ucx') | list | length > 0 }}" @@ -103,6 +116,98 @@ ansible.builtin.set_fact: k8s_packages_json: "{{ lookup('file', k8s_packages_file) | from_json }}" + - name: Extract and set facts for tarball URLs + ansible.builtin.set_fact: + kubeadm_package: "kubeadm" + kubectl_package: "kubectl" + kubelet_package: "kubelet" + crictl_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'cri') | map(attribute='package') | join }}" # noqa: yaml[line-length] + calicoctl_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'calicoctl') | map(attribute='package') | join }}" # noqa: yaml[line-length] + calico_crds_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'calicocrds') | map(attribute='package') | join }}" # noqa: yaml[line-length] + cni_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'cni') | map(attribute='package') | join }}" # noqa: yaml[line-length] + nerdctl_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'nerdctl') | map(attribute='package') | join }}" # noqa: yaml[line-length] + runc_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'runc') | map(attribute='package') | join }}" # noqa: yaml[line-length] + etcd_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'etcd') | map(attribute='package') | join }}" # noqa: yaml[line-length] + 
containerd_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'containerd') | map(attribute='package') | join }}" # noqa: yaml[line-length]
+      helm_package: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'tarball') | selectattr('package', 'search', 'helm') | map(attribute='package') | join }}" # noqa: yaml[line-length]
+      mpi_operator: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'manifest') | selectattr('package', 'search', 'mpi') | map(attribute='package') | join }}" # noqa: yaml[line-length]
+
+    - name: Conditional logic for kubeadm, kubectl, and kubelet packages # noqa: no-jinja-when
+      ansible.builtin.set_fact:
+        kubeadm_package: "kubeadm-{{ hostvars['localhost']['k8s_version'] }}"
+        kubectl_package: "kubectl-{{ hostvars['localhost']['k8s_version'] }}"
+        kubelet_package: "kubelet-{{ hostvars['localhost']['k8s_version'] }}"
+      when: k8s_version != '{{ omnia161_k8s_version }}'
+
+  rescue:
+    - name: Unable to fetch k8s version
+      ansible.builtin.fail:
+        msg: "{{ k8s_version_fail_msg }}"
+
+- name: Get intelgaudi input status
+  ansible.builtin.set_fact:
+    set_intelgaudi_input_status: true
+  loop: "{{ software_config.softwares | default([]) }}"
+  when:
+    - "'intelgaudi' in item.name"
+  loop_control:
+    loop_var: item
+
+- name: Set set_intel_config_status
+  when: set_intelgaudi_input_status
+  block:
+    - name: Fetch intelgaudi_version
+      ansible.builtin.set_fact:
+        intelgaudi_version: "{{ software_config.softwares | selectattr('name', 'equalto', 'intelgaudi') | map(attribute='version') | first }}"
+  rescue:
+    - name: Failed - Intel gaudi version check
+      ansible.builtin.fail:
+        msg: "{{ intelgaudi_version_warning_msg }}"
+
+- name: Set intel input status
+  ansible.builtin.set_fact:
+    set_intel_input_status: true
+  loop: "{{ software_config.intelgaudi | default([]) }}"
+  when: "set_intelgaudi_input_status and 'intel' in item.name"
+  loop_control:
+    loop_var: item
+
+- name: Intel Gaudi will not be installed
+  ansible.builtin.pause:
+    prompt: "{{ intel_gaudi_input_fail_msg }}"
+    seconds: "{{ warning_time }}"
+  when:
+    - set_intelgaudi_input_status
+    - not set_intel_input_status
+
+- name: Set set_intel_config_status
+  when: set_intel_input_status
+  block:
+    - name: Set intel_directory
+      ansible.builtin.set_fact:
+        intel_directory: "{{ offline_intel_directory }}/intel/{{ intelgaudi_version }}/"
+
+    - name: Check whether intel_directory exists
+      ansible.builtin.stat:
+        path: "{{ intel_directory }}"
+      register: check_intel_dir
+
+    - name: Warning - Please wait, this task will take a few seconds
+      ansible.builtin.pause:
+        seconds: "{{ warning_time }}"
+        prompt: "{{ intel_gaudi_repo_fail_msg }}"
+      when: not check_intel_dir.stat.exists
+
+    - name: Set set_intel_config_status to true
+      ansible.builtin.set_fact:
+        set_intel_config_status: true
+      when: check_intel_dir.stat.exists
+  rescue:
+    - name: Intel Gaudi repo is not configured
+      ansible.builtin.pause:
+        seconds: "{{ warning_wait_time }}"
+        prompt: "{{ intel_gaudi_repo_fail_msg }}"

 - name: Local local_repo_access.yml file
   ansible.builtin.include_vars: "{{ local_repo_access_dest_path }}"
diff --git a/scheduler/roles/cluster_validation/tasks/gather_fact_resolution.yml b/scheduler/roles/cluster_validation/tasks/gather_fact_resolution.yml
index ca3de6ab9..14383d2fa 100644
--- a/scheduler/roles/cluster_validation/tasks/gather_fact_resolution.yml
+++ b/scheduler/roles/cluster_validation/tasks/gather_fact_resolution.yml
@@ -13,9 +13,9 @@
 # limitations under the License.
 ---
-- name: Collect cp files from ansible directory
+- name: Collect oim files from ansible directory
   ansible.builtin.find:
-    paths: "{{ cp_path }}"
+    paths: "{{ oim_path }}"
     hidden: true
     recurse: true
     file_type: any
@@ -23,7 +23,7 @@
   failed_when: false
   delegate_to: localhost

-- name: Remove cp files from ansible directory
+- name: Remove oim files from ansible directory
   ansible.builtin.file:
     path: "{{ item.path }}"
     state: absent
diff --git a/server_spec_update/roles/add_nic_network/tasks/main.yml b/scheduler/roles/cluster_validation/tasks/include_local_repo_config.yml
similarity index 61%
rename from server_spec_update/roles/add_nic_network/tasks/main.yml
rename to scheduler/roles/cluster_validation/tasks/include_local_repo_config.yml
index d08a3e424..0dc2af4aa 100644
--- a/server_spec_update/roles/add_nic_network/tasks/main.yml
+++ b/scheduler/roles/cluster_validation/tasks/include_local_repo_config.yml
@@ -13,14 +13,13 @@
 # limitations under the License.
 ---
-- name: Add additional nic info table in xCAT's network table
-  when: add_network_status
+- name: Include local_repo_config.yml vars
   block:
-    - name: Create files for stanzas
-      ansible.builtin.file:
-        path: "{{ metadata_nicinfo_path }}"
-        state: touch
-        mode: "{{ file_perm }}"
-
-    - name: Update additional nic info in xcat networks table
-      ansible.builtin.include_tasks: update_new_nic_network.yml
+    - name: Include local_repo_config.yml vars
+      ansible.builtin.include_vars: "{{ local_repo_config_file }}"
+      register: include_local_repo_config
+      no_log: true
+  rescue:
+    - name: Failed to load local_repo_config.yml
+      ansible.builtin.fail:
+        msg: "{{ local_repo_config_syntax_fail_msg }} Error: {{ include_local_repo_config.message }}"
diff --git a/scheduler/roles/cluster_validation/tasks/main.yml b/scheduler/roles/cluster_validation/tasks/main.yml
index f38ac9691..b310faf3d 100644
--- a/scheduler/roles/cluster_validation/tasks/main.yml
+++ b/scheduler/roles/cluster_validation/tasks/main.yml
@@ -18,6 +18,9 @@
     ansible_collection_used: false
     scheduler_validation_status: true

+- name: Include local_repo variables
+  ansible.builtin.include_tasks: include_local_repo_config.yml
+
 - name: Fetch software_config.json and local repo access variables
   ansible.builtin.include_tasks: fetch_software_config.yml
diff --git a/scheduler/roles/cluster_validation/tasks/slurm_validations.yml b/scheduler/roles/cluster_validation/tasks/slurm_validations.yml
index a1f500a58..97f4cf008 100644
--- a/scheduler/roles/cluster_validation/tasks/slurm_validations.yml
+++ b/scheduler/roles/cluster_validation/tasks/slurm_validations.yml
@@ -32,6 +32,12 @@
     fail_msg: "{{ slurm_node_validation_fail_msg }}"
     success_msg: "{{ slurm_node_validation_success_msg }}"

+- name: Verify slurm_control_node is not part of slurm_node group
+  ansible.builtin.fail:
+    msg: "{{ slurm_control_node_in_node_fail_msg }}"
+  when:
+    - groups['slurm_control_node'][0] in groups['slurm_node']
+
 - name: Add all hosts in slurm_control_node and slurm_node to group slurm_cluster
   ansible.builtin.add_host:
     name: '{{ item }}'
@@ -62,3 +68,10 @@
     ansible.builtin.fail:
       msg: "{{ unreachable_slurm_control_node_fail_msg }}"
   when: unreachable_slurm_control_node | length >= 1
+
+- name: Verify slurm_control_node is not part of login group
+  ansible.builtin.fail:
+    msg: "{{ slurm_control_node_in_login_fail_msg }}"
+  when:
+    - login_node_required
+    - groups['slurm_control_node'][0] in groups['login']
diff --git a/scheduler/roles/cluster_validation/vars/main.yml b/scheduler/roles/cluster_validation/vars/main.yml
index 82eed9ca6..30af38450 100644
--- a/scheduler/roles/cluster_validation/vars/main.yml
+++ b/scheduler/roles/cluster_validation/vars/main.yml
@@ -15,21 +15,31 @@
 # Usage: main.yml
 omnia_log_path: /var/log/omnia
-cp_path: "/root/.ansible/cp/"
+oim_path: "/root/.ansible/oim/"
 ansible_cfg_src: "{{ playbook_dir }}/ansible.cfg"
 ansible_cfg_dest:
   - { path: "{{ playbook_dir }}/telemetry/ansible.cfg", log_path: "/var/log/omnia/omnia_telemetry.log", regexp: "/var/log/omnia.log" }
   - { path: "{{ playbook_dir }}/platforms/ansible.cfg", log_path: "/var/log/omnia/omnia_platforms.log", regexp: "/var/log/omnia.log" }

+# Usage: include_local_repo_config.yml
+local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml"
+local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again."
+
 # Usage: fetch_software_config.yml
 software_config_json_file: "{{ role_path }}/../../../input/software_config.json"
 local_repo_access_dest_path: "/opt/omnia/offline/local_repo_access.yml"
+k8s_version_fail_msg: "Failed. Ensure the k8s version is mentioned in software_config.json"
 k8s_packages_file: "{{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/k8s.json"
 success_msg_ucx_version: "Success. ucx version is mentioned."
 fail_msg_ucx_version: "Failed. ucx version is not provided in software_config.json. Please include ucx version in input/software_config.json and rerun the playbook." # noqa: yaml[line-length]
 success_msg_openmpi_version: "Success. openmpi version is mentioned."
 fail_msg_openmpi_version: "Failed. openmpi version is not provided in software_config.json. Please include openmpi version in input/software_config.json and rerun the playbook." # noqa: yaml[line-length]
 compute_os_ubuntu: "ubuntu"
+intel_gaudi_input_fail_msg: "Warning, software_config.json does not have the intel software stack. Intel stack will not be configured."
+intel_gaudi_repo_fail_msg: "Failed, local_repo.yml is not executed for downloading Intel Gaudi driver packages."
+warning_time: 30
+offline_intel_directory: "{{ repo_store_path }}/cluster/apt"
+intelgaudi_version_warning_msg: "Failed, Intel Gaudi version not found."

 # Usage: fetch_omnia_inputs.yml
 config_filename: "omnia_config.yml"
@@ -37,13 +47,22 @@
 config_vaultname: .omnia_vault_key
 vault_key_permission: "0644"
 min_length: 8
 max_length: 30
+omnia17_k8s_version: '1.29.5'
+omnia161_k8s_version: '1.26.12'
 omnia_config_syntax_fail_msg: "Failed. Syntax errors present in omnia_config.yml. Fix errors and re-run playbook again."
 fail_msg_mariadb_password: "maria_db password not given in correct format."
 success_msg_mariadb_password: "mariadb_password validated"
 success_msg_k8s_version: "Kubernetes Version Validated"
-fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json"
+fail_msg_k8s_version: "Failed. Kubernetes Version is unsupported or incorrect in software_config.json for this virtual environment. Supported versions for omnia17_venv: 1.29.5, and for omnia161_venv: 1.26.12." # noqa: yaml[line-length]
+kube_version_on_unsupported_os: "Failed. On RHEL/Rocky 8.6 or 8.7 OS, the supported kubernetes version is 1.26.12 and the supported virtual environment is omnia161_venv only" # noqa: yaml[line-length]
 success_msg_k8s_cni: "Kubernetes CNI Validated"
 fail_msg_k8s_cni: "Kubernetes CNI not correct."
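
For reference, the supported topology manager values defined below feed the assertions in fetch_omnia_inputs.yml; a hypothetical omnia_config.yml snippet that would pass them might be (both values are examples, any listed value is accepted):

    topology_manager_policy: "best-effort"   # one of: none, best-effort, restricted, single-numa-node
    topology_manager_scope: "container"      # one of: pod, container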
+supported_topology_manager_policy: ['none', 'best-effort', 'restricted', 'single-numa-node'] +success_msg_k8s_toplogy_manager_policy: "topology_manager_policy validated" +fail_msg_k8s_toplogy_manager_policy: "topology_manager_policy can either be 'none' or 'best-effort' or 'restricted' or 'single-numa-node' in omnia_config.yml" +supported_topology_manager_scope: ['pod', 'container'] +success_msg_k8s_toplogy_manager_scope: "topology_manager_scope validated" +fail_msg_k8s_toplogy_manager_scope: "topology_manager_scope can either be 'pod' or 'container' in omnia_config.yml" success_msg_pod_external_ip_range: "pod_external_ip_range validated" fail_msg_pod_external_ip_range: "pod_external_ip_range is not given in correct format in omnia_config.yml" success_msg_k8s_service_addresses: "k8s_service_addresses validated" @@ -101,8 +120,9 @@ etcd group in inventory must have atleast one node and total node count must be etcd_node_validation_success_msg: "etcd should have odd number of nodes in the inventory" unreachable_kube_control_plane_fail_msg: "Failed. Unreachable node mentioned in inventory for kube_control_plane. Re-run playbook with reachable kube_control_plane." -ansible_collection_folder: "/root/.ansible/collections/ansible_collections/" +ansible_collection_folder: "{{ omnia_collection_path[0] }}/ansible_collections/" kubespray_certificate_key_taskfile_path: "kubernetes_sigs/kubespray/roles/kubernetes/control-plane/tasks/kubeadm-setup.yml" +max_retries: 5 # Usage: slurm_validations.yml invalid_slurm_inventory_fail_msg: "Failed. slurm software is present in software_config.json. @@ -115,6 +135,8 @@ At least one slurm_node should be present in the inventory." slurm_node_validation_success_msg: "At least one slurm_node exists in the inventory" unreachable_slurm_control_node_fail_msg: "Failed. Unreachable node mentioned in inventory for slurm_control_node. Re-run playbook with reachable slurm_control_node." +slurm_control_node_in_login_fail_msg: "Failed. Node mentioned in slurm_control_node group should not be present in login group" +slurm_control_node_in_node_fail_msg: "Failed. Node mentioned in slurm_control_node group should not be present in slurm_node group" # Usage: install_packages.yml ansible_base_version: '2.9' @@ -134,3 +156,29 @@ multiple_login_node_fail_msg: "Failed. Currently only one login node supported i warning_wait_time: 10 login_node_warning_msg: "[WARNING] login group with ip for login node not present in the inventory. Proceeding execution with provided nodes" + +# Usage: Fetch_software_config.yml +csi_driver_powerscale_packages_file: >- + {{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/csi_driver_powerscale.json + +# Usage: fetch_omnia_inputs.yml +csi_driver_secret_file_path_success_msg: "Success. csi_driver_secret_file_path is valid in omnia_config.yml" +csi_driver_secret_file_path_fail_msg: "Failed. csi_driver_secret_file_path is not valid in omnia_config.yml. Please verify the path." + +csi_driver_values_file_path_success_msg: "Success. csi_driver_values_file_path is valid in omnia_config.yml" +csi_driver_values_file_path_fail_msg: "Failed. csi_driver_values_file_path is not valid in omnia_config.yml. Please verify the path." + +# Usage: csi_powerscale_driver_input_validation.yml +csi_powerscale_secret_vaultname: ".csi_powerscale_secret_vault" +fail_msg_isilon_clusters: "isilonClusters must be a valid list of powerscale details in secret.yaml file." +fail_msg_cluster_name: "clusterName is not valid. 
Provide powerscale cluster name in secret.yaml file."
+fail_msg_user_name: "username is not valid. Provide powerscale user name in secret.yaml file."
+fail_msg_password: "password is not valid. Provide powerscale password in secret.yaml file."
+fail_msg_endpoint: "endpoint is not valid. Provide powerscale IP or hostname in secret.yaml file."
+fail_msg_endpoint_port: "endpointPort is not valid. Provide valid port number in secret.yaml file."
+fail_msg_isdefault: "isDefault value should be true or false in secret.yaml file."
+fail_msg_skip_certificate_validation: "skipCertificateValidation must be true in secret.yaml file."
+fail_msg_isipath: "isiPath must be a valid Unix absolute path in secret.yaml file."
+fail_msg_isi_volume_path_permissions: "isiVolumePathPermissions must be a valid directory permission (example: 0777) in secret.yaml file."
+fail_msg_api_call: "Please recheck powerscale username, password, endpoint and endpointPort details provided in secret.yaml and
+  values.yaml (if endpointPort is provided only in values.yaml) file. API call to powerscale was not successful"
diff --git a/scheduler/roles/install_benchmarks_tools/tasks/main.yml b/scheduler/roles/install_benchmarks_tools/tasks/main.yml
index d7aa591c9..c7ddacdd7 100644
--- a/scheduler/roles/install_benchmarks_tools/tasks/main.yml
+++ b/scheduler/roles/install_benchmarks_tools/tasks/main.yml
@@ -24,6 +24,7 @@

 - name: Validate share path for ucx and openmpi
   ansible.builtin.include_tasks: validate_share_path.yml
+  when: ucx_software_status or openmpi_software_status

 - name: Install the prerequisites for installing the ucx
   when:
diff --git a/scheduler/roles/install_benchmarks_tools/vars/redhat.yml b/scheduler/roles/install_benchmarks_tools/vars/redhat.yml
index 0125c0a65..74e19ec14 100644
--- a/scheduler/roles/install_benchmarks_tools/vars/redhat.yml
+++ b/scheduler/roles/install_benchmarks_tools/vars/redhat.yml
@@ -14,5 +14,7 @@
 ---
 # Usage: prerequisite_ucx.yml, prerequisite_openmpi.yml
-gcc_package_names: gcc-c++
+gcc_package_names:
+  - gcc-c++
+  - make
 clang_package: clang
diff --git a/scheduler/roles/k8s_amd/tasks/check_pre_requisite.yml b/scheduler/roles/k8s_amd/tasks/check_pre_requisite.yml
new file mode 100644
index 000000000..b64e335c9
--- /dev/null
+++ b/scheduler/roles/k8s_amd/tasks/check_pre_requisite.yml
@@ -0,0 +1,40 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Define variables
+  ansible.builtin.set_fact:
+    install_amd_plugin: false
+
+- name: Verify if the node has amd gpu accelerator
+  ansible.builtin.shell: >
+    set -o pipefail && \
+    lspci | grep "Display controller: Advanced Micro Devices, Inc. \[AMD/ATI\] \| Processing accelerators: Advanced Micro Devices, Inc. \[AMD/ATI\]"
+  register: lspci_status
+  changed_when: false
+  failed_when: false
+
+- name: Update amd gpu accelerator status
+  ansible.builtin.set_fact:
+    install_amd_plugin: true
+  when: lspci_status.stdout | length > 0
+
+- name: Set fact for localhost
+  ansible.builtin.set_fact:
+    is_amd_cluster: true
+  delegate_to: localhost
+  delegate_facts: true
+  when:
+    - install_amd_plugin is defined
+    - install_amd_plugin is true
diff --git a/upgrade/roles/backup_omniadb/templates/upgrade_config_template.j2 b/scheduler/roles/k8s_amd/tasks/main.yml
similarity index 73%
rename from upgrade/roles/backup_omniadb/templates/upgrade_config_template.j2
rename to scheduler/roles/k8s_amd/tasks/main.yml
index 43988659d..f8fd67497 100644
--- a/upgrade/roles/backup_omniadb/templates/upgrade_config_template.j2
+++ b/scheduler/roles/k8s_amd/tasks/main.yml
@@ -11,7 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-old_input_location: "{{ old_input_location }}"
-backup_location: "{{ backup_location }}"
-new_pxe_mapping_file: "{{ new_pxe_mapping_file_location }}"
-import_input_parameters_postgresdb_password: "{{ postgresdb_password }}"
\ No newline at end of file
+---
+
+- name: Install amd device plugin if cluster has at least one amd node
+  when: hostvars['127.0.0.1']['k8s_support']
+  block:
+    - name: Check pre-requisite for amd
+      ansible.builtin.include_tasks: check_pre_requisite.yml
diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/files/empty_certificate_template.yml b/scheduler/roles/k8s_csi_powerscale_plugin/files/empty_certificate_template.yml
new file mode 100644
index 000000000..670301195
--- /dev/null
+++ b/scheduler/roles/k8s_csi_powerscale_plugin/files/empty_certificate_template.yml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: isilon-certs-0
+  namespace: isilon
+type: Opaque
+data:
+  cert-0: ""
+
\ No newline at end of file
diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_certificate.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_certificate.yml
new file mode 100644
index 000000000..0a058067d
--- /dev/null
+++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_certificate.yml
@@ -0,0 +1,35 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+--- + +- name: Empty certificate creation + block: + - name: Copy empty certificate yaml file + ansible.builtin.copy: + dest: "{{ empty_certificate_path }}" + src: "{{ empty_certificate_template_path }}" + mode: "{{ permission_644 }}" + + - name: Apply the Secret YAML to Kubernetes + block: + - name: Create empty certificate secret + ansible.builtin.command: + cmd: "kubectl apply -f {{ empty_certificate_path }}" + register: result + changed_when: result.changed + + rescue: + - name: Empty certificate secret creation failure + ansible.builtin.fail: + msg: "{{ fail_msg_empty_certificate }}" diff --git a/upgrade/roles/uninstall_k8s_cluster/tasks/remove_docker_k8s.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_secret.yml similarity index 50% rename from upgrade/roles/uninstall_k8s_cluster/tasks/remove_docker_k8s.yml rename to scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_secret.yml index f0c131e35..ad6cf6d85 100644 --- a/upgrade/roles/uninstall_k8s_cluster/tasks/remove_docker_k8s.yml +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_config_secret.yml @@ -13,35 +13,20 @@ # limitations under the License. --- -- name: Remove docker images - ansible.builtin.command: docker image prune -f - changed_when: true +- name: Remove existing isilon-creds secret if already present in isilon namespace + ansible.builtin.command: kubectl delete secret isilon-creds -n {{ powerscale_ns }} failed_when: false - become: true + changed_when: false -- name: Restart docker service - ansible.builtin.systemd: - name: docker - state: restarted - enabled: true +- name: Create isilon-creds secret in isilon namespace + ansible.builtin.command: kubectl create secret generic isilon-creds -n {{ powerscale_ns }} --from-file=config="{{ csi_powerscale_secret_path }}" failed_when: false + register: apply_secret + changed_when: apply_secret.changed -- name: Uninstall docker packages - ansible.builtin.package: - name: "{{ docker_packages }}" - state: absent - -- name: Autoremove unneeded packages installed as dependencies - ansible.builtin.dnf: - autoremove: true - -- name: Remove docker repo file +# Remove the secret file +- name: Remove secret file ansible.builtin.file: - path: "{{ docker_repo_file }}" + path: "{{ csi_powerscale_secret_path }}" state: absent - -- name: Remove docker and containerd files from /var/lib - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ docker_del_files }}" + failed_when: false diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_image_pull.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_image_pull.yml new file mode 100644 index 000000000..9c51d52d5 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_image_pull.yml @@ -0,0 +1,34 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Set empty image list + ansible.builtin.set_fact: + csi_powerscale_image_versions: [] + +- name: Fetch and store image versions + ansible.builtin.set_fact: + csi_powerscale_image_versions: "{{ csi_powerscale_image_versions + [item.package + ':' + item.tag] }}" + loop: "{{ hostvars['localhost']['csi_driver_powerscale_packages_json']['csi_driver_powerscale']['cluster'] }}" + when: item.type == 'image' + +- name: Pull csi powerscale images + ansible.builtin.command: nerdctl pull {{ item }} + with_items: "{{ csi_powerscale_image_versions }}" + changed_when: true + failed_when: false + environment: + http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" + https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['admin_nic_ip'] }}" diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_install.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_install.yml new file mode 100644 index 000000000..da39443b0 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_install.yml @@ -0,0 +1,72 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Deploy external-snapshotter config CRDs + ansible.builtin.command: + cmd: "kubectl apply -f client/config/crd/" + chdir: "{{ csi_powerscale_path }}/csi-powerscale/external-snapshotter/" + register: install_result + failed_when: false + changed_when: install_result.changed + +- name: Deploy external-snapshotter snapshot-controller CRDs + ansible.builtin.command: + cmd: "kubectl apply -f deploy/kubernetes/snapshot-controller/" + chdir: "{{ csi_powerscale_path }}/csi-powerscale/external-snapshotter/" + register: install_result + failed_when: false + changed_when: install_result.changed + +- name: Execute CSI driver installation script with timeout of seconds {{ async_time }} + ansible.builtin.command: + cmd: "./csi-install.sh --namespace {{ isilon_ns }} --values {{ csi_powerscale_values_path }}" + chdir: "{{ csi_powerscale_path }}/{{ csi_powerscale_git | regex_replace('\\.tar\\.gz$', '') }}/dell-csi-helm-installer" + register: install_result + async: "{{ async_time }}" + poll: "{{ poll_time }}" + failed_when: false + changed_when: install_result.changed + +- name: Wait for csi pods to be in Running state + ansible.builtin.shell: > + set -o pipefail && \ + kubectl get pod -n {{ isilon_ns }} --no-headers | grep {{ powerscale_pod_indcator }} | grep -v "Running" + register: isilon_non_running_pods + failed_when: false + changed_when: false + until: isilon_non_running_pods.stdout_lines | length == 0 + retries: "{{ max_attempts }}" + delay: "{{ wait_time }}" + +- name: Verify csi driver installation + ansible.builtin.pause: + seconds: "{{ warning_wait_time }}" + prompt: "{{ fail_msg_csi_powerscale_driver }}" + when: isilon_non_running_pods.stdout_lines | length > 0 + +- name: Create powerscale storage class if deployment was successful + ansible.builtin.command: + cmd: "kubectl apply -f ps_storage_class.yml" + chdir: "{{ csi_powerscale_path }}" + register: sc_command_result + failed_when: false + changed_when: sc_command_result.changed + when: isilon_non_running_pods.stdout_lines | length == 0 + +- name: Remove ps_storage_class.yml file + ansible.builtin.file: + path: "{{ csi_powerscale_path }}/ps_storage_class.yml" + state: absent + force: true diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_prereq.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_prereq.yml new file mode 100644 index 000000000..eb04fae59 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/csi_powerscale_prereq.yml @@ -0,0 +1,174 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+
+# Check Kubernetes is deployed on cluster
+- name: Verify Kubernetes is deployed on cluster
+  ansible.builtin.command: kubectl get node
+  register: k8s_return_code
+  changed_when: false
+  failed_when: false
+
+- name: Fail if Kubernetes is not deployed
+  ansible.builtin.assert:
+    that:
+      - k8s_return_code.rc == 0
+    fail_msg: "{{ k8s_not_deployed }}"
+
+# Check if powerscale is already deployed
+- name: Verify powerscale is deployed on cluster
+  ansible.builtin.shell: >
+    set -o pipefail && \
+    kubectl get pod -n {{ powerscale_ns }} --no-headers | grep {{ powerscale_pod_indcator }}
+  register: powerscale_precheck
+  changed_when: false
+  failed_when: false
+
+- name: Set flag if powerscale is already deployed
+  ansible.builtin.set_fact:
+    powerscale_already_deployed: "{{ powerscale_precheck.rc == 0 }}"
+
+- name: Pause to notify powerscale already deployed
+  ansible.builtin.pause:
+    seconds: "{{ warning_wait_time }}"
+    prompt: "{{ warning_msg_already_deployed }}"
+  when: powerscale_already_deployed
+
+- name: Proceed with prereq if powerscale is not already deployed
+  when: not powerscale_already_deployed
+  block:
+    # Check helm is deployed on cluster
+    - name: Verify helm is deployed on cluster
+      ansible.builtin.command: helm
+      register: helm_return_code
+      changed_when: false
+      failed_when: false
+
+    - name: Fail if helm is not deployed
+      ansible.builtin.assert:
+        that:
+          - helm_return_code.rc == 0
+        fail_msg: "{{ helm_not_deployed }}"
+
+    - name: Remove /opt/omnia/csi-driver-powerscale directory if already present
+      ansible.builtin.file:
+        path: "{{ csi_powerscale_path }}"
+        state: absent
+
+    - name: Create csi-driver-powerscale directory under /opt/omnia
+      ansible.builtin.file:
+        path: "{{ csi_powerscale_path }}"
+        mode: "{{ permission_644 }}"
+        state: directory
+        owner: "{{ owner_value }}"
+        group: "{{ group_value }}"
+
+    - name: Check if secret file is encrypted
+      ansible.builtin.command: cat "{{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}"
+      changed_when: false
+      register: config_content
+      connection: local
+      delegate_to: localhost
+
+    - name: Decrypt secret file
+      ansible.builtin.command: >-
+        ansible-vault decrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}
+        --vault-password-file {{ role_path }}/files/{{ csi_powerscale_secret_vaultname }}
+      when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+      changed_when: true
+      connection: local
+      delegate_to: localhost
+
+    # Copy secret file to /opt/omnia
+    - name: Copy secret file
+      ansible.builtin.copy:
+        src: "{{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}"
+        dest: "{{ csi_powerscale_secret_path }}"
+        owner: "{{ owner_value }}"
+        group: "{{ group_value }}"
+        mode: "{{ permission_644 }}"
+
+    - name: Encrypt secret file
+      ansible.builtin.command: >-
+        ansible-vault encrypt {{ hostvars['localhost']['csi_powerscale_driver_secret_file_path'] }}
+        --vault-password-file {{ role_path }}/files/{{ csi_powerscale_secret_vaultname }}
+      changed_when: false
+      connection: local
+      delegate_to: localhost
+
+    # Copy values file to /opt/omnia
+    - name: Copy values file
+      ansible.builtin.copy:
+        src: "{{ hostvars['localhost']['csi_powerscale_driver_values_file_path'] }}"
+        dest: "{{ csi_powerscale_values_path }}"
+        owner: "{{ owner_value }}"
+        group: "{{ group_value }}"
+        mode: "{{ permission_644 }}"
+
+    - name: Get dependencies from local repo
+      block:
+        - name: Get csi-powerscale git tar
+          ansible.builtin.get_url:
+            url: "{{ hostvars['localhost']['offline_git_path'] }}/{{ csi_powerscale_git 
}}" + dest: "{{ csi_powerscale_path }}/{{ csi_powerscale_git }}" + mode: "{{ permission_644 }}" + + - name: Extract csi-powerscale tar file + ansible.builtin.unarchive: + src: "{{ csi_powerscale_path }}/{{ csi_powerscale_git }}" + dest: "{{ csi_powerscale_path }}" + remote_src: true + + - name: Get dell/helm-charts git tar + ansible.builtin.get_url: + url: "{{ hostvars['localhost']['offline_git_path'] }}/{{ helm_charts_git }}" + dest: "{{ csi_powerscale_path }}/csi-powerscale/{{ helm_charts_git }}" + mode: "{{ permission_644 }}" + + - name: Get external-snapshotter git tar + ansible.builtin.get_url: + url: "{{ hostvars['localhost']['offline_git_path'] }}/{{ external_snapshotter_git }}" + dest: "{{ csi_powerscale_path }}/csi-powerscale/{{ external_snapshotter_git }}" + mode: "{{ permission_644 }}" + rescue: + - name: Handle dependency failure + ansible.builtin.fail: + msg: "{{ fail_msg_download }}" + + - name: Extract dell/helm-charts tar file under csi-powerscale directory + ansible.builtin.unarchive: + src: "{{ csi_powerscale_path }}/csi-powerscale/{{ helm_charts_git }}" + dest: "{{ csi_powerscale_path }}/csi-powerscale" + remote_src: true + + - name: Extract external snapshotter tar file under csi-powerscale directory + ansible.builtin.unarchive: + src: "{{ csi_powerscale_path }}/csi-powerscale/{{ external_snapshotter_git }}" + dest: "{{ csi_powerscale_path }}/csi-powerscale" + remote_src: true + + - name: Transfer storage class template to kube_control_plane + ansible.builtin.template: + src: ps_storage_class.j2 + dest: "{{ csi_powerscale_path }}/ps_storage_class.yml" + owner: "{{ owner_value }}" + group: "{{ group_value }}" + mode: "{{ permission_644 }}" + + - name: Create isilon namespace + ansible.builtin.command: + cmd: "kubectl create ns isilon" + register: command_result + failed_when: false + changed_when: command_result.changed diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/tasks/main.yml b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/main.yml new file mode 100644 index 000000000..4599d71c0 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/tasks/main.yml @@ -0,0 +1,32 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: CSI powerscale driver installation + when: hostvars['localhost']['csi_driver_powerscale_precheck_pass'] + block: + - name: Fetch required files to kube control plane + ansible.builtin.include_tasks: csi_powerscale_prereq.yml + + - name: Deploy powerscale if not already deployed + when: not powerscale_already_deployed + block: + - name: Configure secret + ansible.builtin.include_tasks: csi_powerscale_config_secret.yml + + - name: Configure certificate + ansible.builtin.include_tasks: csi_powerscale_config_certificate.yml + + - name: Install powerscale driver + ansible.builtin.include_tasks: csi_powerscale_install.yml diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/templates/ps_storage_class.j2 b/scheduler/roles/k8s_csi_powerscale_plugin/templates/ps_storage_class.j2 new file mode 100644 index 000000000..a8158d410 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/templates/ps_storage_class.j2 @@ -0,0 +1,13 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: ps01 +provisioner: csi-isilon.dellemc.com +reclaimPolicy: Delete +allowVolumeExpansion: true +volumeBindingMode: Immediate +parameters: + AccessZone: {{ ps_access_zone }} + Isipath: {{ ps_isipath }} + RootClientEnabled: "true" + csi.storage.k8s.io/fstype: "nfs" diff --git a/scheduler/roles/k8s_csi_powerscale_plugin/vars/main.yml b/scheduler/roles/k8s_csi_powerscale_plugin/vars/main.yml new file mode 100644 index 000000000..3b37fcef4 --- /dev/null +++ b/scheduler/roles/k8s_csi_powerscale_plugin/vars/main.yml @@ -0,0 +1,59 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Usage: csi_powerscale_config_certificate.yml +empty_certificate_path: "{{ csi_powerscale_path }}/empty_isilon-certs.yaml" +fail_msg_empty_certificate: "Failed. Unable to create empty certificate." +empty_certificate_template_path: "{{ role_path }}/files/empty_certificate_template.yml" + +# Usage: csi_powerscale_config_secret.yml, csi_powerscale_prereq.yml +csi_powerscale_secret_path: "{{ csi_powerscale_path }}/csi_powerscale_secret.yaml" + +# Usage: csi_powerscale_install.yml, csi_powerscale_prereq.yml +csi_powerscale_path: "/opt/omnia/csi-driver-powerscale" + +# Usage: csi_powerscale_install.yml, csi_powerscale_prereq.yml +csi_powerscale_git: "csi-powerscale.tar.gz" + +# Usage: csi_powerscale_install.yml +fail_msg_csi_powerscale_driver: "Error. Deployment of csi driver was not successful. Please review the deployment. Run playbook with -vvv for more details" +pass_msg_csi_powerscale_driver: "CSI Powerscale driver installation completed successfully." 
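
The ps_storage_class.j2 template above renders a StorageClass named ps01 with the csi-isilon.dellemc.com provisioner; a hypothetical PersistentVolumeClaim consuming it might look like this sketch (the claim name, namespace, and size are illustrative only):

    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: demo-ps01-pvc
      namespace: default
    spec:
      accessModes:
        - ReadWriteMany            # shared access is typical for NFS-backed PowerScale volumes
      resources:
        requests:
          storage: 5Gi
      storageClassName: ps01       # StorageClass created by ps_storage_class.j2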
+wait_time: 10 +warning_wait_time: 30 +max_attempts: 5 +isilon_ns: "isilon" +async_time: 180 +poll_time: 10 + +# Usage: csi_powerscale_prereq.yml +permission_644: "0644" +owner_value: "root" +group_value: "root" +powerscale_ns: "isilon" +powerscale_pod_indcator: "isilon-" +csi_powerscale_values_path: "{{ csi_powerscale_path }}/values.yaml" +fail_msg_download: "Failed to get required dependencies. Make sure to verify entries in csi_driver_powerscale.json and run local_repo.yml first." +helm_charts_git: "helm-charts.tar.gz" +external_snapshotter_git: "external-snapshotter.tar.gz" +k8s_not_deployed: "Failed, Kubernetes is not deployed on the cluster. Run omnia.yml with k8s entry in software_config.json to install kubernetes first." +helm_not_deployed: "Failed, Helm is not deployed on the cluster." +csi_powerscale_secret_vaultname: ".csi_powerscale_secret_vault" +vault_key_permission: "0644" +warning_msg_already_deployed: "Powerscale will not be deployed. Existing powerscale deployment is already present on the cluster. + Please remove the existing powerscale deployment first using steps mentioned in omnia document and rerun playbook to install powerscale." + +# Usage: template ps_storage_class.j2 +ps_isipath: "{{ hostvars['localhost']['csi_powerscale_values_file']['isiPath'] }}" +ps_access_zone: "{{ hostvars['localhost']['csi_powerscale_values_file']['isiAccessZone'] }}" diff --git a/scheduler/roles/k8s_habana_container_runtime/tasks/change_containerd_runtime.yml b/scheduler/roles/k8s_habana_container_runtime/tasks/change_containerd_runtime.yml new file mode 100644 index 000000000..2bc396d67 --- /dev/null +++ b/scheduler/roles/k8s_habana_container_runtime/tasks/change_containerd_runtime.yml @@ -0,0 +1,66 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Uncomment habana-container-runtime config mount_accelerators line + ansible.builtin.lineinfile: + dest: "{{ habana_container_runtime_cfg_file_path }}" + regexp: '^#mount_accelerators = false' + line: 'mount_accelerators = false' + +- name: Uncomment habana-container-runtime config visible_devices_all_as_default line + ansible.builtin.lineinfile: + dest: "{{ habana_container_runtime_cfg_file_path }}" + regexp: '^#visible_devices_all_as_default = false' + line: 'visible_devices_all_as_default = false' + +- name: Replace the Default Runtime + ansible.builtin.replace: + path: "{{ containerd_cfg_file_path }}" + regexp: 'default_runtime_name.*=.*"runc"' + replace: 'default_runtime_name = "habana"' + +- name: Add habana-container-runtime Configuration Block 1 + ansible.builtin.blockinfile: + path: "{{ containerd_cfg_file_path }}" + marker: "# {mark} ANSIBLE MANAGED BLOCK 1" + insertafter: '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes\]' + block: | + {% filter indent(width=8, first=true) %} + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.habana] + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.habana.options] + BinaryName = "/usr/bin/habana-container-runtime" + systemdCgroup = true + {% endfilter %} + +- name: Add habana-container-runtime Configuration Block 2 + ansible.builtin.blockinfile: + path: "{{ containerd_cfg_file_path }}" + marker: "# {mark} ANSIBLE MANAGED BLOCK 2" + block: | + {% filter indent(width=2, first=true) %} + [plugins."io.containerd.runtime.v1.linux"] + runtime = "habana-container-runtime" + {% endfilter %} + +- name: Restart Containerd + ansible.builtin.service: + name: containerd + state: restarted + +- name: Restart Kubelet + ansible.builtin.service: + name: kubelet + state: restarted diff --git a/scheduler/roles/k8s_habana_container_runtime/tasks/check_prerequisite.yml b/scheduler/roles/k8s_habana_container_runtime/tasks/check_prerequisite.yml new file mode 100644 index 000000000..78ca80128 --- /dev/null +++ b/scheduler/roles/k8s_habana_container_runtime/tasks/check_prerequisite.yml @@ -0,0 +1,34 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Check hl-smi + ansible.builtin.command: hl-smi + register: hlsmi_run + changed_when: true + failed_when: false + +- name: Set fact when Intel software was detected + when: hlsmi_run.rc == 0 + ansible.builtin.set_fact: + accelerator_type: "intel" + +- name: Set fact for localhost + ansible.builtin.set_fact: + is_gaudi_cluster: true + delegate_to: localhost + delegate_facts: true + when: + - accelerator_type is defined + - accelerator_type == "intel" diff --git a/upgrade/roles/docker_registry_uninstall/tasks/main.yml b/scheduler/roles/k8s_habana_container_runtime/tasks/install_prerequisite_ubuntu.yml similarity index 73% rename from upgrade/roles/docker_registry_uninstall/tasks/main.yml rename to scheduler/roles/k8s_habana_container_runtime/tasks/install_prerequisite_ubuntu.yml index 9725967b5..566a68355 100644 --- a/upgrade/roles/docker_registry_uninstall/tasks/main.yml +++ b/scheduler/roles/k8s_habana_container_runtime/tasks/install_prerequisite_ubuntu.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Intel Corporation. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ # limitations under the License. --- -- name: Uninstall docker registry - ansible.builtin.import_tasks: docker_registry_uninstall.yml - tags: docker_registry +- name: Install habana container runtime package + ansible.builtin.apt: + name: "habanalabs-container-runtime={{ intelgaudi_version }}" + update_cache: true diff --git a/scheduler/roles/k8s_habana_container_runtime/tasks/main.yml b/scheduler/roles/k8s_habana_container_runtime/tasks/main.yml new file mode 100644 index 000000000..ff5fea234 --- /dev/null +++ b/scheduler/roles/k8s_habana_container_runtime/tasks/main.yml @@ -0,0 +1,39 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Tasks for k8s_habana_container_runtime role + when: hostvars['127.0.0.1']['k8s_support'] + block: + - name: Check for Intel Gaudi device on nodes + ansible.builtin.include_tasks: check_prerequisite.yml + + - name: Install Intel Gaudi container runtime on nodes + when: + - accelerator_type is defined + - accelerator_type == "intel" + - hostvars['localhost']['set_intel_config_status'] + block: + - name: Install prerequisite + ansible.builtin.include_tasks: install_prerequisite_ubuntu.yml + + - name: Change containerd runtime for Intel Gaudi k8s device plugin + ansible.builtin.include_tasks: change_containerd_runtime.yml + + - name: Set fact for nodes with Gaudi + ansible.builtin.set_fact: + node_has_gaudi: true + when: + - accelerator_type is defined + - accelerator_type == "intel" diff --git a/upgrade/roles/backup_omniadb/vars/main.yml b/scheduler/roles/k8s_habana_container_runtime/vars/main.yml similarity index 75% rename from upgrade/roles/backup_omniadb/vars/main.yml rename to scheduler/roles/k8s_habana_container_runtime/vars/main.yml index 498fca1e3..9241ccd05 100644 --- a/upgrade/roles/backup_omniadb/vars/main.yml +++ b/scheduler/roles/k8s_habana_container_runtime/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Intel Corporation. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,5 +13,5 @@ # limitations under the License. --- -# Usage:install_packages.yml -python_version: "python3.9" +containerd_cfg_file_path: "/etc/containerd/config.toml" +habana_container_runtime_cfg_file_path: "/etc/habana-container-runtime/config.toml" diff --git a/scheduler/roles/k8s_nvidia_container_toolkit/tasks/check_pre_requisite.yml b/scheduler/roles/k8s_nvidia_container_toolkit/tasks/check_pre_requisite.yml new file mode 100644 index 000000000..36b4a50fd --- /dev/null +++ b/scheduler/roles/k8s_nvidia_container_toolkit/tasks/check_pre_requisite.yml @@ -0,0 +1,40 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
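# Flow note: node_has_gaudi, set per node above, is read later by the
# k8s_start_services "Label nodes with Gaudi" task, which checks
# hostvars[item]['node_has_gaudi'] so that only Gaudi-bearing kube_node
# members receive the intel.com/gaudi.present=true label.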
+--- + +- name: Define variables + ansible.builtin.set_fact: + install_nvidia_container_toolkit: false + +- name: Check if an NVIDIA GPU is present + ansible.builtin.shell: > + set -o pipefail && \ + lspci | grep -i nvidia + register: lspci_status + changed_when: false + failed_when: false + +- name: Update nvidia container toolkit install status + ansible.builtin.set_fact: + install_nvidia_container_toolkit: true + when: "'NVIDIA' in lspci_status.stdout" + +- name: Set fact for localhost + ansible.builtin.set_fact: + is_nvidia_cluster: true + delegate_to: localhost + delegate_facts: true + when: + - install_nvidia_container_toolkit is defined + - install_nvidia_container_toolkit is true diff --git a/scheduler/roles/k8s_nvidia_container_toolkit/tasks/install_nvidia_container_toolkit.yml b/scheduler/roles/k8s_nvidia_container_toolkit/tasks/install_nvidia_container_toolkit.yml new file mode 100644 index 000000000..8a73ff7dd --- /dev/null +++ b/scheduler/roles/k8s_nvidia_container_toolkit/tasks/install_nvidia_container_toolkit.yml @@ -0,0 +1,28 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Install nvidia-container-toolkit + ansible.builtin.package: + name: nvidia-container-toolkit + state: present + +- name: Configure nvidia-container-toolkit runtime + ansible.builtin.command: nvidia-ctk runtime configure --runtime=containerd + changed_when: true + +- name: Restart containerd + ansible.builtin.systemd: + name: containerd + state: restarted diff --git a/scheduler/roles/k8s_nvidia_container_toolkit/tasks/main.yml b/scheduler/roles/k8s_nvidia_container_toolkit/tasks/main.yml new file mode 100644 index 000000000..ce4d44bbc --- /dev/null +++ b/scheduler/roles/k8s_nvidia_container_toolkit/tasks/main.yml @@ -0,0 +1,24 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
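# nvidia-ctk rewrites /etc/containerd/config.toml to register an "nvidia"
# runtime backed by nvidia-container-runtime, and the containerd restart above
# makes that registration take effect. A minimal post-check (an illustrative
# sketch under that assumption, not part of this role):
- name: Verify nvidia runtime is registered with containerd (illustrative sketch)
  ansible.builtin.command: grep 'nvidia-container-runtime' /etc/containerd/config.toml
  changed_when: false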
+--- + +- name: Install nvidia container toolkit if k8s_support is true + when: hostvars['127.0.0.1']['k8s_support'] + block: + - name: Check pre-requisite for nvidia container toolkit + ansible.builtin.include_tasks: check_pre_requisite.yml + + - name: Install and configure nvidia container toolkit + ansible.builtin.include_tasks: install_nvidia_container_toolkit.yml + when: install_nvidia_container_toolkit diff --git a/scheduler/roles/k8s_prepare_nodes/tasks/prepare_hosts_file.yml b/scheduler/roles/k8s_prepare_nodes/tasks/prepare_hosts_file.yml index b4f9d5b3a..3886682ad 100644 --- a/scheduler/roles/k8s_prepare_nodes/tasks/prepare_hosts_file.yml +++ b/scheduler/roles/k8s_prepare_nodes/tasks/prepare_hosts_file.yml @@ -23,6 +23,11 @@ register: short_hostname changed_when: false +- name: Set 'ip' variable for kube_control_plane, kube_node, and etcd groups + ansible.builtin.set_fact: + ip: "{{ ansible_host | default(inventory_hostname) }}" + when: "'kube_control_plane' in groups or 'kube_node' in groups or 'etcd' in groups" + - name: Set facts for node hostname and ip ansible.builtin.set_fact: node_ip: "{{ ansible_host }}" diff --git a/scheduler/roles/k8s_prepare_services/tasks/download_service_images.yml b/scheduler/roles/k8s_prepare_services/tasks/download_service_images.yml index eeef80787..e8a6dfe46 100644 --- a/scheduler/roles/k8s_prepare_services/tasks/download_service_images.yml +++ b/scheduler/roles/k8s_prepare_services/tasks/download_service_images.yml @@ -31,3 +31,4 @@ environment: http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['admin_nic_ip'] }}" diff --git a/scheduler/roles/k8s_roce_deploy/tasks/apply_network_services.yml b/scheduler/roles/k8s_roce_deploy/tasks/apply_network_services.yml index 9836d62f2..c9b15e93d 100644 --- a/scheduler/roles/k8s_roce_deploy/tasks/apply_network_services.yml +++ b/scheduler/roles/k8s_roce_deploy/tasks/apply_network_services.yml @@ -48,7 +48,7 @@ - "item.commit is defined" - name: Checkout specific commit for whereabouts plugin - ansible.builtin.command: "git checkout {{ whereabouts_commit_id }}" # noqa: command-instead-of-module + ansible.builtin.command: "git checkout -f {{ whereabouts_commit_id }}" # noqa: command-instead-of-module when: whereabouts_plugin_commit changed_when: false args: @@ -95,7 +95,7 @@ - "item.commit is defined" - name: Checkout specific commit for rdma plugin - ansible.builtin.command: "git checkout {{ rdma_commit_id }}" # noqa: command-instead-of-module + ansible.builtin.command: "git checkout -f {{ rdma_commit_id }}" # noqa: command-instead-of-module changed_when: false when: rdma_plugin_commit args: diff --git a/scheduler/roles/k8s_roce_deploy/tasks/pull_images.yml b/scheduler/roles/k8s_roce_deploy/tasks/pull_images.yml index 1e2239f90..97cf0d9d7 100644 --- a/scheduler/roles/k8s_roce_deploy/tasks/pull_images.yml +++ b/scheduler/roles/k8s_roce_deploy/tasks/pull_images.yml @@ -31,3 +31,4 @@ environment: http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['admin_nic_ip'] }}" diff --git a/scheduler/roles/k8s_roce_deploy/tasks/validate_roce_plugin_config.yml b/scheduler/roles/k8s_roce_deploy/tasks/validate_roce_plugin_config.yml index 2e7d3717e..006ac9241 100644 --- a/scheduler/roles/k8s_roce_deploy/tasks/validate_roce_plugin_config.yml 
+++ b/scheduler/roles/k8s_roce_deploy/tasks/validate_roce_plugin_config.yml @@ -19,7 +19,7 @@ - name: Ensure all interfaces have name defined ansible.builtin.assert: that: - - interfaces | map(attribute='name') | select('defined') | list | length == interfaces | length + - interfaces | selectattr('name', 'defined') | selectattr('name', 'string') | list | length == interfaces | length fail_msg: "{{ name_definition_fail_msg }}" success_msg: "{{ name_definition_success_msg }}" @@ -33,7 +33,7 @@ - name: Ensure all interfaces have range defined ansible.builtin.assert: that: - - interfaces | map(attribute='range') | select('defined') | list | length == interfaces | length + - interfaces | selectattr('range', 'defined') | selectattr('range', 'string') | list | length == interfaces | length fail_msg: "{{ range_definition_fail_msg }}" success_msg: "{{ range_definition_success_msg }}" diff --git a/scheduler/roles/k8s_start_services/files/k8s-dashboard-loadbalancer.yml b/scheduler/roles/k8s_start_services/files/k8s-dashboard-loadbalancer.yml index f48a663b0..f4b77eafe 100644 --- a/scheduler/roles/k8s_start_services/files/k8s-dashboard-loadbalancer.yml +++ b/scheduler/roles/k8s_start_services/files/k8s-dashboard-loadbalancer.yml @@ -11,4 +11,4 @@ spec: protocol: TCP targetPort: 8443 selector: - k8s-app: kubernetes-dashboard \ No newline at end of file + k8s-app: kubernetes-dashboard diff --git a/scheduler/roles/k8s_start_services/tasks/deploy_k8s_services.yml b/scheduler/roles/k8s_start_services/tasks/deploy_k8s_services.yml index 1cebb15f3..6649014c0 100644 --- a/scheduler/roles/k8s_start_services/tasks/deploy_k8s_services.yml +++ b/scheduler/roles/k8s_start_services/tasks/deploy_k8s_services.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -26,6 +26,12 @@ register: k8s_pods tags: init +# Get deployed daemonset list +- name: Get K8s daemonset + ansible.builtin.command: kubectl get ds --all-namespaces + changed_when: false + register: k8s_daemonset + - name: Create directory for temp k8s files ansible.builtin.file: path: "{{ k8s_tmp_dir }}" @@ -53,37 +59,92 @@ ansible.builtin.command: "kubectl apply -f {{ k8s_dashboard_loadbalancer_file_dest }}" changed_when: true -# NVIDIA PLUGIN -- name: Copy nvidia_runtime_class.yml file - ansible.builtin.copy: - src: "{{ nvidia_runtime_source }}" - dest: "{{ nvidia_runtime_dest }}" - mode: "{{ file_mode }}" - -- name: Create RuntimeClass for nvidia-plugin - ansible.builtin.command: "kubectl apply -f {{ nvidia_runtime_dest }}" - changed_when: true +- name: Fetch helm path + block: + - name: Fetch helm path + ansible.builtin.command: whereis helm + changed_when: false + register: helm_full_path + rescue: + - name: Helm not installed + ansible.builtin.fail: + msg: "{{ helm_install_fail_msg }}" -- name: Install nvidia-device-plugin - ansible.builtin.command: > - helm install --namespace nvidia-device-plugin --create-namespace - --generate-name --set runtimeClassName=nvidia --set migStrategy='{{ mig_strategy }}' --set gfd.enabled='{{ gpu_discovery_feature }}' - '{{ nvidia_device_plugin_repo }}' - changed_when: true +# NVIDIA PLUGIN +- name: Deploy Nvidia Device plugin when: - "'nvidia-device-plugin' not in k8s_pods.stdout" + - hostvars['localhost']['is_nvidia_cluster'] is defined + block: + - name: Copy nvidia_runtime_class.yml file + ansible.builtin.copy: + src: "{{ nvidia_runtime_source }}" + dest: "{{ nvidia_runtime_dest }}" + mode: "{{ file_mode }}" + + - name: Create RuntimeClass for nvidia-plugin + ansible.builtin.command: "kubectl apply -f {{ nvidia_runtime_dest }}" + changed_when: true + + - name: Install nvidia-device-plugin + ansible.builtin.command: > + {{ helm_full_path.stdout.split(' ')[1] }} install --namespace nvidia-device-plugin --create-namespace + --generate-name --set runtimeClassName=nvidia --set migStrategy='{{ mig_strategy }}' --set gfd.enabled='{{ gpu_discovery_feature }}' + '{{ nvidia_device_plugin_repo }}' + changed_when: true # ROCM PLUGIN - name: Deploy ROCm Device plugin - ansible.builtin.command: "kubectl create -f '{{ rocm_device_plugin_yaml_url }}'" + when: + - "'amdgpu-device-plugin-daemonset' not in k8s_pods.stdout" + - hostvars['localhost']['is_amd_cluster'] is defined + block: + - name: Deploy ROCm Device plugin + ansible.builtin.command: "kubectl create -f '{{ rocm_device_plugin_yaml_url }}'" + changed_when: true + + - name: Patch DaemonSet with updated imagePullPolicy + ansible.builtin.command: > + kubectl patch daemonset amdgpu-device-plugin-daemonset -n kube-system --type='json' + -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/imagePullPolicy", "value":"IfNotPresent"}]' + changed_when: true + +# LABEL NODES WITH GAUDI +- name: Label nodes with Gaudi + ansible.builtin.command: "kubectl label nodes {{ item }} {{ node_label_for_habana_device_plugin }}=true --overwrite" changed_when: true - when: "'amdgpu-device-plugin-daemonset' not in k8s_pods.stdout" + when: + - hostvars['localhost']['is_gaudi_cluster'] is defined + - hostvars[item]['node_has_gaudi'] | default(false) + loop: "{{ groups['kube_node'] | default([]) }}" -- name: Patch DaemonSet with updated imagePullPolicy - ansible.builtin.command: > - kubectl patch daemonset amdgpu-device-plugin-daemonset -n kube-system --type='json' - -p='[{"op": "replace", "path": 
"/spec/template/spec/containers/0/imagePullPolicy", "value":"IfNotPresent"}]' - changed_when: true +# HABANA PLUGIN +- name: Deploy Habana Device plugin + when: + - "'habanalabs-device-plugin-daemonset' not in k8s_daemonset.stdout" + - k8s_version >= minimal_gaudi_k8s_version + - hostvars['localhost']['is_gaudi_cluster'] is defined + block: + - name: Download habana-device-plugin yaml file + ansible.builtin.uri: + url: "{{ habana_device_plugin_yaml_url }}" + return_content: true + register: habana_device_plugin_yaml_content + + - name: Modify habana-device-plugin yaml file + ansible.builtin.set_fact: + with_node_selector: | + {{ habana_device_plugin_yaml_content.content }} + nodeSelector: + {{ node_label_for_habana_device_plugin }}: 'true' + + - name: Install habana-device-plugin + ansible.builtin.shell: | + set -o pipefail + echo "{{ with_node_selector }}" | kubectl create -f - + changed_when: true + args: + executable: /bin/bash # MPI OPERATOR - name: Install MPI Operator @@ -100,7 +161,7 @@ # NFS CLIENT PROVISIONER - name: Start NFS Client Provisioner using NFS on manager node ansible.builtin.command: > - helm install nfs-omnia '{{ nfs_subdir_external_provisioner_repo }}' + {{ helm_full_path.stdout.split(' ')[1] }} install nfs-omnia '{{ nfs_subdir_external_provisioner_repo }}' --set nfs.server='{{ hostvars['127.0.0.1']['k8s_nfs_server_ip'] }}' --set nfs.path="{{ hostvars['127.0.0.1']['k8s_server_share_path'] }}" changed_when: true diff --git a/scheduler/roles/k8s_start_services/vars/main.yml b/scheduler/roles/k8s_start_services/vars/main.yml index 054d6272c..50643f9e8 100644 --- a/scheduler/roles/k8s_start_services/vars/main.yml +++ b/scheduler/roles/k8s_start_services/vars/main.yml @@ -13,6 +13,8 @@ # limitations under the License. --- docker_login_fail_msg: "Docker login failed! Please check the credentials and re-execute playbook." 
+k8s_version: "{{ hostvars['localhost']['k8s_version'] }}" +minimal_gaudi_k8s_version: "1.27" k8s_images: - registry.k8s.io/nfd/node-feature-discovery:v0.12.1 - docker.io/rocm/k8s-device-plugin:latest @@ -26,7 +28,7 @@ file_mode: "0655" k8s_dashboard_loadbalancer_file_source: "k8s-dashboard-loadbalancer.yml" k8s_dashboard_loadbalancer_file_dest: "{{ k8s_tmp_dir }}/k8s_dashboard_load_balancer.yaml" nfs_server_manager_node: "{{ ansible_host }}" -mpi_operator_yaml_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/mpi-operator.yaml" +mpi_operator_yaml_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/{{ hostvars['localhost']['mpi_operator'] }}.yaml" nvidia_runtime_source: "nvidia_runtime_class.yml" nvidia_runtime_dest: "{{ k8s_tmp_dir }}/nvidia-runtime_class.yml" nvidia_device_plugin_repo: "{{ hostvars['localhost']['offline_tarball_path'] }}/nvidia-device-plugin.tar.gz" @@ -35,6 +37,10 @@ gpu_discovery_feature: true xilinx_device_plugin_yaml_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/xilinx-device-plugin.yaml" spark_operator_repo: "{{ hostvars['localhost']['offline_tarball_path'] }}/spark-operator-v1beta2-1.3.8-3.1.1.tar.gz" rocm_device_plugin_yaml_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/rocm-device-plugin.yaml" +habana_device_plugin_yaml_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/habana-device-plugin.yaml" +node_label_for_habana_device_plugin: "intel.com/gaudi.present" nfs_dir_mode: "0777" pod_wait_time: 300 nfs_subdir_external_provisioner_repo: "{{ hostvars['localhost']['offline_tarball_path'] }}/nfs-subdir-external-provisioner-4.0.18.tar.gz" +helm_install_fail_msg: "Failed to fetch helm path. Please verify helm installation and ensure the environment variable PATH is correctly set in the node. +Re-run the playbook after verification." diff --git a/scheduler/roles/slurm_common/tasks/update_log_files.yml b/scheduler/roles/slurm_common/tasks/update_log_files.yml index f865fb12f..71ba00286 100644 --- a/scheduler/roles/slurm_common/tasks/update_log_files.yml +++ b/scheduler/roles/slurm_common/tasks/update_log_files.yml @@ -26,6 +26,10 @@ uid: "{{ slurm_uid }}" group: slurm tags: install + retries: "{{ retry_count }}" + delay: "{{ time_delay }}" + until: result is succeeded # Retry until the task succeeds + register: result # Register the result for checking - name: Create slurm log directory ansible.builtin.file: diff --git a/scheduler/roles/slurm_common/vars/main.yml b/scheduler/roles/slurm_common/vars/main.yml index ea1d5d034..d9257169c 100644 --- a/scheduler/roles/slurm_common/vars/main.yml +++ b/scheduler/roles/slurm_common/vars/main.yml @@ -100,3 +100,5 @@ lmod_packages: grafana_conf_path: "/opt/omnia/.data/grafana_svc_details.ini" min_length_grafana: 5 dnf_conf_path: /etc/dnf/dnf.conf +time_delay: 10 +retry_count: 5 diff --git a/scheduler/scheduler.yml b/scheduler/scheduler.yml index 91151c6e8..f553950dd 100644 --- a/scheduler/scheduler.yml +++ b/scheduler/scheduler.yml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- - - name: Update Inventory with ansible_host information ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) @@ -60,6 +59,24 @@ - name: Add nodes to kubernetes cluster ansible.builtin.import_playbook: "{{ playbook_dir }}/playbooks/k8s_add_node.yml" +- name: Prepare habana container runtime + hosts: kube_node + gather_facts: false + roles: + - k8s_habana_container_runtime + +- name: Install nvidia-container-toolkit on nodes with NVIDIA GPU + hosts: kube_control_plane, kube_node, etcd + gather_facts: false + roles: + - k8s_nvidia_container_toolkit + +- name: Install AMD GPU plugin when at least one node has an AMD GPU + hosts: kube_control_plane, kube_node, etcd + gather_facts: false + roles: + - k8s_amd + - name: Prepare kube control plane and kube nodes for kubernetes services installations hosts: kube_control_plane, kube_node gather_facts: false @@ -72,6 +89,21 @@ roles: - k8s_start_services +- name: Pull CSI PowerScale images + hosts: kube_node, kube_control_plane + tasks: + - name: Pull images + ansible.builtin.include_role: + name: k8s_csi_powerscale_plugin + tasks_from: csi_powerscale_image_pull.yml + when: hostvars['127.0.0.1']['csi_driver_powerscale_precheck_pass'] | default(false) | bool + +- name: Install CSI PowerScale plugin on kube control plane nodes + hosts: kube_control_plane + gather_facts: false + roles: + - k8s_csi_powerscale_plugin + - name: Apply common Slurm installation and config hosts: slurm_control_node, slurm_node, login gather_facts: false diff --git a/scheduler/slurm_exporter.yml b/scheduler/slurm_exporter.yml index 4f268fbcb..37bc68a8a 100644 --- a/scheduler/slurm_exporter.yml +++ b/scheduler/slurm_exporter.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + # This playbook is used to configure slurm job based user access in compute nodes # The inventory queried in the below command is to be created by the user prior to running `omnia.yml`. # Command to execute: ansible-playbook install_slurm_exporter.yml -i inventory diff --git a/scheduler/slurm_restd.yml b/scheduler/slurm_restd.yml index c65ee5540..cf3075850 100644 --- a/scheduler/slurm_restd.yml +++ b/scheduler/slurm_restd.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + # This playbook is used to configure slurm job based user access in compute nodes # The inventory queried in the below command is to be created by the user prior to running `omnia.yml`.
# Command to execute: ansible-playbook install_slurm_restd.yml -i inventory diff --git a/security/ansible.cfg b/security/ansible.cfg index 573afaa5f..39839be85 100644 --- a/security/ansible.cfg +++ b/security/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/security/readme.rst b/security/readme.rst index bf0a87360..0cf9851d0 100644 --- a/security/readme.rst +++ b/security/readme.rst @@ -1,8 +1,7 @@ Security ========= -The security feature allows users to set up FreeIPA and OpenLDAP to help authenticate into HPC clusters. - +The security role allows users to set up FreeIPA and LDAP to help authenticate into HPC clusters. .. note:: * Nodes provisioned using the Omnia provision tool do not require a RedHat subscription to run ``security.yml`` on RHEL target nodes. @@ -146,6 +145,9 @@ ________________________ Manager and compute nodes will have LDAP client installed and configured if ``ldap_required`` is set to true. The login node does not have LDAP client installed. +.. caution:: No users/groups will be created by Omnia. + + **Running the security role** Run: :: diff --git a/security/roles/ldap_server/tasks/ldap_prereq_redhat.yml b/security/roles/ldap_server/tasks/ldap_prereq_redhat.yml index 972f9a3ac..b005d46d0 100644 --- a/security/roles/ldap_server/tasks/ldap_prereq_redhat.yml +++ b/security/roles/ldap_server/tasks/ldap_prereq_redhat.yml @@ -41,7 +41,7 @@ no_log: true when: not openldap_status - - name: Copy Certificate to control plane for client setup + - name: Copy Certificate to Omnia Infrastructure Manager for client setup ansible.builtin.fetch: src: "{{ tls_certificates_directory_path }}/{{ rhel_cert_file }}" dest: "{{ tls_cert_path }}" @@ -69,7 +69,7 @@ - "{{ hostvars['127.0.0.1']['tls_certificate_key'] }}" when: not openldap_status - - name: Copy Certificate to control plane for client setup + - name: Copy Certificate to Omnia Infrastructure Manager for client setup ansible.builtin.copy: src: "{{ hostvars['127.0.0.1']['tls_ca_certificate'] }}" dest: "{{ tls_cert_path }}" diff --git a/security/roles/ldap_server/tasks/ldap_prereq_ubuntu.yml b/security/roles/ldap_server/tasks/ldap_prereq_ubuntu.yml index 561918e6e..681cc93eb 100644 --- a/security/roles/ldap_server/tasks/ldap_prereq_ubuntu.yml +++ b/security/roles/ldap_server/tasks/ldap_prereq_ubuntu.yml @@ -38,7 +38,7 @@ - "{{ ubuntu_cert_pkg }}" when: not openldap_status - - name: Copy Certificate to control plane for client setup + - name: Copy Certificate to Omnia Infrastructure Manager for client setup ansible.builtin.fetch: src: "{{ tls_certificates_directory_path }}/{{ deb_ca_cert_file }}" dest: "{{ tls_cert_path }}" @@ -66,7 +66,7 @@ - { file: "{{ hostvars['127.0.0.1']['tls_certificate_key'] }}", directory: private } when: not openldap_status - - name: Copy Certificate to control plane for client setup + - name: Copy Certificate to Omnia Infrastructure Manager for client setup ansible.builtin.copy: src: "{{ hostvars['127.0.0.1']['tls_ca_certificate'] }}" dest: "{{ tls_cert_path }}" diff --git a/security/roles/secure_login_node/files/auth_failure_check.yml b/security/roles/secure_login_node/files/auth_failure_check.yml index 88f689da8..1ebd5eeb8 100644 --- 
a/security/roles/secure_login_node/files/auth_failure_check.yml +++ b/security/roles/secure_login_node/files/auth_failure_check.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,14 +23,35 @@ auth_failure_info_file: "{{ alert_file_path }}/auth_failure_{{ ansible_date_time.iso8601_basic_short }}.txt" auth_failure_mail_subject: "Alert - Authentication Failure" auth_failure_mail_body: "Attached the authentication failure report" - auth_failure_mail_sender: omnia-alert file_mode: 644 + ubuntu_os: "ubuntu" + redhat_os: "redhat" + rocky_os: "rocky" tasks: - - name: Check auth failure in last {{ auth_failure_check_time }} minutes - ansible.builtin.shell: journalctl -u sshd --since "{{ auth_failure_check_time }} minutes ago" | grep "{{ auth_failure_search_key }}" + - name: Set ssh_service_name for RHEL/Rocky + ansible.builtin.set_fact: + ssh_service_name: sshd + when: ansible_distribution | lower in [redhat_os, rocky_os] + + - name: Set ssh_service_name for Ubuntu + ansible.builtin.set_fact: + ssh_service_name: ssh + when: ansible_distribution | lower in ubuntu_os + + - name: Set alert_email_list + ansible.builtin.set_fact: + alert_email_list: "{{ alert_email_address.split(',') }}" + + - name: Check auth failure in last {{ auth_failure_check_time }} minutes # noqa name[template] + ansible.builtin.shell: + cmd: | + set -o pipefail + journalctl -u {{ ssh_service_name }} --since "{{ auth_failure_check_time }} minutes ago" | grep "{{ auth_failure_search_key }}" changed_when: false failed_when: false register: auth_failure_check + args: + executable: /bin/bash - name: Create alerting log directory ansible.builtin.file: @@ -50,10 +71,11 @@ community.general.mail: subject: "{{ auth_failure_mail_subject }}" body: "{{ auth_failure_mail_body }}" - sender: "{{ auth_failure_mail_sender }}" - to: "{{ alert_email_address }}" + sender: "{{ smtp_sender_address }}" + to: "{{ item }}" attach: - "{{ auth_failure_info_file }}" + with_items: "{{ alert_email_list }}" when: auth_failure_search_key in auth_failure_check.stdout - name: Delete the authentication failure info file diff --git a/security/roles/secure_login_node/tasks/configure_alerting.yml b/security/roles/secure_login_node/tasks/configure_alerting.yml index 0f972a1f0..ec42277c1 100644 --- a/security/roles/secure_login_node/tasks/configure_alerting.yml +++ b/security/roles/secure_login_node/tasks/configure_alerting.yml @@ -16,10 +16,28 @@ - name: Include local_repo_access.yml file ansible.builtin.include_vars: "{{ local_repo_access_path }}" -- name: Install mailx and postfix +- name: Test reachability of SMTP server + ansible.builtin.command: ping -c3 {{ hostvars['localhost']['smtp_server'][0]['host'] }} + changed_when: false + failed_when: false + register: smtp_server_reachability + +- name: Verify SMTP server reachability + ansible.builtin.fail: + msg: "{{ smtp_server_reach_msg }}" + when: ping_msg in smtp_server_reachability.stdout + +- name: Install mailx and postfix for RHEL/Rocky + ansible.builtin.package: + name: "{{ mail_packages_redhat }}" + state: present + when: ansible_distribution | lower in [redhat_os, rocky_os] + +- name: Install mailx and postfix for Ubuntu ansible.builtin.package: - name: "{{ mail_packages }}" + name: "{{ mail_packages_ubuntu }}" state: present + when: ansible_distribution | 
lower == ubuntu_os - name: Start postfix service ansible.builtin.systemd: @@ -27,5 +45,43 @@ state: started enabled: true +- name: Configure postfix for smtp server + ansible.builtin.lineinfile: + path: "{{ postfix_conf_path }}" + line: "{{ item.line }}" + regexp: "{{ item.regexp }}" + state: present + register: postfix_config + with_items: + - { regexp: "^relayhost", line: "relayhost = {{ hostvars['localhost']['smtp_server'][0]['host'] }}:{{ hostvars['localhost']['smtp_server'][0]['port'] }}" } + - { regexp: "^myorigin", line: "myorigin = {{ hostvars['localhost']['smtp_server'][0]['sender_address'] }}" } + +- name: Restart postfix service + ansible.builtin.systemd: + name: postfix + state: restarted + enabled: true + when: postfix_config.results[0]['changed'] or postfix_config.results[1]['changed'] + +- name: Load software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_json_file }}" + name: software_config + +# Set os_version and os_type from software_config.json file +- name: Set facts for cluster + ansible.builtin.set_fact: + cluster_os_version: "{{ software_config.cluster_os_version }}" + cluster_os_type: "{{ software_config.cluster_os_type }}" + +- name: Load secure_login_node.json + ansible.builtin.set_fact: + secure_login_node_json: "{{ lookup('file', secure_login_node_packages_file) | from_json }}" + +# Extract python version from json file +- name: Extract python version from secure_login_node.json + ansible.builtin.set_fact: + python_version: "{{ secure_login_node_json['secure_login_node']['cluster'] | selectattr('type', 'in', ['deb', 'rpm']) | selectattr('package', 'search', 'python') | map(attribute='package') | first }}" # noqa: yaml[line-length] + - name: Install packages for os ansible.builtin.include_tasks: configure_alerting_{{ ansible_distribution | lower }}.yml diff --git a/security/roles/secure_login_node/tasks/configure_alerting_redhat.yml b/security/roles/secure_login_node/tasks/configure_alerting_redhat.yml index 0c5c63e29..ca3088f09 100644 --- a/security/roles/secure_login_node/tasks/configure_alerting_redhat.yml +++ b/security/roles/secure_login_node/tasks/configure_alerting_redhat.yml @@ -12,25 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License.
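# With the relayhost/myorigin rewrite in configure_alerting.yml above,
# /etc/postfix/main.cf ends up carrying entries of the following shape (host,
# port, and sender values are illustrative placeholders drawn from
# input/login_node_security_config.yml):
#   relayhost = smtp.example.com:25
#   myorigin = omnia-alerts@example.com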
--- +- name: Updating repos and installing python package + block: + - name: Update repos in RHEL/Rocky + ansible.builtin.dnf: + update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" + + - name: Install {{ python_version }} + ansible.builtin.dnf: + name: "{{ python_version }}" + state: present + rescue: + - name: Failed to update repos + ansible.builtin.fail: + msg: "{{ update_repos_fail_msg }}" - name: Install packages environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" block: - - name: Install python3.9 - ansible.builtin.package: - name: "python3.9" - state: present + + - name: Install pip for {{ python_version }} + ansible.builtin.command: + cmd: "{{ python_version }} -m ensurepip --upgrade" + changed_when: false - name: Install required ansible packages ansible.builtin.command: - cmd: "python3.9 -m pip install {{ ansible_name }}=={{ secure_ansible_version }} {{ cryptography_name }}=={{ cryptography_version }}" + cmd: "{{ python_version }} -m pip install {{ ansible_name }}=={{ secure_ansible_version }} {{ cryptography_name }}=={{ cryptography_version }}" changed_when: false - name: Install required ansible jinja packages ansible.builtin.command: - cmd: "python3.9 -m pip install {{ jinja_name }}=={{ jinja_version }}" + cmd: "{{ python_version }} -m pip install {{ jinja_name }}=={{ jinja_version }}" changed_when: false - name: Install community_general_collection repo tarball @@ -72,4 +91,4 @@ ansible.builtin.cron: name: Auth failure alerting special_time: hourly - job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} {{ alerting_file_path }} -e 'alert_email_address={{ hostvars['127.0.0.1']['alert_email_address'] }}'" + job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} {{ alerting_file_path }} -e 'alert_email_address={{ hostvars['localhost']['alert_email_address'] }}' -e 'smtp_sender_address={{ hostvars['localhost']['smtp_server'][0]['sender_address'] }}'" # noqa: yaml[line-length] diff --git a/security/roles/secure_login_node/tasks/configure_alerting_ubuntu.yml b/security/roles/secure_login_node/tasks/configure_alerting_ubuntu.yml index 3843bfeff..1faefe9a8 100644 --- a/security/roles/secure_login_node/tasks/configure_alerting_ubuntu.yml +++ b/security/roles/secure_login_node/tasks/configure_alerting_ubuntu.yml @@ -12,25 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
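# Note on the ensurepip step used in both alerting task files: "python -m
# ensurepip --upgrade" bootstraps pip from the wheel bundled with the
# interpreter itself, so the freshly installed {{ python_version }} gets a
# working pip without depending on a distro python-pip package being present.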
--- +- name: Updating repos and installing python package + block: + - name: Update repos in Ubuntu + ansible.builtin.apt: + update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" + + - name: Install python packages + ansible.builtin.apt: + name: "{{ python_packages }}" + state: present + rescue: + - name: Failed to update repos + ansible.builtin.fail: + msg: "{{ update_repos_fail_msg }}" - name: Install packages environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" block: - - name: Install python3.9 - ansible.builtin.package: - name: "{{ python_packages }}" - state: present + - name: Install pip for {{ python_version }} + ansible.builtin.command: + cmd: "{{ python_version }} -m ensurepip --upgrade" + changed_when: false - name: Install required ansible packages ansible.builtin.command: - cmd: "python3.9 -m pip install {{ ansible_name }}=={{ secure_ansible_version }} {{ cryptography_name }}=={{ cryptography_version }}" + cmd: "{{ python_version }} -m pip install {{ ansible_name }}=={{ secure_ansible_version }} {{ cryptography_name }}=={{ cryptography_version }}" changed_when: false - name: Install required ansible jinja packages ansible.builtin.command: - cmd: "python3.9 -m pip install {{ jinja_name }}=={{ jinja_version }}" + cmd: "{{ python_version }} -m pip install {{ jinja_name }}=={{ jinja_version }}" changed_when: false - name: Install community_general_collection repo tarball @@ -72,4 +90,4 @@ ansible.builtin.cron: name: Auth failure alerting special_time: hourly - job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} {{ alerting_file_path }} -e 'alert_email_address={{ hostvars['127.0.0.1']['alert_email_address'] }}'" + job: "{{ ansible_playbook_path.stdout.split(' ')[1] }} {{ alerting_file_path }} -e 'alert_email_address={{ hostvars['localhost']['alert_email_address'] }}' -e 'smtp_sender_address={{ hostvars['localhost']['smtp_server'][0]['sender_address'] }}'" # noqa: yaml[line-length] diff --git a/security/roles/secure_login_node/tasks/main.yml b/security/roles/secure_login_node/tasks/main.yml index 3a35683b3..7ac95504d 100644 --- a/security/roles/secure_login_node/tasks/main.yml +++ b/security/roles/secure_login_node/tasks/main.yml @@ -36,8 +36,8 @@ - name: Configure security features when: + - hostvars['127.0.0.1'].freeipa_support or hostvars['127.0.0.1'].openldap_support - hostvars['127.0.0.1']['enable_secure_login_node'] - - ansible_host in groups['login'][0] block: - name: Install Apparmor on Leap ansible.builtin.include_tasks: install_apparmor.yml diff --git a/security/roles/secure_login_node/tasks/restrict_nonessentials.yml b/security/roles/secure_login_node/tasks/restrict_nonessentials.yml index b05bbe3d9..ce0ba976f 100644 --- a/security/roles/secure_login_node/tasks/restrict_nonessentials.yml +++ b/security/roles/secure_login_node/tasks/restrict_nonessentials.yml @@ -54,6 +54,8 @@ enabled: false when: - "'telnet' in hostvars['127.0.0.1']['disable_services']" + - "'inetd.service' in ansible_facts.services" + - ansible_facts.services['inetd.service'].status in service_status - ansible_distribution | lower == ubuntu_os - name: Disabling telnet service 'Rhel/Rocky' diff --git a/security/roles/secure_login_node/vars/main.yml b/security/roles/secure_login_node/vars/main.yml index 3c222a16c..7ae6daa11 100644 --- a/security/roles/secure_login_node/vars/main.yml +++ b/security/roles/secure_login_node/vars/main.yml @@ -18,26 +18,39 @@ sshd_conf_file: /etc/ssh/sshd_config # 
Usage: configure_alerting.yml community_general_collection: "{{ offline_ansible_galaxy_collection_path }}/community.general:4.4.0.tar.gz" -mail_packages: +mail_packages_redhat: - mailx - postfix +mail_packages_ubuntu: + - mailutils + - postfix alerting_file_path: /root/auth_failure_check.yml hosts_file_mode: "0644" local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml" ansible_name: "ansible" secure_ansible_version: "7.7.0" cryptography_name: "cryptography" -cryptography_version: "41.0.7" +cryptography_version: "44.0.0" jinja_name: "jinja2" jinja_version: "3.1.2" ansible_galaxy_path: "community.general.tar.gz" file_mode: "0644" +repo_retries: 5 +repo_delay: 10 +update_repos_fail_msg: "Failed to update repos. Verify internet availability on Omnia Infrastructure Manager." +software_config_json_file: "{{ role_path }}/../../../input/software_config.json" +secure_login_node_packages_file: "{{ role_path }}/../../../input/config/{{ cluster_os_type }}/{{ cluster_os_version }}/secure_login_node.json" +postfix_conf_path: /etc/postfix/main.cf +ping_msg: "100% packet loss" +smtp_server_reach_msg: "Failed. SMTP server is not reachable from the login node. Please provide a valid, reachable SMTP server host in the smtp_server variable +of input/login_node_security_config.yml" # Usage: configure_alerting_ubuntu python_packages: - - python3.9 - - python3.9-distutils - - python3-pip + - "{{ python_version }}" + - "{{ python_version }}-venv" + - "{{ python_version }}-distutils" + - "{{ python_version.split('.')[0] }}-pip" # Usage: install_snoopy.yml snoopy_packages: diff --git a/security/roles/security_validation/tasks/encrypt_security_config.yml b/security/roles/security_validation/tasks/encrypt_security_config.yml index 6fb8098a0..2595241b5 100644 --- a/security/roles/security_validation/tasks/encrypt_security_config.yml +++ b/security/roles/security_validation/tasks/encrypt_security_config.yml @@ -15,7 +15,7 @@ - name: Encrypt input config file ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ security_config_file }} --vault-password-file {{ security_vaultname }} + ansible-vault encrypt {{ security_config_file }} --vault-password-file {{ security_vaultname }} changed_when: false - name: Update config_content permission diff --git a/security/roles/security_validation/tasks/fetch_security_inputs.yml b/security/roles/security_validation/tasks/fetch_security_inputs.yml index 5faf00aae..7082f4601 100644 --- a/security/roles/security_validation/tasks/fetch_security_inputs.yml +++ b/security/roles/security_validation/tasks/fetch_security_inputs.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -47,14 +47,36 @@ success_msg: "{{ session_timeout_success_msg }}" fail_msg: "{{ session_timeout_fail_msg }}" -- name: Validate alert_email_address - ansible.builtin.assert: - that: - - email_search_key in alert_email_address - - alert_email_address | length < email_max_length - success_msg: "{{ alert_email_success_msg }}" - fail_msg: "{{ alert_email_fail_msg }}" +- name: Validate email alerting inputs when: alert_email_address | length > 1 + block: + - name: Set alert_email_list + ansible.builtin.set_fact: + alert_email_list: "{{ alert_email_address.split(',') }}" + + - name: Validate alert_email_address + ansible.builtin.assert: + that: + - email_search_key in item + - item | length < email_max_length + success_msg: "{{ alert_email_success_msg }}" + fail_msg: "{{ alert_email_fail_msg }}" + with_items: "{{ alert_email_list }}" + when: alert_email_address | length > 1 + + - name: Validate smtp_server details when alert_email_address is provided + ansible.builtin.assert: + that: + - smtp_server | list | length == 1 + - smtp_server[0]['host'] is defined + - smtp_server[0]['port'] is defined + - smtp_server[0]['sender_address'] is defined + - smtp_server[0]['host'] | length > 1 + - smtp_server[0]['port'] | length > 1 + - smtp_server[0]['sender_address'] | length > 1 + - email_search_key in smtp_server[0]['sender_address'] + fail_msg: "{{ smtp_server_fail_msg }}" + when: alert_email_address | length > 1 - name: Warning - alert_email_address is empty ansible.builtin.debug: @@ -89,6 +111,15 @@ success_msg: "{{ allow_deny_success_msg }}" fail_msg: "{{ allow_deny_fail_msg }}" +- name: Pause for 15 seconds after warning + ansible.builtin.pause: + seconds: "{{ warning_wait_time_warning }}" + prompt: "{{ root_user_absence }}" + when: + - "user_list is defined and user_list | length > 1" + - "'root' not in user_list" + - "allow_deny == 'allow'" + - name: Initialize variables for restrict_softwares ansible.builtin.set_fact: restrict_program_status: false diff --git a/security/roles/security_validation/tasks/fetch_software_config.yml b/security/roles/security_validation/tasks/fetch_software_config.yml index c070341eb..028568850 100644 --- a/security/roles/security_validation/tasks/fetch_software_config.yml +++ b/security/roles/security_validation/tasks/fetch_software_config.yml @@ -46,6 +46,13 @@ msg: "{{ freeipa_and_openldap_true_fail_msg }}" when: freeipa_support and openldap_support +- name: Warn when neither freeipa nor openldap is provided + ansible.builtin.pause: + seconds: "{{ ipa_support_wait_time }}" + prompt: "{{ ipa_support_warning }}" + when: not (freeipa_support or openldap_support) + failed_when: false + - name: Set facts for authentication system ansible.builtin.set_fact: + authentication_system: "{{ 'openldap' if openldap_support else 'freeipa' if freeipa_support }}" diff --git a/security/roles/security_validation/tasks/include_omnia_config.yml b/security/roles/security_validation/tasks/include_omnia_config.yml index 78e70f1da..038cfae31 100644 --- a/security/roles/security_validation/tasks/include_omnia_config.yml +++ b/security/roles/security_validation/tasks/include_omnia_config.yml @@ -13,11 +13,6 @@ # limitations under the License.
--- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check if omnia config file is encrypted ansible.builtin.command: cat {{ omnia_config_file }} changed_when: false @@ -26,7 +21,7 @@ - name: Decrypt omnia_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vaultname }} + ansible-vault decrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vaultname }} when: "'$ANSIBLE_VAULT;' in config_content.stdout" changed_when: true @@ -60,7 +55,7 @@ - name: Encrypt input config file ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vaultname }} + ansible-vault encrypt {{ omnia_config_file }} --vault-password-file {{ omnia_vaultname }} changed_when: false - name: Update config_content permission diff --git a/security/roles/security_validation/tasks/include_security_config.yml b/security/roles/security_validation/tasks/include_security_config.yml index 91f4a4f20..4fed3a807 100644 --- a/security/roles/security_validation/tasks/include_security_config.yml +++ b/security/roles/security_validation/tasks/include_security_config.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check if security config file is encrypted ansible.builtin.command: cat {{ security_config_file }} changed_when: false @@ -26,7 +21,7 @@ - name: Decrypt security_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ security_config_file }} --vault-password-file {{ security_vaultname }} + ansible-vault decrypt {{ security_config_file }} --vault-password-file {{ security_vaultname }} when: "'$ANSIBLE_VAULT;' in config_content.stdout" changed_when: true diff --git a/security/roles/security_validation/tasks/ldap_prereq.yml b/security/roles/security_validation/tasks/ldap_prereq.yml index 2667fa2a9..10cbc3285 100644 --- a/security/roles/security_validation/tasks/ldap_prereq.yml +++ b/security/roles/security_validation/tasks/ldap_prereq.yml @@ -13,11 +13,17 @@ # limitations under the License.
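# Design note on the vault change above: calling ansible-vault directly,
# instead of parsing `whereis` output, avoids an index error when whereis
# returns no path and resolves the binary through the caller's PATH (for
# example, the active virtual environment).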
--- -- name: Get OpenLDAP local git repo status - ansible.builtin.uri: - url: "{{ offline_git_path }}/{{ ltb_git_repo_path }}.tar.gz" - return_content: false - register: ldap_repo_status +- name: Verify OpenLDAP offline git repo + block: + - name: Get OpenLDAP local git repo status + ansible.builtin.uri: + url: "{{ offline_git_path }}/{{ ltb_git_repo_path }}.tar.gz" + return_content: false + register: ldap_repo_status + rescue: + - name: OpenLDAP local git repo not found + ansible.builtin.fail: + msg: "{{ ldap_repo_failure_msg }}" - name: Validate OpenLDAP local git repo ansible.builtin.assert: @@ -38,11 +44,33 @@ dest: "{{ ldap_dir }}" mode: "{{ file_permission }}" -- name: Setup LTB repo to control plane +- name: Setup LTB repo to Omnia Infrastructure Manager ansible.builtin.unarchive: src: "{{ ldap_dir }}/{{ ltb_git_repo_path }}.tar.gz" dest: "{{ ldap_dir }}" +- name: Load openldap.json + ansible.builtin.set_fact: + openldap_packages_json: "{{ lookup('file', openldap_packages_file) | from_json }}" + +- name: Check if commit id is mentioned for LTB + ansible.builtin.set_fact: + ltb_commit: true + ltb_commit_id: "{{ item.commit }}" + loop: "{{ hostvars['localhost']['openldap_packages_json']['openldap']['cluster'] }}" + when: + - item.type == 'git' + - "'ansible-role-ldaptoolbox-openldap' in item.package" + - "item.commit is defined" + +- name: Checkout specific commit for LTB + ansible.builtin.command: "git checkout {{ ltb_commit_id }}" # noqa: command-instead-of-module + changed_when: false + when: + - ltb_commit is defined + args: + chdir: "{{ ldap_git_dir }}" + - name: Remove public access tasks ansible.builtin.replace: dest: "{{ ldap_dir }}/{{ ltb_git_repo_path }}/tasks/main.yml" diff --git a/security/roles/security_validation/tasks/validate_input_params.yml b/security/roles/security_validation/tasks/validate_input_params.yml index 91ceac250..d60d9eaa8 100644 --- a/security/roles/security_validation/tasks/validate_input_params.yml +++ b/security/roles/security_validation/tasks/validate_input_params.yml @@ -47,3 +47,14 @@ success_msg: "{{ domain_name_success_msg }}" fail_msg: "{{ domain_name_fail_msg }}" when: authentication_system == 'openldap' or authentication_system == 'freeipa' + +- name: Validate http_proxy, https_proxy and no_proxy configured as environment variables + ansible.builtin.assert: + that: + - lookup('env', 'http_proxy') | length > 1 + - lookup('env', 'https_proxy') | length > 1 + - lookup('env', 'no_proxy') | length > 1 + - oim_hostname in lookup('env', 'no_proxy') + - admin_nic_ip in lookup('env', 'no_proxy') + fail_msg: "{{ proxy_env_fail_msg }}" + when: proxy_status diff --git a/security/roles/security_validation/vars/main.yml b/security/roles/security_validation/vars/main.yml index b17ab7de5..8e210b146 100644 --- a/security/roles/security_validation/vars/main.yml +++ b/security/roles/security_validation/vars/main.yml @@ -23,7 +23,7 @@ hosts_file_dest: "/etc/hosts" hosts_file_mode: "0644" security_config_syntax_fail_msg: "Failed. Syntax errors present in security_config.yml. Fix errors and re-run playbook again." -# Usage: fetch_inputs.yml +# Usage: validate_input_params.yml authentication_system_success_msg: "authentication_system variable successfully validated" authentication_system_fail_msg: "Failed. authentication_system variable in security_config.yml should be either openldap or freeipa" min_length: 8 @@ -38,11 +38,14 @@ fail_msg_directory_manager_password: "Failed. 
Incorrect format provided for dire success_msg_kerberos_admin_password: "kerberos_admin_password successfully validated" fail_msg_kerberos_admin_password: "Failed. Incorrect format provided for kerberos_admin_password" input_config_failure_msg: "Failed. Since, authentication_system is freeipa, provide few additional parameters in input/security_config.yml." -secure_login_node_success_msg: "enable_secure_login_node successfully validated" +secure_login_node_success_msg: "secure_login_node present in input/software_config.json." secure_login_node_fail_msg: "Failed. enable_secure_login_node should be either true or false" -secure_login_node_warning_msg: "[WARNING] enable_secure_login_node is true in security_config.yml and +secure_login_node_warning_msg: "[WARNING] secure_login_node present in input/software_config.json and login group not provided with login node IP address in inventory. Skipping tasks for secure login node configuration" ubuntu_freeipa_support_fail_msg: "Failed. freeipa is not supported on ubuntu. Remove freeipa from software_config.json and rerun the playbook" +proxy_env_fail_msg: "Failed. The values for http_proxy and https_proxy in the +proxy variable of the site_config.yml should be set as environment variables http_proxy and https_proxy in the Omnia Infrastructure Manager. +The no_proxy environment variable should include the Omnia Infrastructure Manager hostname and the admin network IP address." # Usage: fetch_security_inputs.yml max_failures_success_msg: "max_failures successfully validated" @@ -53,6 +56,8 @@ lockout_duration_success_msg: "lockout_duration successfully validated" lockout_duration_fail_msg: "Failed. Incorrect lockout_duration value in login_node_security_config.yml" session_timeout_success_msg: "session_timeout successfully validated" session_timeout_fail_msg: "Failed. Incorrect session_timeout value in login_node_security_config.yml" +root_user_absence: "Warning: root user not provided in the allow list of users. 'root' will not have the privilege to access the login node." +warning_wait_time_warning: 15 max_failures_default_value: 3 failure_reset_interval_min_value: 30 failure_reset_interval_max_value: 60 @@ -73,6 +78,8 @@ restrict_program_support_success_msg: "restrict_program_support successfully val restrict_program_support_failure_msg: "Failed. Accepted values are true or false." restrict_softwares_success_msg: "restrict_softwares successfully validated" restrict_softwares_failure_msg: "Warning. Values should be comma separated. The supported services are telnet,lpd,bluetooth,rlogin,rexec." +smtp_server_fail_msg: "Failed. smtp_server details are mandatory when alert_email_address is provided in login_node_security_config.yml. +Provide a single SMTP server entry with host, port, and sender_address for sending alert emails."
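# For reference, the smtp_server validation that smtp_server_fail_msg above
# reports on expects a single-entry list in input/login_node_security_config.yml,
# shaped roughly as follows (values are illustrative placeholders, not defaults):
# smtp_server:
#   - host: smtp.example.com
#     port: "25"
#     sender_address: omnia-alerts@example.com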
# Usage: fetch_ldap_client_inputs.yml ldap_client_config_failure_msg: "LDAP Client Input parameters cannot be empty when ldap_required is set to true" @@ -101,15 +108,18 @@ local_repo_access_config_file: "/opt/omnia/offline/local_repo_access.yml" # Usage: generate_ldap_password_hashes.yml hashing_python_file: "{{ role_path }}/files/generate_hash.py" -python_version: python3.9 +python_version: "{{ ansible_python_interpreter }}" # Usage: ldap_prereq.yml +openldap_packages_file: "{{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/openldap.json" ldap_dir: "/opt/omnia/ldap" ltb_git_repo_path: "ansible-role-ldaptoolbox-openldap" +ldap_git_dir: "/opt/omnia/ldap/ansible-role-ldaptoolbox-openldap" file_permission: "0644" openldap_software_failure_msg: "Failed, OpenLDAP software stack is not present in software_config.json file. Update openldap software stack in software_config.json and execute local_repo.yml." -ldap_repo_failure_msg: "Failed, Local repo not present for OpenLDAP. Execute local_repo.yml again." +ldap_repo_failure_msg: "Failed. The OpenLDAP repository is missing. Please run the local_repo.yml playbook to download the required OpenLDAP packages. +Once the playbook has been successfully executed, you can rerun the original playbook." # fetch_software_config.yml software_config_json_file: "{{ role_path }}/../../../input/software_config.json" @@ -118,6 +128,10 @@ Please give one of them in software_config.json and execute local_repo.yml again freeipa_and_openldap_true_fail_msg: "Both freeipa and openldap are present in software_config.json. Please give only one of them in software_config.json" compute_os_ubuntu: "ubuntu" +ipa_support_wait_time: 30 +ipa_support_warning: "Skipping the configuration of the secure login node +because the software_config.json file does not contain either 'freeipa' or 'openldap' as the authentication system. +To enable the configuration of the secure login node, please provide either 'freeipa' or 'openldap' in the software_config.json file."
# secure_login_node_prereq.yml snoopy_package: "install-snoopy" diff --git a/security/tests/files/test_external_ldap.yml b/security/tests/files/test_external_ldap.yml index 598e828d7..0636177b1 100644 --- a/security/tests/files/test_external_ldap.yml +++ b/security/tests/files/test_external_ldap.yml @@ -20,6 +20,7 @@ - /root/omnia/security/tests/test_vars/test_ldap_vars.yml tasks: - name: Validate LDAP client installation on client nodes + tags: LDAP_TC_001 block: - name: Identify presence of LDAP client installation ansible.legacy.shell: "rpm -qa | grep -i openldap" @@ -32,7 +33,6 @@ - '"openldap" in lclient_installed_msg.stdout' success_msg: "{{ ldap_client_installation_success_msg }}" fail_msg: "{{ ldap_client_installation_fail_msg }}" - tags: LDAP_TC_001 # Testcase to create LDAP user in server - name: OMNIA_1.4_LDAP_TC_002 @@ -41,6 +41,7 @@ - /root/omnia/security/tests/test_vars/test_ldap_vars.yml tasks: - name: Create a user in LDAP server + tags: LDAP_TC_002, LDAP_TC_003 block: - name: Create random cn for the user ansible.legacy.shell: echo "{{ ldap_cn_var + random_number }}" @@ -93,7 +94,6 @@ - '"{{ ldap_user_cn.stdout }}" in ldap_add_user_msg.stdout' success_msg: "{{ ldap_add_user_success_msg }} {{ ldap_user_cn.stdout }}" fail_msg: "{{ ldap_add_user_fail_msg }}" - tags: LDAP_TC_002, LDAP_TC_003 # Testcase to validate presence of created user in LDAP clients when # login_node_required and ldap_required is true @@ -103,9 +103,11 @@ - /root/omnia/security/tests/test_vars/test_ldap_vars.yml tasks: - name: Validate presence of user created in server on clients + tags: LDAP_TC_003 block: - name: Identify presence of created user on LDAP clients - ansible.legacy.shell: "ldapsearch -x -b '{{ user_olcSuffix }}' '(cn={{ ldap_user_cn.stdout }})'" + ansible.legacy.shell: + cmd: "ldapsearch -x -b '{{ user_olcSuffix }}' '(cn={{ ldap_user_cn.stdout }})'" register: ldap_user_search_msg changed_when: false @@ -115,7 +117,6 @@ - '"{{ ldap_user_cn.stdout }}" in ldap_user_search_msg.stdout' success_msg: "{{ ldap_user_search_client_success_msg }}" fail_msg: "{{ ldap_user_search_client_fail_msg }}" - tags: LDAP_TC_003 # Testcase to validate LDAP client installation on client nodes when # login_node_required: false @@ -126,6 +127,7 @@ - /root/omnia/security/tests/test_vars/test_ldap_vars.yml tasks: - name: Validate presence of client installation on client nodes + tags: LDAP_TC_004 block: - name: Identify presence of LDAP client installation on manager and compute node ansible.legacy.shell: "rpm -qa | grep -i openldap" @@ -162,7 +164,6 @@ - '"openldap" not in lclient_installed_msg_ln.results[0].stdout' success_msg: "{{ ldap_client_installation_login_success_msg }}" fail_msg: "{{ ldap_client_installation_login_fail_msg }}" - tags: LDAP_TC_004 # Testcase to validate free IPA installation on client nodes when # ldap_required: false @@ -173,6 +174,7 @@ - /root/omnia/security/tests/test_vars/test_ldap_vars.yml tasks: - name: Validate presence of IPA client installation on client nodes + tags: LDAP_TC_005 block: - name: Identify presence of IPA client installation on client nodes ansible.legacy.shell: "ipa help topics | wc -l" @@ -185,4 +187,3 @@ - '"{{ ipa_help_topics_count.stdout }}" | int > 1' success_msg: "{{ ipa_installation_success_msg }}" fail_msg: "{{ ipa_installation_fail_msg }}" - tags: LDAP_TC_005 \ No newline at end of file diff --git a/security/tests/test_external_ldap.yml b/security/tests/test_external_ldap.yml index 6deb08c00..7c370c733 100644 --- a/security/tests/test_external_ldap.yml +++ 
b/security/tests/test_external_ldap.yml @@ -19,6 +19,7 @@ - "{{ playbook_dir }}/test_vars/test_ldap_vars.yml" tasks: - name: To validate Openldap, Sssd, SSSD-LDAP, Oddjobmkdir, Openssl + tags: TC_001 block: - name: Find Openldap Installation ansible.legacy.shell: "rpm -qa | grep openldap*" @@ -70,7 +71,6 @@ - "'openssl' in openssl.stdout" success_msg: "{{ openssl_success_msg }}" fail_msg: "{{ openssl_fail_msg }}" - tags: TC_001 # Checking Services Status For Ldap server - name: Validate SSSD and Oddjob-Mkdir Status hosts: manager,compute,login @@ -78,6 +78,7 @@ - "{{ playbook_dir }}/test_vars/test_ldap_vars.yml" tasks: - name: To validate user creation on manager,compute and Login node + tags: TC_002 block: - name: Oddjobmaker service status ansible.legacy.shell: "systemctl status oddjob-mkhomedir.service" @@ -99,7 +100,6 @@ - "'Active: active (running)' in sssd.stdout" success_msg: "{{ sssd_status_success_msg }}" fail_msg: "{{ sssd_status_fail_msg }}" - tags: TC_002 # Testcase to validate presence of created user in LDAP clients when # login_node_required and ldap_required is true - name: OMNIA_1.4_LDAP_TC_003 @@ -108,9 +108,11 @@ - "{{ playbook_dir }}/test_vars/test_ldap_vars.yml" tasks: - name: Validate presence of user created in server on clients + tags: LDAP_TC_003 block: - name: Identify presence of created user on LDAP clients - ansible.legacy.shell: "ldapsearch -x -D {{ user_olcRootDN }},{{ user_olcSuffix }} -w {{ slapd_password }} -s onelevel -b 'uid={{ user }},{{ user_group }},{{ user_olcSuffix }}'" + ansible.legacy.shell: + cmd: "ldapsearch -x -D {{ user_olcRootDN }},{{ user_olcSuffix }} -w {{ slapd_password }} -s onelevel -b 'uid={{ user }},{{ user_group }},{{ user_olcSuffix }}'" # noqa: yaml[line-length] register: ldap_user_search_msg changed_when: false @@ -120,13 +122,14 @@ - '"{{ user }}" in ldap_user_search_msg.stdout' success_msg: "{{ ldap_user_search_client_success_msg }}" fail_msg: "{{ ldap_user_search_client_fail_msg }}" - tags: LDAP_TC_003 + - name: Validate user Creation on login compute and manager node hosts: manager,compute,login vars_files: - "{{ playbook_dir }}/test_vars/test_ldap_vars.yml" tasks: - name: To validate user creation on manager,compute and Login node + tags: TC_003 block: - name: Find Openldap Installation ansible.legacy.shell: "getent passwd {{ user }}" @@ -138,7 +141,7 @@ - "'{{ user }}' in usr.stdout" success_msg: "{{ user_success_msg }}" fail_msg: "{{ user_fail_msg }}" - tags: TC_003 + - name: Check user Login hosts: all vars_files: @@ -156,8 +159,6 @@ - "'{{ user }}' in usr.stdout" success_msg: "{{ user_success_msg }}" fail_msg: "{{ user_fail_msg }}" - - # Testcase to validate LDAP client installation on client nodes when # login_node_required and ldap_required is true # Testcase to create LDAP user in server @@ -167,6 +168,7 @@ - "{{ playbook_dir }}/test_vars/test_ldap_vars.yml" tasks: - name: Create a user in LDAP server + tags: LDAP_TC_002, LDAP_TC_003 block: - name: Create random cn for the user ansible.legacy.shell: echo "{{ ldap_cn_var + random_number }}" @@ -219,4 +221,3 @@ - '"{{ ldap_user_cn.stdout }}" in ldap_add_user_msg.stdout' success_msg: "{{ ldap_add_user_success_msg }} {{ ldap_user_cn.stdout }}" fail_msg: "{{ ldap_add_user_fail_msg }}" - tags: LDAP_TC_002, LDAP_TC_003 diff --git a/security/tests/test_passwordless.yml b/security/tests/test_passwordless.yml index 17875e12b..2fab54819 100644 --- a/security/tests/test_passwordless.yml +++ b/security/tests/test_passwordless.yml @@ -16,9 +16,9 @@ vars_files: - 
test_vars/test_passwordless_vars.yml tasks: - - name: Validate SSH from Manager node to Compute node + - name: Validate SSH from Manager node to Compute node # noqa: no-changed-when become: false - ansible.builtin.shell: ssh {{ ssh_user }}@{{ inventory_hostname }} "w {{ ssh_user }}" + ansible.builtin.shell: ssh {{ ssh_user }}@{{ inventory_hostname }} "w {{ ssh_user }}" # noqa: command-instead-of-shell with_items: "{{ groups['compute'] }}" register: ssh_result_manager @@ -32,9 +32,9 @@ vars_files: - test_passwordless_vars.yml tasks: - - name: Validate SSH from Compute node to Compute node + - name: Validate SSH from Compute node to Compute node # noqa: no-changed-when become: false - ansible.builtin.shell: ssh {{ ssh_user }}@{{ inventory_hostname }} "w {{ ssh_user }}" + ansible.builtin.shell: ssh {{ ssh_user }}@{{ inventory_hostname }} "w {{ ssh_user }}" # noqa: command-instead-of-shell with_items: "{{ groups['compute'] }}" register: ssh_result_compute @@ -48,13 +48,13 @@ vars_files: - test_passwordless_vars.yml tasks: - - name: Validate SSH from Compute node to Manager node + - name: Validate SSH from Compute node to Manager node # noqa: no-changed-when become: false - ansible.builtin.shell: ssh {{ ssh_user }}@{{ inventory_hostname }} "w {{ ssh_user }}" + ansible.builtin.shell: ssh {{ ssh_user }}@{{ inventory_hostname }} "w {{ ssh_user }}" # noqa: command-instead-of-shell with_items: "{{ groups['manager'] }}" register: ssh_result_compute_manager - name: Display SSH result ansible.builtin.debug: var: ssh_result_compute_manager.results - tags: TC003 \ No newline at end of file + tags: TC003 diff --git a/security/tests/test_vars/test_ldap_vars.yml b/security/tests/test_vars/test_ldap_vars.yml index cd9cc6b02..a16eb731f 100644 --- a/security/tests/test_vars/test_ldap_vars.yml +++ b/security/tests/test_vars/test_ldap_vars.yml @@ -29,7 +29,7 @@ ldap_add_user_fail_msg: "LDAP user creation failed" ldap_user_search_client_success_msg: "Created user available on client nodes" ldap_user_search_client_fail_msg: "Created user is not available on client nodes" -##Client Packages Messages and User Creation message +## Client Packages Messages and User Creation message openldap_success_msg: "Openldap installed successfully" openldap_fail_msg: "Openldap not installed" sssd_success_msg: "SSSD installed successfully" diff --git a/security/tests/test_vars/test_passwordless_vars.yml b/security/tests/test_vars/test_passwordless_vars.yml index 2425adb9e..c8c1fba04 100644 --- a/security/tests/test_vars/test_passwordless_vars.yml +++ b/security/tests/test_vars/test_passwordless_vars.yml @@ -13,4 +13,4 @@ # limitations under the License. --- -ssh_user: user1 \ No newline at end of file +ssh_user: user1 diff --git a/security/user_passwordless_ssh.yml b/security/user_passwordless_ssh.yml index a552a0d09..dea9cb0b8 100644 --- a/security/user_passwordless_ssh.yml +++ b/security/user_passwordless_ssh.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,10 @@ # limitations under the License. 
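The passwordless SSH checks in test_passwordless.yml above shell out with plain ssh, which can block on a password prompt when key-based login is broken. A hedged fail-fast variant, not part of this patch, using stock OpenSSH options (BatchMode, ConnectTimeout) and looping over the group items directly:

- name: Validate SSH from Manager node to Compute node (fail-fast sketch)  # noqa: no-changed-when
  become: false
  ansible.builtin.shell: ssh -o BatchMode=yes -o ConnectTimeout=10 {{ ssh_user }}@{{ item }} "w {{ ssh_user }}"
  with_items: "{{ groups['compute'] }}"
  register: ssh_result_manager

With BatchMode enabled, ssh exits non-zero instead of prompting, so a missing key surfaces as a task failure rather than a hang.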
--- +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Validate input parameters hosts: localhost any_errors_fatal: true diff --git a/server_spec_update/roles/add_nic_network/vars/main.yml b/server_spec_update/roles/add_nic_network/vars/main.yml deleted file mode 100644 index 8c6bb3d8c..000000000 --- a/server_spec_update/roles/add_nic_network/vars/main.yml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Usage: update_new_nic_network.yml -python_version: "python3.9" -update_nic_nw_path: "{{ role_path }}/files/add_nic_xcat_network.py" -nw_spec_path: "{{ role_path }}/../../../input/network_spec.yml" -cal_path: "{{ role_path }}/../../../discovery/roles/discovery_mechanism/mtms/files" -metadata_nicinfo_path: "/opt/omnia/.data/nic_metadata.yml" -file_perm: "0644" diff --git a/server_spec_update/roles/metadata_creation/tasks/main.yml b/server_spec_update/roles/metadata_creation/tasks/main.yml deleted file mode 100644 index ed91835cd..000000000 --- a/server_spec_update/roles/metadata_creation/tasks/main.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Create metadata file - when: add_network_status - block: - - name: Create nic metadata file - ansible.builtin.include_tasks: create_nic_metadata.yml - - - name: Validate input parameters if metadata exits - when: metadata_status.stat.exists - block: - - name: Validate metadata parameters - ansible.builtin.include_tasks: validate_metadata_params.yml diff --git a/server_spec_update/roles/update_node_object/vars/main.yml b/server_spec_update/roles/update_node_object/vars/main.yml deleted file mode 100644 index 31b4448bd..000000000 --- a/server_spec_update/roles/update_node_object/vars/main.yml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Usage: update_nodes.yml -python_version: "python3.9" -network_spec_file_path: "{{ role_path }}/../../../input/network_spec.yml" -update_node_objects_path: "{{ role_path }}/files/update_node_objects.py" -omnia_db_path: "{{ role_path }}/../../../discovery/roles/db_operations/files" -server_sepc_update_success_msg: "server_spec_update.yml execution is successful. Verify the networks configured on the nodes. -Networks might not be configured if invalid NIC names are provided in the input file, server_spec.yml. -If vlan is not configured for a NIC ensure vlan name is provided in the format NIC.vlan_id(eth1.101) in server_spec.yml and re-run the playbook." diff --git a/storage/ansible.cfg b/storage/ansible.cfg index a2f1dbee3..54bb28c6d 100644 --- a/storage/ansible.cfg +++ b/storage/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 diff --git a/storage/powervault_input.yml b/storage/powervault_input.yml index 5675386a5..513b387bb 100644 --- a/storage/powervault_input.yml +++ b/storage/powervault_input.yml @@ -124,4 +124,4 @@ snmp_trap_destination: "" # Provide the snmp community name required # Default value:"public" -snmp_community_name: "public" \ No newline at end of file +snmp_community_name: "public" diff --git a/storage/roles/nfs_iscsi/tasks/check_prerequisites.yml b/storage/roles/nfs_iscsi/tasks/check_prerequisites.yml deleted file mode 100644 index d86f7d24b..000000000 --- a/storage/roles/nfs_iscsi/tasks/check_prerequisites.yml +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -# Include base_vars.yml -- name: Include base_vars of control plane - include_vars: "{{ role_path }}/../../control_plane/input_params/base_vars.yml" - -# Check nfs_node_status -- block: - - name: Initialize variables - set_fact: - nfs_node_status: false - - - name: Set NFS node status - set_fact: - nfs_node_status: true - when: - - groups['nfs'] is defined - - groups['nfs'] | length | int > 0 - - - name: NFS group to contain exactly 1 node - assert: - that: "groups['nfs'] | length | int == 1" - fail_msg: "{{ nfs_node_group_fail_msg }}" - success_msg: "{{ nfs_node_group_success_msg }}" - when: nfs_node_status - -# Include omnia_config.yml - - name: Check if omnia_vault_key exists - stat: - path: "{{ role_path }}/../../{{ config_vaultname }}" - register: vault_key_result - - - name: Create ansible vault key if it does not exist - set_fact: - vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}" - when: not vault_key_result.stat.exists - - - name: Save vault key - copy: - dest: "{{ role_path }}/../../{{ config_vaultname }}" - content: | - {{ vault_key }} - force: yes - mode: "{{ vault_file_perm }}" - when: not vault_key_result.stat.exists - - - name: Check if omnia config file is encrypted - command: cat {{ role_path }}/../../{{ config_filename }} - changed_when: false - register: config_content - no_log: True - - - name: Decrpyt omnia_config.yml - command: >- - ansible-vault decrypt {{ role_path }}/../../{{ config_filename }} - --vault-password-file {{ role_path }}/../../{{ config_vaultname }} - when: "'$ANSIBLE_VAULT;' in config_content.stdout" - - - name: Include variable file omnia_config.yml - include_vars: "{{ role_path }}/../../{{ config_filename }}" - no_log: True - -# Include login_vars.yml - - - name: Check login_vars file is encrypted - command: cat "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" - changed_when: false - register: config_content - no_log: true - - - name: Decrpyt login_vars.yml - command: >- - ansible-vault decrypt "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" - --vault-password-file "{{ role_path }}/../../control_plane/{{ vault_filename }}" - changed_when: false - when: "'$ANSIBLE_VAULT;' in config_content.stdout" - - - name: Include variable file login_vars.yml - include_vars: "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" - no_log: true - when: powervault_support - -# Validate Powervault variables - -- name: Fetch powervault inputs - include_tasks: "../../cluster_validation/tasks/fetch_powervault_status.yml" - when: - - powervault_support - - nfs_node_status - -# Encrpyt omnia_config.yml file -- block: - - - name: Encrypt input config file - command: >- - ansible-vault encrypt {{ role_path }}/../../{{ config_filename }} - --vault-password-file {{ role_path }}/../../{{ config_vaultname }} - changed_when: false - -# Encrypt login_vars.yml file - - name: Create ansible vault key - set_fact: - vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}" - when: "'$ANSIBLE_VAULT;' not in config_content.stdout" - - - name: Save vault key - copy: - dest: "{{ role_path }}/../../control_plane/{{ vault_filename }}" - content: | - {{ vault_key }} - force: yes - mode: "{{ vault_file_perm }}" - when: "'$ANSIBLE_VAULT;' not in config_content.stdout" - - - name: Encrypt login_vars file - command: >- - ansible-vault encrypt "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" - --vault-password-file "{{ role_path }}/../../control_plane/{{ vault_filename }}" - changed_when: false - - 
- name: Update login_vars.yml permission - file: - path: "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" - mode: "{{ vault_file_perm }}" - when: powervault_support \ No newline at end of file diff --git a/storage/roles/nfs_iscsi/tasks/main.yml b/storage/roles/nfs_iscsi/tasks/main.yml deleted file mode 100644 index a544516ab..000000000 --- a/storage/roles/nfs_iscsi/tasks/main.yml +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: include base vars - include_vars: "{{ role_path }}/../../control_plane/input_params/base_vars.yml" - -- block: - - name: Include variable file for powervault - include_vars: "{{ pv_nfs_file }}" - - - block: - - name: Validate the nfs configuration - include_tasks: validate_nfs_config.yml - - - name: Configure the server - include_tasks: nfs_node_configure.yml - - - name: Configure the port of nfs_server - include_tasks: nfs_volume.yml - - - name: Mount the partitions - include_tasks: mount_me4_partitions.yml - - - name: Setup NFS server on the partitions - include_tasks: me4_nfs_server_setup.yml - when: powervault_protocol == 'iscsi' - - when: - - powervault_support - - hostvars['127.0.0.1']['nfs_node_status'] \ No newline at end of file diff --git a/storage/roles/nfs_sas/tasks/fetch_volume_details.yml b/storage/roles/nfs_sas/tasks/fetch_volume_details.yml index 76e24e28b..78c038030 100644 --- a/storage/roles/nfs_sas/tasks/fetch_volume_details.yml +++ b/storage/roles/nfs_sas/tasks/fetch_volume_details.yml @@ -13,7 +13,7 @@ # limitations under the License. 
--- -- name: Include base_vars of control plane +- name: Include base_vars of Omnia Infrastructure Manager ansible.builtin.include_vars: "{{ role_path }}/../../nfs_server_input.yml" - name: Set powervault username and password diff --git a/storage/roles/nfs_server/tasks/nfs_server_setup_redhat.yml b/storage/roles/nfs_server/tasks/nfs_server_setup_redhat.yml index 7a85095e7..b6329cb49 100644 --- a/storage/roles/nfs_server/tasks/nfs_server_setup_redhat.yml +++ b/storage/roles/nfs_server/tasks/nfs_server_setup_redhat.yml @@ -44,14 +44,14 @@ state: directory mode: "{{ nfs_share_dir_mode }}" loop: "{{ hostvars['127.0.0.1']['nfs_client_params'] }}" - when: item.server_ip == ansible_host + when: item.server_ip == ansible_host or hostvars['127.0.0.1']['admin_nic_ip'] in ansible_host - name: Adding NFS share entries in /etc/exports for nodes ansible.builtin.lineinfile: path: "{{ exports_file_path }}" line: "{{ item.server_share_path }} {{ hostvars['127.0.0.1']['admin_nic_subnet'] }}(rw,sync,no_root_squash)" with_items: "{{ hostvars['127.0.0.1']['nfs_client_params'] }}" - when: item.server_ip == ansible_host + when: item.server_ip == ansible_host or hostvars['127.0.0.1']['admin_nic_ip'] in ansible_host - name: Exporting the shared directories ansible.builtin.command: exportfs -r diff --git a/storage/roles/nfs_server/tasks/nfs_server_setup_ubuntu.yml b/storage/roles/nfs_server/tasks/nfs_server_setup_ubuntu.yml index 7f9c31c16..9ca06ac7b 100644 --- a/storage/roles/nfs_server/tasks/nfs_server_setup_ubuntu.yml +++ b/storage/roles/nfs_server/tasks/nfs_server_setup_ubuntu.yml @@ -34,14 +34,14 @@ state: directory mode: "{{ nfs_share_dir_mode }}" loop: "{{ hostvars['127.0.0.1']['nfs_client_params'] }}" - when: item.server_ip == ansible_host + when: item.server_ip == ansible_host or hostvars['127.0.0.1']['admin_nic_ip'] in ansible_host - name: Adding NFS share entries in /etc/exports ansible.builtin.lineinfile: path: "{{ exports_file_path }}" line: "{{ item.server_share_path }} {{ hostvars['127.0.0.1']['admin_nic_subnet'] }}(rw,sync,no_root_squash)" with_items: "{{ hostvars['127.0.0.1']['nfs_client_params'] }}" - when: item.server_ip == ansible_host + when: item.server_ip == ansible_host or hostvars['127.0.0.1']['admin_nic_ip'] in ansible_host - name: Exporting the shared directories ansible.builtin.command: exportfs -r diff --git a/storage/roles/powervault/vars/main.yml b/storage/roles/powervault/vars/main.yml index 5ccc45f02..ccb68dac3 100644 --- a/storage/roles/powervault/vars/main.yml +++ b/storage/roles/powervault/vars/main.yml @@ -62,8 +62,8 @@ up_port: [] # Usage: map_volume.yml access: rw -login_pv_file: "{{ playbook_dir }}/control_plane/input_params/login_vars.yml" -login_pv_vault_file: "{{ playbook_dir }}/control_plane/input_params/.login_vault_key" +login_pv_file: "{{ playbook_dir }}/oim/input_params/login_vars.yml" +login_pv_vault_file: "{{ playbook_dir }}/oim/input_params/.login_vault_key" # Usage: sas_map.yml temp1: 10 diff --git a/storage/roles/storage_validation/tasks/fetch_nfs_client_params.yml b/storage/roles/storage_validation/tasks/fetch_nfs_client_params.yml index 316c4a877..b5c00e6fe 100644 --- a/storage/roles/storage_validation/tasks/fetch_nfs_client_params.yml +++ b/storage/roles/storage_validation/tasks/fetch_nfs_client_params.yml @@ -119,9 +119,18 @@ - name: Create nfs_server_local group ansible.builtin.add_host: - name: "{{ item.server_ip }}" + name: "{{ admin_nic_ip }}" groups: "nfs_server_local" when: - nfs_server_support - item.server_ip == "localhost" or item.server_ip == 
admin_nic_ip + +- name: Setting up the NFS server on the Omnia Infrastructure Manager - {{ admin_nic_ip }} + ansible.builtin.pause: + prompt: "{{ oim_nfs_server_warning_msg }}" + seconds: "{{ warning_wait_time }}" + when: + - nfs_server_support + - item.server_ip == "localhost" or + item.server_ip == admin_nic_ip diff --git a/storage/roles/storage_validation/vars/main.yml b/storage/roles/storage_validation/vars/main.yml index 23f9c5050..536586eab 100644 --- a/storage/roles/storage_validation/vars/main.yml +++ b/storage/roles/storage_validation/vars/main.yml @@ -64,6 +64,10 @@ nfs_client_support_failure_msg: "Failed. Its mandatory to configure NFS during o Please provide required inputs in nfs_client_params of storage_config.yml" slurm_share_on_ubuntu_warning_msg: "[Warning] Slurm_share is set to true, Note ubuntu doesn't support slurm. However, nfs share will be created." compute_os_ubuntu: "ubuntu" +oim_nfs_server_warning_msg: | + "In storage_config.yml, under the nfs_client_params variable, server_ip is set to localhost/{{ admin_nic_ip }} and nfs_server is set to true. + The NFS server will be set up on the Omnia Infrastructure Manager using the admin NIC IP address {{ admin_nic_ip }}. + The playbook logs will contain the NFS server setup details for the Omnia Infrastructure Manager with IP {{ admin_nic_ip }}." # Usage: fetch_beegfs_inputs.yml, fetch_nfs_client_params.yml ping_msg: "100% packet loss" @@ -87,7 +91,7 @@ nfs_client_params_benchmarks_success_msg: "Entry found in nfs_client_params with # Usage: include_provision_metadata.yml provision_metadata_path: "/opt/omnia/.data/metadata.yml" -metadata_missing_fail_msg: "Failed. Missing /opt/omnia/.data/metadata.yml in control plane. +metadata_missing_fail_msg: "Failed. Missing /opt/omnia/.data/metadata.yml in Omnia Infrastructure Manager. Run discovery_provision.yml before executing omnia.yml/storage.yml playbook for creating metdata.yml file." # Usage: k8s_validations.yml diff --git a/storage/storage.yml b/storage/storage.yml index 8b15a4a40..be01a93ba 100644 --- a/storage/storage.yml +++ b/storage/storage.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,6 +17,10 @@ ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) +- name: Check if package manager is not locked + ansible.builtin.import_playbook: ../utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + - name: Validate storage input parameters hosts: localhost connection: local diff --git a/telemetry/add_node_idrac.yml b/telemetry/add_node_idrac.yml index bcfa31f23..d347c056b 100644 --- a/telemetry/add_node_idrac.yml +++ b/telemetry/add_node_idrac.yml @@ -13,28 +13,54 @@ # limitations under the License. 
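The localhost/admin-NIC handling above keys off entries in nfs_client_params from storage_config.yml. A hypothetical fragment that would take the Omnia Infrastructure Manager NFS-server path and trigger the pause warning (field values are illustrative):

nfs_client_params:
  - server_ip: localhost                 # or the admin NIC IP
    server_share_path: /mnt/omnia_share  # hypothetical share path
    client_share_path: /mnt/omnia_share  # hypothetical mount path
    nfs_server: true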
--- -- name: Validate grafana - hosts: localhost - connection: local - tasks: - - name: Validate create_idrac_inventory playbook - ansible.builtin.include_role: - name: common - tasks_from: validate_grafana.yml +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) -- name: Validate iDRAC inventory +- name: Add iDRAC node hosts: localhost connection: local tasks: - - name: Validate create_idrac_inventory playbook + - name: Include vars of iDRAC telemetry role + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/roles/idrac_telemetry/vars/main.yml" + + - name: Set oim_os + ansible.builtin.set_fact: + oim_os: "{{ ansible_distribution | lower }}" + + - name: Read software config file ansible.builtin.include_role: - name: common - tasks_from: create_idrac_inventory.yml + name: telemetry_validation + tasks_from: read_software_config.yml -- name: Initiate telemetry for iDRACs - hosts: localhost - connection: local - gather_facts: false - tasks: - - name: Initiate telemetry - ansible.builtin.include_tasks: "{{ playbook_dir }}/roles/idrac_telemetry/tasks/initiate_telemetry.yml" + - name: Check if telemetry entry is present in software_config.json + environment: + no_proxy: "{{ groups['idrac'] | join(',') }}" + when: telemetry_entry_present + block: + - name: Validate iDRAC inventory playbook + ansible.builtin.include_role: + name: telemetry_validation + tasks_from: validate_idrac_inventory.yml + + - name: Validate telemetry parameters + ansible.builtin.include_role: + name: telemetry_validation + tasks_from: validate_telemetry_config.yml + + - name: Deploy iDRAC telemetry + when: idrac_telemetry_support + block: + - name: Verify iDRAC telemetry pods are running + ansible.builtin.include_role: + name: idrac_telemetry + tasks_from: install_check.yml + + - name: Fail when iDRAC pods are not running + when: install_idrac_telemetry + ansible.builtin.fail: + msg: "{{ add_idrac_fail_msg }}" + + - name: Initiate telemetry for iDRACs + ansible.builtin.include_tasks: "{{ playbook_dir }}/roles/idrac_telemetry/tasks/initiate_telemetry.yml" diff --git a/telemetry/ansible.cfg b/telemetry/ansible.cfg index 55750da0c..5792c4cdf 100644 --- a/telemetry/ansible.cfg +++ b/telemetry/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/telemetry/roles/grafana/tasks/plugins.yml b/telemetry/roles/grafana/tasks/plugins.yml index 0e40cb961..9011f02c9 100644 --- a/telemetry/roles/grafana/tasks/plugins.yml +++ b/telemetry/roles/grafana/tasks/plugins.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,8 +20,14 @@ version: "{{ grafana_plugins_stable_commit }}" - name: Wait for grafana pod to come to ready state - ansible.builtin.command: kubectl wait --for=condition=ready --timeout="{{ grafana_pod_timeout }}" -n "{{ grafana_namespace }}" pod -l app="{{ grafana_k8s }}" - changed_when: false + block: + - name: Wait for grafana pod to come to ready state + ansible.builtin.command: kubectl wait --for=condition=ready --timeout="{{ grafana_pod_timeout }}" -n "{{ grafana_namespace }}" pod -l app="{{ grafana_k8s }}" # noqa: yaml[line-length] + changed_when: false + rescue: + - name: Failed - grafana pod is not running + ansible.builtin.fail: + msg: "{{ grafana_pod_wait_fail_msg }}" - name: Create grafana-plugins folder if not exists ansible.builtin.file: diff --git a/telemetry/roles/grafana/tasks/pre-requisites.yml b/telemetry/roles/grafana/tasks/pre-requisites.yml index 8e125e11b..69ccbd220 100644 --- a/telemetry/roles/grafana/tasks/pre-requisites.yml +++ b/telemetry/roles/grafana/tasks/pre-requisites.yml @@ -43,24 +43,6 @@ group: root owner: root -- name: Add kubernetes and grafana ansible-galaxy collection - ansible.builtin.command: ansible-galaxy collection install "{{ item }}" - with_items: "{{ collections_name }}" - changed_when: false - -- name: Install PyYAML using pip3 - ansible.builtin.pip: - name: PyYAML - state: present - executable: pip3 - extra_args: --ignore-installed - -- name: Install openshift using pip3 - ansible.builtin.pip: - name: openshift - state: present - executable: pip3 - - name: Create grafana namespace kubernetes.core.k8s: api_version: v1 diff --git a/telemetry/roles/grafana/vars/main.yml b/telemetry/roles/grafana/vars/main.yml index 457b10503..3df16c3d0 100644 --- a/telemetry/roles/grafana/vars/main.yml +++ b/telemetry/roles/grafana/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,10 +18,6 @@ grafana_conf_path: "/opt/omnia/.data/grafana_svc_details.ini" grafana_conf_dest: "/opt/omnia/.data/" conf_file_mode: "0644" -# Usage: pre-requisites.yml -collections_name: - - kubernetes.core:2.2.3 - - community.grafana:1.3.0 directory_mode: '0774' mount_dir_perm: '0775' @@ -51,3 +47,5 @@ plugins_name: grafana_plugins_folder_name: github-grafana-plugins/ grafana_plugins_github_repo: https://github.com/nsfcac/grafana-plugin.git grafana_plugins_stable_commit: "947da4f" +grafana_pod_wait_fail_msg: "Execution failed as the grafana pods did not start within the expected time. +Please re-run the playbook after verifying that the grafana pods are in running state by executing the command 'kubectl get pods -A.'" diff --git a/telemetry/roles/grafana_config/tasks/add_dashboards.yml b/telemetry/roles/grafana_config/tasks/add_dashboards.yml index 283536dc6..a0fbc7e19 100644 --- a/telemetry/roles/grafana_config/tasks/add_dashboards.yml +++ b/telemetry/roles/grafana_config/tasks/add_dashboards.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ grafana_password: "{{ grafana_password }}" title: "{{ telemetry_folder_name }}" state: present + use_proxy: false no_log: true when: idrac_telemetry_support @@ -32,6 +33,7 @@ folder: "{{ telemetry_folder_name }}" overwrite: true path: "{{ item }}" + use_proxy: false with_items: "{{ idrac_dashboards }}" no_log: true when: idrac_telemetry_support @@ -45,6 +47,7 @@ folder: "{{ telemetry_folder_name }}" overwrite: true path: "{{ item }}" + use_proxy: false with_items: "{{ slurm_dashboards }}" no_log: true when: idrac_telemetry_support and slurm_telemetry_support diff --git a/telemetry/roles/grafana_config/tasks/add_datasource.yml b/telemetry/roles/grafana_config/tasks/add_datasource.yml index 937ab7756..93928cdf0 100644 --- a/telemetry/roles/grafana_config/tasks/add_datasource.yml +++ b/telemetry/roles/grafana_config/tasks/add_datasource.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -48,6 +48,7 @@ ds_url: "{{ timescale_svc_ip.stdout }}:{{ timescale_svc_port.stdout }}" user: "{{ timescaledb_user }}" sslmode: "disable" + use_proxy: false additional_json_data: postgresVersion: 12+ timescaledb: true diff --git a/telemetry/roles/idrac_telemetry/tasks/idrac_telemetry_deployment.yml b/telemetry/roles/idrac_telemetry/tasks/idrac_telemetry_deployment.yml index ac3926e6a..dc0a588bc 100644 --- a/telemetry/roles/idrac_telemetry/tasks/idrac_telemetry_deployment.yml +++ b/telemetry/roles/idrac_telemetry/tasks/idrac_telemetry_deployment.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -100,10 +100,14 @@ value: mysqldb - name: MYSQL_HOST_PORT value: "{{ mysqldb_port1 }}" + - name: HTTP_PROXY + value: "{{ proxy[0].http_proxy | default('', true) }}" + - name: HTTPS_PROXY + value: "{{ proxy[0].https_proxy | default('', true) }}" command: - "/bin/sh" - "-c" - args: ["./cmd/idrac-telemetry-receiver.sh"] + args: ["./scripts/example/idrac-telemetry-receiver.sh"] - name: timescale-ingester image: golang:1.17 @@ -131,6 +135,10 @@ value: timescale - name: TIMESCALE_DB value: "{{ timescaledb_name }}" + - name: HTTP_PROXY + value: "{{ proxy[0].http_proxy | default('', true) }}" + - name: HTTPS_PROXY + value: "{{ proxy[0].https_proxy | default('', true) }}" command: - "/bin/sh" - "-c" diff --git a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry.yml b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry.yml index 92b4bc886..143bf2d8c 100644 --- a/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry.yml +++ b/telemetry/roles/idrac_telemetry/tasks/initiate_telemetry.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,48 +13,6 @@ # limitations under the License. 
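The HTTP_PROXY/HTTPS_PROXY environment entries added to the telemetry containers above read the first element of a proxy list. That list is defined in Omnia's input configuration, not in this hunk; a hypothetical shape that satisfies the lookups:

proxy:
  - http_proxy: "http://proxy.example.com:3128"   # hypothetical endpoint
    https_proxy: "http://proxy.example.com:3128"  # hypothetical endpoint

Because the lookups use default('', true), an unset or empty entry renders as an empty string, so the pods still start cleanly when no proxy is configured.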
--- -- name: Initialize variables - ansible.builtin.set_fact: - vault_filename: "{{ playbook_dir }}/../input/.telemetry_vault_key" - telemetry_config_file: "{{ playbook_dir }}/../input/telemetry_config.yml" - -- name: Check telemetry_config.yml file is encrypted - ansible.builtin.command: cat {{ telemetry_config_file }} - changed_when: false - register: config_content - no_log: true - -- name: Decrpyt telemetry_config.yml - ansible.builtin.command: >- - ansible-vault decrypt {{ telemetry_config_file }} - --vault-password-file {{ vault_filename }} - changed_when: false - when: "'$ANSIBLE_VAULT;' in config_content.stdout" - -- name: Include telemetry_config.yml - ansible.builtin.include_vars: "{{ telemetry_config_file }}" - no_log: true - -- name: Create ansible vault key - ansible.builtin.set_fact: - vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}" - when: "'$ANSIBLE_VAULT;' not in config_content.stdout" - -- name: Save vault key - ansible.builtin.lineinfile: - path: "{{ vault_filename }}" - line: "{{ vault_key }}" - mode: "{{ vault_file_perm }}" - owner: root - create: true - when: "'$ANSIBLE_VAULT;' not in config_content.stdout" - -- name: Encrypt telemetry_config.yml file - ansible.builtin.command: >- - ansible-vault encrypt {{ telemetry_config_file }} - --vault-password-file {{ vault_filename }} - changed_when: false - # Include and initialize variables - name: Initiate telemetry process if idrac_support is enabled when: idrac_telemetry_support is true @@ -117,7 +75,7 @@ - name: Enable telemetry collection on iDRACs ansible.builtin.command: >- - "{{ python_version_3_9 }}" ./ConfigurationScripts/EnableOrDisableAllTelemetryReports.py -ip "{{ item }}" + "{{ python_version }}" ./ConfigurationScripts/EnableOrDisableAllTelemetryReports.py -ip "{{ item }}" -u "{{ idrac_username }}" -p "{{ idrac_password }}" -s Enabled args: chdir: "{{ mount_location + idrac_telemetry_scripting_folder }}" @@ -133,9 +91,25 @@ - name: Add iDRAC details in mysqldb when: telemetry_idrac is defined and (telemetry_idrac | length > 0) block: + - name: Wait for idrac-telemetry pod to come to ready state + block: + - name: Wait for idrac-telemetry pod to come to ready state + ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ idrac_telemetry_k8s_name }}" + changed_when: false + rescue: + - name: Failed - idrac-telemetry pod is not running + ansible.builtin.fail: + msg: "{{ idrac_telemetry_pod_wait_fail_msg }}" + - name: Wait for mysqldb pod to come to ready state - ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ mysqldb_k8s_name }}" - changed_when: false + block: + - name: Wait for mysqldb pod to come to ready state + ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ mysqldb_k8s_name }}" + changed_when: false + rescue: + - name: Failed - mysqldb pod is not running + ansible.builtin.fail: + msg: "{{ mysqldb_pod_wait_fail_msg }}" - name: Get mysqlDB svc IP ansible.builtin.command: kubectl get svc "{{ mysqldb_k8s_name }}" -n "{{ namespace }}" -o=jsonpath='{.spec.clusterIP}' @@ -148,33 +122,59 @@ register: mysql_svc_port - name: Add iDRAC host in mysqlDB - community.mysql.mysql_query: - login_host: "{{ mysql_svc_ip.stdout }}" - login_port: "{{ mysql_svc_port.stdout }}" - login_user: "{{ mysqldb_user }}" - login_password: "{{ mysqldb_password }}" - login_db: "{{ mysqldb_name }}" - query: INSERT IGNORE INTO {{ mysqldb_name + 
'.services' }} (ip, serviceType, authType, auth) - VALUES (%s, %s, %s ,'{"password":"{{ idrac_password | quote }}","username":"{{ idrac_username | quote }}"}') - positional_args: - - "{{ item }}" - - "{{ service_type }}" - - "{{ auth_type }}" - with_items: "{{ telemetry_idrac }}" - no_log: true - + environment: + no_proxy: "{{ mysql_svc_ip.stdout }}" + block: + - name: Check if services table exists + community.mysql.mysql_query: + login_host: "{{ mysql_svc_ip.stdout }}" + login_port: "{{ mysql_svc_port.stdout }}" + login_user: "{{ mysqldb_user }}" + login_password: "{{ mysqldb_password }}" + query: "SHOW TABLES FROM {{ mysqldb_name }}" + register: services_table_exists + until: services_table_exists is not failed and services_table_exists.query_result[0] | length > 1 + retries: "{{ db_retries }}" + delay: "{{ db_delay }}" + no_log: true + + - name: Add iDRAC host in mysqlDB + community.mysql.mysql_query: + login_host: "{{ mysql_svc_ip.stdout }}" + login_port: "{{ mysql_svc_port.stdout }}" + login_user: "{{ mysqldb_user }}" + login_password: "{{ mysqldb_password }}" + login_db: "{{ mysqldb_name }}" + query: INSERT IGNORE INTO {{ mysqldb_name + '.services' }} (ip, serviceType, authType, auth) + VALUES (%s, %s, %s ,'{"password":"{{ idrac_password | quote }}","username":"{{ idrac_username | quote }}"}') + positional_args: + - "{{ item }}" + - "{{ service_type }}" + - "{{ auth_type }}" + with_items: "{{ telemetry_idrac }}" + register: add_idrac_to_db + until: add_idrac_to_db is not failed + retries: "{{ db_retries }}" + delay: "{{ db_delay }}" + no_log: true rescue: - name: Show failure msg ansible.builtin.fail: - msg: "Adding iDRAC credential details to mysqldb failed." + msg: "{{ mysqldb_insert_fail_msg }}" # Initiate iDRAC collection - name: Initiate telemetry collection when: telemetry_idrac is defined and (telemetry_idrac | length > 0) block: - name: Wait for idrac-telemetry pod to come to ready state - ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ idrac_telemetry_k8s_name }}" - changed_when: false + block: + - name: Wait for idrac-telemetry pod to come to ready state + ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ idrac_telemetry_k8s_name }}" + changed_when: false + rescue: + - name: Failed - idrac-telemetry pod is not running + ansible.builtin.fail: + msg: "{{ idrac_telemetry_pod_wait_fail_msg }}" - name: Get idrac-telemetry pod name ansible.builtin.command: kubectl get pods -n "{{ namespace }}" -l app="{{ idrac_telemetry_k8s_name }}" -o jsonpath="{.items[0].metadata.name}" diff --git a/upgrade/roles/user_messages/tasks/main.yml b/telemetry/roles/idrac_telemetry/tasks/install_check.yml similarity index 60% rename from upgrade/roles/user_messages/tasks/main.yml rename to telemetry/roles/idrac_telemetry/tasks/install_check.yml index 57db5cfbe..7c03287e0 100644 --- a/upgrade/roles/user_messages/tasks/main.yml +++ b/telemetry/roles/idrac_telemetry/tasks/install_check.yml @@ -13,10 +13,17 @@ # limitations under the License. 
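After the retried INSERT IGNORE above succeeds, a read-back query can confirm that the iDRAC rows actually landed. A hypothetical verification task (not part of the role), reusing the same connection variables:

- name: Confirm iDRAC rows exist in the services table (hypothetical check)
  community.mysql.mysql_query:
    login_host: "{{ mysql_svc_ip.stdout }}"
    login_port: "{{ mysql_svc_port.stdout }}"
    login_user: "{{ mysqldb_user }}"
    login_password: "{{ mysqldb_password }}"
    login_db: "{{ mysqldb_name }}"
    query: "SELECT ip, serviceType FROM services"
  register: services_rows
  no_log: true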
--- -- name: Set message facts +- name: Initialise install_idrac_telemetry variable ansible.builtin.set_fact: - user_msg_prepare_config: "{{ user_msg_prepare_config }}" - user_msg_prepare_upgrade: "{{ user_msg_prepare_upgrade }}" - user_msg_prepare_config2: "{{ user_msg_prepare_config2 }}" - user_msg_prepare_upgrade2: "{{ user_msg_prepare_upgrade2 }}" - user_msg_upgrade: "{{ user_msg_upgrade }}" + install_idrac_telemetry: false + +- name: Check idrac-telemetry pod status + ansible.builtin.command: "kubectl get pods -A" + register: pod_status + changed_when: false + failed_when: false + +- name: Set idrac-telemetry installation status to true + ansible.builtin.set_fact: + install_idrac_telemetry: true + when: "'idrac-telemetry' not in pod_status.stdout" diff --git a/telemetry/roles/idrac_telemetry/tasks/main.yml b/telemetry/roles/idrac_telemetry/tasks/main.yml index 2909f2fd7..2bc704a5a 100644 --- a/telemetry/roles/idrac_telemetry/tasks/main.yml +++ b/telemetry/roles/idrac_telemetry/tasks/main.yml @@ -14,16 +14,24 @@ --- - name: Check if telemetry entry is present in software_config.json + environment: + no_proxy: "{{ groups['idrac'] | join(',') }}" when: telemetry_entry_present block: - name: Deploy idrac-telemetry - when: idrac_telemetry_support is true + when: idrac_telemetry_support block: - - name: Deploy mysqldb pod for idrac credentials - ansible.builtin.include_tasks: mysqldb_deployment.yml + - name: Verify idrac telemetry pods are running + ansible.builtin.include_tasks: install_check.yml - - name: Deploy idrac_telemetry pods - ansible.builtin.include_tasks: idrac_telemetry_deployment.yml + - name: Deploy idrac-telemetry if pods are not present + when: install_idrac_telemetry + block: + - name: Deploy mysqldb pod for idrac credentials + ansible.builtin.include_tasks: mysqldb_deployment.yml + + - name: Deploy idrac_telemetry pods + ansible.builtin.include_tasks: idrac_telemetry_deployment.yml - name: Collect iDRAC IP from AWX and initiate telemetry collection ansible.builtin.include_tasks: initiate_telemetry.yml diff --git a/telemetry/roles/idrac_telemetry/vars/main.yml b/telemetry/roles/idrac_telemetry/vars/main.yml index a89c9430b..19d077588 100644 --- a/telemetry/roles/idrac_telemetry/vars/main.yml +++ b/telemetry/roles/idrac_telemetry/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,18 @@ # limitations under the License. --- -python_version_3_9: "python3.9" # Usage: initiate_telemetry.yml +python_version: "{{ ansible_python_interpreter }}" mysqldb_name: "idrac_telemetrysource_services_db" timescaledb_name: "telemetry_metrics" +mysqldb_insert_fail_msg: "Failed to add iDRAC credential details to the mysql database. +This could be due to the tables in the mysqldb not being accessible at the moment. Please try running the playbook again after some time." +idrac_telemetry_pod_wait_fail_msg: "Execution failed as the idrac-telemetry pods did not start within the expected time. +Please re-run the playbook after verifying that the idrac-telemetry pods are in running state by executing the command 'kubectl get pods -A.'" +mysqldb_pod_wait_fail_msg: "Execution failed as the mysqldb pods did not start within the expected time. 
+Please re-run the playbook after verifying that the mysqldb pods are in running state by executing the command 'kubectl get pods -A.'" +db_retries: 10 +db_delay: 10 # Usage: mysqldb_deployment.yml mysql_pv_name: mysqldb-storage @@ -34,3 +42,5 @@ activemq_http_port_1: 8161 activemq_http_port_2: 61616 messagebus_http_port: "61613" configui_http_port: "8082" + +add_idrac_fail_msg: "Execute the telemetry.yml playbook to ensure that the iDRAC pods are running before triggering the add_idrac_node.yml playbook." diff --git a/telemetry/roles/k8s_prometheus/tasks/download_images.yml b/telemetry/roles/k8s_prometheus/tasks/download_images.yml new file mode 100644 index 000000000..d1a5e20ac --- /dev/null +++ b/telemetry/roles/k8s_prometheus/tasks/download_images.yml @@ -0,0 +1,61 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if telemetry support is required + when: hostvars['127.0.0.1']['telemetry_entry_present'] + block: + - name: Check if k8s_prometheus_support is required + when: hostvars['127.0.0.1']['k8s_prometheus_support'] + block: + - name: Load telemetry.json file + ansible.builtin.set_fact: + telemetry_package_json: "{{ lookup('file', telemetry_packages_file) | from_json }}" + + - name: Find images and tags from JSON + ansible.builtin.set_fact: + prom_image_versions: >- + {{ telemetry_package_json.telemetry.cluster | selectattr('package', 'in', prom_image_names) | map(attribute='package') + | zip(telemetry_package_json.telemetry.cluster + | selectattr('package', 'in', prom_image_names) | map(attribute='tag')) | map('join', ':') | list }} + + - name: Pull Kube prometheus images + ansible.builtin.command: nerdctl pull {{ item }} + with_items: "{{ prom_image_versions }}" + changed_when: true + failed_when: false + environment: + http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" + https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['admin_nic_ip'] }}" + + - name: Check if prometheus_gaudi_support is required + when: hostvars['127.0.0.1']['prometheus_gaudi_support'] + block: + - name: Find images and tags from JSON in case of prometheus_gaudi_support + ansible.builtin.set_fact: + gaudi_exporter_image_versions: >- + {{ telemetry_package_json.telemetry.cluster | selectattr('package', 'in', gaudi_exporter_image_names) | map(attribute='package') + | zip(telemetry_package_json.telemetry.cluster + | selectattr('package', 'in', gaudi_exporter_image_names) | map(attribute='tag')) | map('join', ':') | list }} + + - name: Pull gaudi exporter images + ansible.builtin.command: nerdctl pull {{ item }} + with_items: "{{ gaudi_exporter_image_versions }}" + changed_when: true + failed_when: false + environment: + http_proxy: "{{ hostvars['localhost']['http_proxy'] }}" + https_proxy: "{{ hostvars['localhost']['https_proxy'] }}" + no_proxy: "{{ hostvars['localhost']['oim_hostname'] }},{{ hostvars['localhost']['admin_nic_ip'] }}" 
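The selectattr/zip/join expression in download_images.yml above pairs each matching package with its tag to build pullable image references. With a hypothetical telemetry.json fragment (tags are illustrative; real values come from input/config/.../telemetry.json):

{"telemetry": {"cluster": [
  {"package": "quay.io/prometheus/prometheus", "tag": "v2.53.0"},
  {"package": "quay.io/prometheus/node-exporter", "tag": "v1.8.1"}
]}}

prom_image_versions resolves to ["quay.io/prometheus/prometheus:v2.53.0", "quay.io/prometheus/node-exporter:v1.8.1"], which nerdctl then pulls one by one.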
diff --git a/telemetry/roles/k8s_prometheus/tasks/install_kube_prometheus.yml b/telemetry/roles/k8s_prometheus/tasks/install_kube_prometheus.yml new file mode 100644 index 000000000..c6b87a833 --- /dev/null +++ b/telemetry/roles/k8s_prometheus/tasks/install_kube_prometheus.yml @@ -0,0 +1,48 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Get K8s namespace + ansible.builtin.command: kubectl get ns + changed_when: false + register: k8s_ns + +- name: Get K8s pods + ansible.builtin.command: kubectl get pods --all-namespaces + changed_when: false + register: k8s_pods + +- name: Create monitoring namespace + ansible.builtin.command: + cmd: kubectl create namespace monitoring + changed_when: false + when: "'monitoring' not in k8s_ns.stdout" + +- name: Create directory for temp k8s files + ansible.builtin.file: + path: "{{ k8s_tmp_dir }}" + state: directory + recurse: true + +- name: Copy kube_prometheus_values.yml file + ansible.builtin.template: + src: "{{ kube_prometheus_values_file_source }}" + dest: "{{ kube_prometheus_values_file_dest }}" + mode: "{{ file_mode }}" + +- name: Install kube-prometheus stack + ansible.builtin.command: "helm install prometheus '{{ kube_prometheus_stack_repo }}' --namespace monitoring -f '{{ kube_prometheus_values_file_dest }}'" + changed_when: true + when: + - prometheus_pod_name not in k8s_pods.stdout diff --git a/telemetry/roles/k8s_prometheus/tasks/main.yml b/telemetry/roles/k8s_prometheus/tasks/main.yml new file mode 100644 index 000000000..7ec835917 --- /dev/null +++ b/telemetry/roles/k8s_prometheus/tasks/main.yml @@ -0,0 +1,21 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
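install_kube_prometheus.yml above guards the helm install with a simple pod-name check rather than querying Helm itself. If a post-install confirmation is wanted, the release state can be checked directly; a minimal follow-up sketch (not part of the role), using the release name and namespace from the task above:

- name: Check kube-prometheus-stack release status (illustrative only)
  ansible.builtin.command: helm status prometheus --namespace monitoring
  changed_when: false
  register: prometheus_release_status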
+--- + +- name: Check if telemetry support is required + when: hostvars['127.0.0.1']['telemetry_entry_present'] + block: + - name: Install kube prometheus stack + ansible.builtin.include_tasks: install_kube_prometheus.yml + when: hostvars['localhost']['k8s_prometheus_support'] diff --git a/telemetry/roles/k8s_prometheus/templates/kube_prometheus_values.yml.j2 b/telemetry/roles/k8s_prometheus/templates/kube_prometheus_values.yml.j2 new file mode 100644 index 000000000..0024c8302 --- /dev/null +++ b/telemetry/roles/k8s_prometheus/templates/kube_prometheus_values.yml.j2 @@ -0,0 +1,16 @@ +# Prometheus configuration +prometheus: + prometheusSpec: + scrapeInterval: "{{ hostvars['localhost']['prometheus_scrape_interval'] }}s" + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: "{{ prometheus.storage }}" + storageClassName: "{{ prometheus.storageClassName }}" + +# Grafana Configuration (Grafana is disabled here) +grafana: + enabled: false diff --git a/telemetry/roles/k8s_prometheus/vars/main.yml b/telemetry/roles/k8s_prometheus/vars/main.yml new file mode 100644 index 000000000..0874cb85e --- /dev/null +++ b/telemetry/roles/k8s_prometheus/vars/main.yml @@ -0,0 +1,38 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
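For reference, with a hypothetical prometheus_scrape_interval of 15 and the storage defaults declared in the vars file that follows (50Gi on the nfs-client storage class), kube_prometheus_values.yml.j2 renders to:

prometheus:
  prometheusSpec:
    scrapeInterval: "15s"
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: "50Gi"
          storageClassName: "nfs-client"
grafana:
  enabled: false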
+--- + +# Usage: download_images.yml +telemetry_packages_file: "{{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/telemetry.json" +prom_image_names: + - quay.io/prometheus-operator/prometheus-operator + - registry.k8s.io/kube-state-metrics/kube-state-metrics + - quay.io/prometheus-operator/prometheus-config-reloader + - quay.io/prometheus/alertmanager + - quay.io/prometheus/node-exporter + - quay.io/prometheus/prometheus + - registry.k8s.io/ingress-nginx/kube-webhook-certgen +gaudi_exporter_image_names: + - vault.habana.ai/gaudi-metric-exporter/metric-exporter + +# Usage: install_kube_prometheus.yml +kube_prometheus_stack_repo: "{{ hostvars['localhost']['offline_tarball_path'] }}/kube-prometheus-stack-62.3.0.tar.gz" +k8s_tmp_dir: "/root/k8s" +kube_prometheus_values_file_source: "{{ role_path }}/templates/kube_prometheus_values.yml.j2" +kube_prometheus_values_file_dest: "{{ k8s_tmp_dir }}/k8s_prometheus_values.yml" +file_mode: "0655" +prometheus: + storage: 50Gi + storageClassName: nfs-client +prometheus_pod_name: "prometheus-prometheus-kube-prometheus-prometheus" diff --git a/telemetry/roles/loki/files/loki_dashboard.json b/telemetry/roles/loki/files/loki_dashboard.json index 8c234a02a..00d983bed 100644 --- a/telemetry/roles/loki/files/loki_dashboard.json +++ b/telemetry/roles/loki/files/loki_dashboard.json @@ -25,7 +25,7 @@ "links": [], "panels": [ { - "datasource": "control-plane-loki", + "datasource": "oim-node-loki", "gridPos": { "h": 8, "w": 24, @@ -48,7 +48,7 @@ { "datasource": { "type": "loki", - "uid": "control-plane-loki" + "uid": "oim-node-loki" }, "expr": "{filename=\"/var/log/omnia.log\",job=\"Omnia logs\"}", "queryType": "randomWalk", @@ -59,7 +59,7 @@ "type": "logs" }, { - "datasource": "control-plane-loki", + "datasource": "oim-node-loki", "gridPos": { "h": 8, "w": 24, @@ -82,7 +82,7 @@ { "datasource": { "type": "loki", - "uid": "control-plane-loki" + "uid": "oim-node-loki" }, "expr": "{filename=\"/var/log/messages\",job=\"syslog\"}", "queryType": "randomWalk", diff --git a/telemetry/roles/loki/files/promtail_config.yml b/telemetry/roles/loki/files/promtail_config.yml index 6d71551ef..6a04e6c40 100644 --- a/telemetry/roles/loki/files/promtail_config.yml +++ b/telemetry/roles/loki/files/promtail_config.yml @@ -161,13 +161,29 @@ scrape_configs: job: accelerator logs __path__: /var/log/omnia/accelerator.log -- job_name: monitor logs +- job_name: discovery logs static_configs: - targets: - localhost labels: - job: monitor logs - __path__: /var/log/omnia/monitor.log + job: discovery logs + __path__: /var/log/omnia/discovery.log + +- job_name: local_repo log + static_configs: + - targets: + - localhost + labels: + job: local_repo log + __path__: /var/log/omnia/local_repo.log + +- job_name: kubespray_telemetry logs + static_configs: + - targets: + - localhost + labels: + job: kubespray_telemetry logs + __path__: /var/log/omnia/kubespray_omnia.log - job_name: network logs static_configs: @@ -185,6 +201,14 @@ scrape_configs: job: platforms logs __path__: /var/log/omnia/platforms.log +- job_name: prepare_cp log + static_configs: + - targets: + - localhost + labels: + job: prepare_cp log + __path__: /var/log/omnia/prepare_cp.log + - job_name: provision logs static_configs: - targets: @@ -217,6 +241,14 @@ scrape_configs: job: storage logs __path__: /var/log/omnia/storage.log +- job_name: server_spec_update logs + static_configs: + - targets: + - localhost + labels: + job: server_spec_update logs + __path__:
/var/log/omnia/server_spec_update.log + - job_name: telemetry logs static_configs: - targets: @@ -225,6 +257,22 @@ scrape_configs: job: telemetry logs __path__: /var/log/omnia/telemetry.log +- job_name: tools logs + static_configs: + - targets: + - localhost + labels: + job: tools logs + __path__: /var/log/omnia/tools.log + +- job_name: upgrade log + static_configs: + - targets: + - localhost + labels: + job: upgrade log + __path__: /var/log/omnia/upgrade.log + - job_name: utils logs static_configs: - targets: @@ -241,6 +289,30 @@ scrape_configs: job: utils cluster logs __path__: /var/log/omnia/utils_cluster.log +- job_name: ip_rule_assignment logs + static_configs: + - targets: + - localhost + labels: + job: ip_rule_assignment logs + __path__: /var/log/omnia/ip_rule_assignment.log + +- job_name: performance_profile logs + static_configs: + - targets: + - localhost + labels: + job: performance_profile logs + __path__: /var/log/omnia/performance_profile.log + +- job_name: software_update logs + static_configs: + - targets: + - localhost + labels: + job: software_update logs + __path__: /var/log/omnia/software_update.log + - job_name: cluster deployment logs static_configs: - targets: @@ -256,3 +328,11 @@ scrape_configs: labels: job: compute log messages __path__: /var/log/xcat/computes.log* + +- job_name: benchmarks log messages + static_configs: + - targets: + - localhost + labels: + job: benchmarks log messages + __path__: /var/log/omnia/benchmarks.log diff --git a/telemetry/roles/loki/tasks/configure_docker_proxy.yml b/telemetry/roles/loki/tasks/configure_docker_proxy.yml new file mode 100644 index 000000000..deda42816 --- /dev/null +++ b/telemetry/roles/loki/tasks/configure_docker_proxy.yml @@ -0,0 +1,52 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
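The promtail jobs added above expose the new Omnia logs as labelled streams in Loki. Following the selector pattern already used by the panels in loki_dashboard.json, they can be queried from Grafana Explore with label matchers such as (illustrative):

    {filename="/var/log/omnia/upgrade.log", job="upgrade log"}
    {filename="/var/log/omnia/benchmarks.log", job="benchmarks log messages"}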
+--- + +- name: Create docker service directory + ansible.builtin.file: + path: "{{ docker_service_dest }}" + state: directory + mode: "{{ dir_mode }}" + +- name: Copy http-proxy.conf to docker service directory + ansible.builtin.template: + src: "{{ docker_http_proxy_conf_src }}" + dest: "{{ docker_service_dest }}/http-proxy.conf" + mode: "{{ file_mode }}" + +- name: Create .docker directory if it doesn't exist + ansible.builtin.file: + path: "{{ docker_auth_folder }}" + state: directory + mode: "{{ docker_dir_mode }}" + +- name: Copy docker config.json + ansible.builtin.template: + src: "{{ docker_config_src }}" + dest: "{{ docker_config_dest }}" + mode: "{{ docker_file_mode }}" + +- name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + +- name: Restart docker service + ansible.builtin.service: + name: docker + state: restarted + enabled: true + register: docker_result + until: docker_result is succeeded + retries: "{{ package_retry }}" + delay: "{{ delay_time }}" diff --git a/telemetry/roles/loki/tasks/configure_loki_grafana.yml b/telemetry/roles/loki/tasks/configure_loki_grafana.yml index 3bd69bea4..fa988abdb 100644 --- a/telemetry/roles/loki/tasks/configure_loki_grafana.yml +++ b/telemetry/roles/loki/tasks/configure_loki_grafana.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,9 +13,25 @@ # limitations under the License. --- +- name: Wait for grafana pod to come to ready state + block: + - name: Wait for grafana pod to come to ready state + ansible.builtin.command: kubectl wait --for=condition=ready --timeout="{{ pod_timeout }}" -n "{{ grafana_namespace }}" pod -l app="{{ grafana_k8s_name }}" + changed_when: false + rescue: + - name: Failed - grafana pod is not running + ansible.builtin.fail: + msg: "{{ grafana_pod_wait_fail_msg }}" + - name: Wait for loki pod to come to ready state - ansible.builtin.command: kubectl wait --for=condition=ready --timeout={{ pod_timeout }} -n "{{ grafana_namespace }}" pod -l app="{{ loki_k8s_name }}" - changed_when: false + block: + - name: Wait for loki pod to come to ready state + ansible.builtin.command: kubectl wait --for=condition=ready --timeout={{ pod_timeout }} -n "{{ grafana_namespace }}" pod -l app="{{ loki_k8s_name }}" + changed_when: false + rescue: + - name: Failed - loki pod is not running + ansible.builtin.fail: + msg: "{{ loki_pod_wait_fail_msg }}" - name: Get loki service ip ansible.builtin.command: kubectl get svc loki -n {{ grafana_namespace }} -o=jsonpath='{.spec.clusterIP}' @@ -27,16 +43,36 @@ changed_when: false register: loki_svc_port +- name: Verify grafana url is reachable + environment: + no_proxy: "{{ grafana_svc_ip.stdout }}" + block: + - name: Verify grafana url is reachable + ansible.builtin.uri: + url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}" + register: grafana_url_status + until: grafana_url_status.status == 200 + retries: "{{ url_retries }}" + failed_when: false + + - name: Unreachable grafana url + ansible.builtin.fail: + msg: "{{ grafana_url_unreachable_fail_msg }}" + when: grafana_url_status.status != 200 + - name: Create loki datasource in grafana + environment: + no_proxy: "{{ grafana_svc_ip.stdout }}" block: - name: Create loki datasource in grafana community.grafana.grafana_datasource: - name: control-plane-loki + 
name: oim-node-loki grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}" grafana_user: "{{ grafana_username }}" grafana_password: "{{ grafana_password }}" ds_type: loki ds_url: "http://{{ loki_svc_ip.stdout }}:{{ loki_svc_port.stdout }}" + use_proxy: false no_log: true register: create_loki_datasource rescue: @@ -45,6 +81,8 @@ msg: "Error: {{ create_loki_datasource.msg }}" - name: Import loki dashboard in grafana + environment: + no_proxy: "{{ grafana_svc_ip.stdout }}" block: - name: Import loki dashboard in grafana community.grafana.grafana_dashboard: @@ -55,6 +93,7 @@ commit_message: Updated by ansible overwrite: true path: "{{ role_path }}/files/loki_dashboard.json" + use_proxy: false no_log: true register: import_loki_dashboard rescue: diff --git a/telemetry/roles/loki/tasks/docker_login.yml b/telemetry/roles/loki/tasks/docker_login.yml new file mode 100644 index 000000000..fc448b76b --- /dev/null +++ b/telemetry/roles/loki/tasks/docker_login.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Docker login + ansible.builtin.command: nerdctl login -u {{ docker_username }} -p {{ docker_password }} + changed_when: true + register: docker_login_output + retries: "{{ image_retries }}" + delay: "{{ delay_time }}" + until: docker_login_output.rc == 0 + failed_when: false + no_log: true + +- name: Docker login check + ansible.builtin.fail: + msg: "{{ docker_login_fail_msg }} Error: {{ docker_login_output.stderr }}" + when: docker_login_output.rc != 0 diff --git a/telemetry/roles/loki/tasks/enable_buildkit.yml b/telemetry/roles/loki/tasks/enable_buildkit.yml index b0064875c..e42abda28 100644 --- a/telemetry/roles/loki/tasks/enable_buildkit.yml +++ b/telemetry/roles/loki/tasks/enable_buildkit.yml @@ -12,44 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
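As a post-configuration sanity check, the standard Grafana HTTP API can confirm the renamed datasource was registered. A minimal sketch reusing the service facts gathered above (this task is not part of the change itself):

    - name: Verify oim-node-loki datasource registration (sketch)
      ansible.builtin.uri:
        url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}/api/datasources/name/oim-node-loki"
        user: "{{ grafana_username }}"
        password: "{{ grafana_password }}"
        force_basic_auth: true
        use_proxy: false
      environment:
        no_proxy: "{{ grafana_svc_ip.stdout }}"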
--- -- name: Execute make install - ansible.builtin.shell: - chdir: "{{ buildkit_dir }}" - cmd: "make && make install" - register: images_output - until: images_output is success - retries: "{{ image_retries }}" - delay: "{{ delay_loki }}" - changed_when: false -- name: Execute make images - ansible.builtin.shell: - chdir: "{{ buildkit_dir }}" - cmd: "make && make install" - register: install_output - until: install_output is success - retries: "{{ image_retries }}" - delay: "{{ delay_loki }}" +- name: Check if buildkitd container exists + ansible.builtin.command: nerdctl ps -f name=buildkitd + register: buildkit_container_check changed_when: false + failed_when: false -- name: Get container ID for buildkitd - ansible.builtin.shell: > - set -o pipefail - && nerdctl ps -q -a -f name=buildkitd | head -n 1 - register: buildkit_container_id - changed_when: false +- name: Enable buildkit + when: "'buildkit' not in buildkit_container_check.stdout" + environment: + http_proxy: "{{ proxy[0].http_proxy | default('', true) }}" + https_proxy: "{{ proxy[0].https_proxy | default('', true) }}" + HTTP_PROXY: "{{ proxy[0].http_proxy | default('', true) }}" + HTTPS_PROXY: "{{ proxy[0].https_proxy | default('', true) }}" + block: + - name: Execute make install + ansible.builtin.shell: + chdir: "{{ buildkit_dir }}" + cmd: "make && make install" + register: install_output + until: install_output is success + retries: "{{ image_retries }}" + delay: "{{ delay_time }}" + changed_when: false -- name: Stop buildkitd container - ansible.builtin.command: nerdctl stop {{ buildkit_container_id.stdout }} - when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' - changed_when: false + - name: Execute make images + ansible.builtin.shell: + chdir: "{{ buildkit_dir }}" + cmd: "make images" + register: images_output + until: images_output is success + retries: "{{ image_retries }}" + delay: "{{ delay_time }}" + changed_when: false -- name: Remove buildkitd container - ansible.builtin.command: nerdctl rm {{ buildkit_container_id.stdout }} - when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' - changed_when: false + - name: Get container ID for buildkitd + ansible.builtin.shell: > + set -o pipefail + && nerdctl ps -q -a -f name=buildkitd | head -n 1 + register: buildkit_container_id + changed_when: false + + - name: Stop buildkitd container + ansible.builtin.command: nerdctl stop {{ buildkit_container_id.stdout }} + when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' + changed_when: false + + - name: Remove buildkitd container + ansible.builtin.command: nerdctl rm {{ buildkit_container_id.stdout }} + when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' + changed_when: false -- name: Run BuildKit container - ansible.builtin.command: > - nerdctl run -d --name buildkitd --privileged moby/buildkit:latest - changed_when: true + - name: Run BuildKit container + ansible.builtin.command: > + nerdctl run -d -e http_proxy -e HTTP_PROXY -e https_proxy -e HTTPS_PROXY -e no_proxy --name buildkitd --privileged moby/buildkit:latest + changed_when: true diff --git a/upgrade/roles/uninstall_k8s_cluster/tasks/main.yml b/telemetry/roles/loki/tasks/install_check.yml similarity index 66% rename from upgrade/roles/uninstall_k8s_cluster/tasks/main.yml rename to telemetry/roles/loki/tasks/install_check.yml index 8119e2bc8..8bac03b27 100644 --- a/upgrade/roles/uninstall_k8s_cluster/tasks/main.yml +++
b/telemetry/roles/loki/tasks/install_check.yml @@ -13,13 +13,16 @@ # limitations under the License. --- -- name: Fetching cluster os +- name: Initialise install_loki variable ansible.builtin.set_fact: - cluster_os: "{{ ansible_distribution | lower }}" + install_loki: false -- name: Validate the kubernetes on cluster - ansible.builtin.include_tasks: validation.yml +- name: Check loki pod status + ansible.builtin.command: "kubectl get pods -A" + register: pod_status + changed_when: false -- name: Uninstall K8s on cluster - ansible.builtin.include_tasks: uninstall_k8s.yml - when: k8s_installation_status | default(false) | bool +- name: Set loki installation status to true + ansible.builtin.set_fact: + install_loki: true + when: "'loki' not in pod_status.stdout" diff --git a/telemetry/roles/loki/tasks/main.yml b/telemetry/roles/loki/tasks/main.yml index 93bdda395..15a0dbdd6 100644 --- a/telemetry/roles/loki/tasks/main.yml +++ b/telemetry/roles/loki/tasks/main.yml @@ -23,17 +23,23 @@ ansible.builtin.include_vars: file: "{{ local_repo_config_path }}" - - name: Run prerequisite - ansible.builtin.include_tasks: prereq_{{ ansible_distribution | lower }}.yml + - name: Verify loki pods are running + ansible.builtin.include_tasks: install_check.yml - - name: Enable buildkit - ansible.builtin.include_tasks: enable_buildkit.yml + - name: Deploy loki if pods are not present + when: install_loki + block: + - name: Run prerequisite + ansible.builtin.include_tasks: prereq_{{ ansible_distribution | lower }}.yml - - name: Build promtail image - ansible.builtin.import_tasks: promtail_image.yml + - name: Enable buildkit + ansible.builtin.include_tasks: enable_buildkit.yml - - name: Deploy loki pod - ansible.builtin.import_tasks: k8s_loki_pod.yml + - name: Build promtail image + ansible.builtin.import_tasks: promtail_image.yml + + - name: Deploy loki pod + ansible.builtin.import_tasks: k8s_loki_pod.yml - name: Deploy loki service ansible.builtin.import_tasks: loki_service.yml diff --git a/telemetry/roles/loki/tasks/prereq_redhat.yml b/telemetry/roles/loki/tasks/prereq_redhat.yml index 8261c0d31..b1c9addf9 100644 --- a/telemetry/roles/loki/tasks/prereq_redhat.yml +++ b/telemetry/roles/loki/tasks/prereq_redhat.yml @@ -28,4 +28,12 @@ register: docker_result until: docker_result is succeeded retries: "{{ package_retry }}" - delay: "{{ delay_loki }}" + delay: "{{ delay_time }}" + +- name: Configure proxy environment variables for docker + ansible.builtin.include_tasks: configure_docker_proxy.yml + when: proxy_status + +- name: Login to docker when credentials are given + ansible.builtin.include_tasks: docker_login.yml + when: docker_login diff --git a/telemetry/roles/loki/tasks/prereq_ubuntu.yml b/telemetry/roles/loki/tasks/prereq_ubuntu.yml index bdc332b68..090e34fd2 100644 --- a/telemetry/roles/loki/tasks/prereq_ubuntu.yml +++ b/telemetry/roles/loki/tasks/prereq_ubuntu.yml @@ -31,9 +31,19 @@ ansible.builtin.set_fact: os_release: "{{ ansible_distribution_release }}" -- name: Clean apt cache - ansible.builtin.apt: - autoclean: true +- name: Try cleaning apt cache in Ubuntu + block: + - name: Clean apt cache + ansible.builtin.apt: + autoclean: true + register: clean_apt_cache + until: clean_apt_cache is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" + rescue: + - name: Failed to clean apt cache + ansible.builtin.fail: + msg: "{{ clean_apt_cache_fail_msg }}" - name: Configure Omnia Repositories ansible.builtin.template: @@ -48,9 +58,19 @@ mode: "{{ file_mode }}" changed_when: false -- name: 
Update apt cache - ansible.builtin.apt: - update_cache: true +- name: Try updating repos in Ubuntu + block: + - name: Update apt cache + ansible.builtin.apt: + update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" + rescue: + - name: Failed to update repos + ansible.builtin.fail: + msg: "{{ docker_update_repos_fail_msg }}" - name: Get epoch number of docker-ce/docker-ce-cli ansible.builtin.shell: "set -o pipefail && apt-cache show docker-ce | grep 'Version: 5:24.0.4' | awk '{print $2}'" @@ -74,4 +94,12 @@ register: docker_result until: docker_result is succeeded retries: "{{ package_retry }}" - delay: "{{ delay_loki }}" + delay: "{{ delay_time }}" + +- name: Configure proxy environment variables for docker + ansible.builtin.include_tasks: configure_docker_proxy.yml + when: proxy_status + +- name: Login to docker when credentials are given + ansible.builtin.include_tasks: docker_login.yml + when: docker_login diff --git a/telemetry/roles/loki/templates/docker_http_proxy_conf.j2 b/telemetry/roles/loki/templates/docker_http_proxy_conf.j2 new file mode 100644 index 000000000..0e5392d68 --- /dev/null +++ b/telemetry/roles/loki/templates/docker_http_proxy_conf.j2 @@ -0,0 +1,4 @@ +[Service] +Environment="HTTP_PROXY={{ proxy[0].http_proxy | default('', true) }}" +Environment="HTTPS_PROXY={{ proxy[0].https_proxy | default('', true) }}" +Environment="NO_PROXY=localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }}{% if no_proxy_input_status %},{{ proxy[0].no_proxy }}{% endif %}" diff --git a/telemetry/roles/loki/templates/docker_json.j2 b/telemetry/roles/loki/templates/docker_json.j2 new file mode 100644 index 000000000..cb28fec4c --- /dev/null +++ b/telemetry/roles/loki/templates/docker_json.j2 @@ -0,0 +1,9 @@ +{ + "proxies": { + "default": { + "httpProxy": "{{ proxy[0].http_proxy | default('', true) }}", + "httpsProxy": "{{ proxy[0].https_proxy | default('', true) }}", + "noProxy": "localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }}{% if no_proxy_input_status %},{{ proxy[0].no_proxy }}{% endif %}" + } + } +} diff --git a/telemetry/roles/loki/vars/main.yml b/telemetry/roles/loki/vars/main.yml index 57ac54413..8b43a3e95 100644 --- a/telemetry/roles/loki/vars/main.yml +++ b/telemetry/roles/loki/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,10 +28,18 @@ package_retry: 10 # Usage: configure_loki_grafana pod_timeout: 30m grafana_namespace: grafana +grafana_k8s_name: grafana +url_retries: 10 +grafana_url_unreachable_fail_msg: "Failed. grafana url http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }} is not reachable. +Please retry after the grafana url is reachable. If facing this issue again, please clean up and re-run the playbook" +grafana_pod_wait_fail_msg: "Execution failed as the grafana pods did not start within the expected time. +Please re-run the playbook after verifying that the grafana pods are in running state by executing the command 'kubectl get pods -A'." +loki_pod_wait_fail_msg: "Execution failed as the loki pods did not start within the expected time.
+Please re-run the playbook after verifying that the loki pods are in running state by executing the command 'kubectl get pods -A'." # Usage enable_buildkit.yml image_retries: 5 -delay_loki: 10 +delay_time: 10 local_repo_config_path: "{{ role_path }}/../../../input/local_repo_config.yml" buildkit_dir: "{{ repo_store_path }}/cluster/git/buildkit" @@ -55,3 +63,21 @@ cert_packages: - ca-certificates gpg_path: /etc/apt/keyrings/docker.asc docker_gpg_url: https://download.docker.com/linux/ubuntu/gpg +repo_retries: 5 +repo_delay: 10 +docker_update_repos_fail_msg: "Failed to update the docker repositories. Please ensure that the docker repositories are accessible +from the Omnia Infrastructure Manager and re-run the playbook." +clean_apt_cache_fail_msg: "Failed to clean the apt cache. Please ensure there are no lock files present and try running the playbook again." + +# Usage: configure_docker_proxy.yml +docker_file_mode: "0600" +docker_config_src: "{{ role_path }}/templates/docker_json.j2" +docker_config_dest: "/root/.docker/config.json" +docker_service_dest: "/etc/systemd/system/docker.service.d" +docker_http_proxy_conf_src: "{{ role_path }}/templates/docker_http_proxy_conf.j2" +docker_auth_folder: "/root/.docker/" +docker_dir_mode: "700" + +# Usage: docker_login.yml +docker_login_fail_msg: "Docker login failed. Please ensure the docker login credentials in the input/provision_config_credentials.yml are valid. +If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while." diff --git a/telemetry/roles/omnia_telemetry_acquisition/tasks/binary_transfer.yml b/telemetry/roles/omnia_telemetry_acquisition/tasks/binary_transfer.yml index 36092b1f5..d1ff31ce5 100644 --- a/telemetry/roles/omnia_telemetry_acquisition/tasks/binary_transfer.yml +++ b/telemetry/roles/omnia_telemetry_acquisition/tasks/binary_transfer.yml @@ -12,18 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License.
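For reference, given the configure_docker_proxy.yml variables above, the rendered systemd drop-in would look roughly like this (proxy endpoint, hostname, domain, and addresses are hypothetical, and http and https are assumed to share one proxy endpoint):

    [Service]
    Environment="HTTP_PROXY=http://proxy.example.com:3128"
    Environment="HTTPS_PROXY=http://proxy.example.com:3128"
    Environment="NO_PROXY=localhost,oim-node,*.omnia.test,10.5.255.254,198.168.0.1"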
--- - - name: Create telemetry directory on nodes ansible.builtin.file: path: "{{ omnia_telemetry_dest }}" state: directory mode: "{{ directory_permissions }}" +- name: Get MD5 checksum of local binary + ansible.builtin.stat: + path: "{{ binary_files_path }}" + checksum_algorithm: md5 + register: local_binary_stat + delegate_to: localhost + changed_when: false + +- name: Check if binary exists and get MD5 checksum on compute nodes + ansible.builtin.stat: + path: "{{ omnia_telemetry_dest }}/omnia_telemetry" + checksum_algorithm: md5 + register: remote_binary_stat + changed_when: false + +- name: Determine if binary needs to be transferred + ansible.builtin.set_fact: + transfer_required: >- + {{ + (remote_binary_stat.stat.exists and + local_binary_stat.stat.checksum != remote_binary_stat.stat.checksum) + or + (not remote_binary_stat.stat.exists) + }} + - name: Transfer all binaries to compute nodes ansible.builtin.copy: src: "{{ binary_files_path }}" dest: "{{ omnia_telemetry_dest }}" - force: false + force: true owner: root group: root mode: "{{ binary_mode }}" + when: transfer_required diff --git a/telemetry/roles/omnia_telemetry_prepare_cp/files/config.yml b/telemetry/roles/omnia_telemetry_prepare_oim/files/config.yml similarity index 91% rename from telemetry/roles/omnia_telemetry_prepare_cp/files/config.yml rename to telemetry/roles/omnia_telemetry_prepare_oim/files/config.yml index 1ba72fcfa..538c52ba0 100644 --- a/telemetry/roles/omnia_telemetry_prepare_cp/files/config.yml +++ b/telemetry/roles/omnia_telemetry_prepare_oim/files/config.yml @@ -13,9 +13,9 @@ # limitations under the License. --- -host: -port: -username: -password: -database: -gssencmode: +host: +port: +username: +password: +database: +gssencmode: diff --git a/telemetry/roles/omnia_telemetry_prepare_cp/files/encrypt_config.py b/telemetry/roles/omnia_telemetry_prepare_oim/files/encrypt_config.py similarity index 100% rename from telemetry/roles/omnia_telemetry_prepare_cp/files/encrypt_config.py rename to telemetry/roles/omnia_telemetry_prepare_oim/files/encrypt_config.py diff --git a/telemetry/roles/omnia_telemetry_prepare_cp/files/generate_key.py b/telemetry/roles/omnia_telemetry_prepare_oim/files/generate_key.py similarity index 100% rename from telemetry/roles/omnia_telemetry_prepare_cp/files/generate_key.py rename to telemetry/roles/omnia_telemetry_prepare_oim/files/generate_key.py diff --git a/telemetry/roles/omnia_telemetry_prepare_cp/files/omnia_telemetry_schema_creation.py b/telemetry/roles/omnia_telemetry_prepare_oim/files/omnia_telemetry_schema_creation.py similarity index 64% rename from telemetry/roles/omnia_telemetry_prepare_cp/files/omnia_telemetry_schema_creation.py rename to telemetry/roles/omnia_telemetry_prepare_oim/files/omnia_telemetry_schema_creation.py index 5679dad34..c720fdf08 100644 --- a/telemetry/roles/omnia_telemetry_prepare_cp/files/omnia_telemetry_schema_creation.py +++ b/telemetry/roles/omnia_telemetry_prepare_oim/files/omnia_telemetry_schema_creation.py @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
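The schema-creation script that follows is refactored to take argparse positional arguments (user, password, host, port, dbname). A sketch of how it might be driven from create_omnia_telemetry_schema.yml, using the python_version and db_schema_utility variables from this role's vars; the timescaledb_* variable names here are assumptions for illustration:

    - name: Create omnia telemetry schema (sketch)
      ansible.builtin.command: >-
        {{ python_version }} {{ db_schema_utility }}
        {{ timescaledb_user }} {{ timescaledb_password }}
        {{ timescaledb_host }} {{ timescaledb_port }} {{ timescaledb_name }}
      changed_when: false
      no_log: true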
@@ -21,27 +21,40 @@ ''' import sys +import argparse import psycopg2 -dbuser = sys.argv[1] -dbpwd = sys.argv[2] -dbhost = sys.argv[3] -dbport = sys.argv[4] -dbtelemetry = sys.argv[5] +def parse_arguments(): + parser = argparse.ArgumentParser(description="Create the omnia telemetry schema and tables in timescaledb.") + parser.add_argument("user", type=str, help="Username for the database") + parser.add_argument("password", type=str, help="Password for the database") + parser.add_argument("host", type=str, help="Hostname for the database") + parser.add_argument("port", type=str, help="Port number for the database") + parser.add_argument("dbname", type=str, help="Name of the database") + args = parser.parse_args() + return args + +def validate_inputs(value): + + if value.strip(): + return value + else: + raise ValueError("Value cannot be empty") def db_connect(): - ''' - This module creates Database Connection - ''' - conn = None - connection_string = f"postgres://{dbuser}:{dbpwd}@{dbhost}:{dbport}/{dbtelemetry}".format( - dbuser = dbuser, dbpwd = dbpwd, dbhost = dbhost, dbport = dbport, dbtelemetry = dbtelemetry) + """Creates a secure database connection.""" try: - conn = psycopg2.connect(connection_string) + conn = psycopg2.connect( + user=user, + password=password, + host=host, + port=port, + dbname=dbname + ) if conn is not None: conn.autocommit = True except Exception as ex: - sys.exit('Failed to connect to timescaledb') + sys.exit(f"Failed to connect to timescaledb: {ex}") return conn def db_schema(conn): @@ -80,6 +93,17 @@ def db_table(conn): cursor.execute(sql_query) cursor.close() +args = parse_arguments() + +try: + user = validate_inputs(args.user) + password = validate_inputs(args.password) + host = validate_inputs(args.host) + port = validate_inputs(args.port) + dbname = validate_inputs(args.dbname) +except Exception as ex: + sys.exit(f"Failed to parse arguments: {ex}") + def main(): ''' This module initiates db connection and creates table diff --git
a/telemetry/roles/omnia_telemetry_prepare_cp/tasks/create_omnia_telemetry_schema.yml b/telemetry/roles/omnia_telemetry_prepare_oim/tasks/create_omnia_telemetry_schema.yml similarity index 100% rename from telemetry/roles/omnia_telemetry_prepare_cp/tasks/create_omnia_telemetry_schema.yml rename to telemetry/roles/omnia_telemetry_prepare_oim/tasks/create_omnia_telemetry_schema.yml diff --git a/telemetry/roles/omnia_telemetry_prepare_cp/tasks/ini_file_update.yml b/telemetry/roles/omnia_telemetry_prepare_oim/tasks/ini_file_update.yml similarity index 100% rename from telemetry/roles/omnia_telemetry_prepare_cp/tasks/ini_file_update.yml rename to telemetry/roles/omnia_telemetry_prepare_oim/tasks/ini_file_update.yml diff --git a/telemetry/roles/omnia_telemetry_prepare_cp/tasks/main.yml b/telemetry/roles/omnia_telemetry_prepare_oim/tasks/main.yml similarity index 95% rename from telemetry/roles/omnia_telemetry_prepare_cp/tasks/main.yml rename to telemetry/roles/omnia_telemetry_prepare_oim/tasks/main.yml index aeb328bbf..888490a32 100644 --- a/telemetry/roles/omnia_telemetry_prepare_cp/tasks/main.yml +++ b/telemetry/roles/omnia_telemetry_prepare_oim/tasks/main.yml @@ -16,7 +16,7 @@ - name: Check if telemetry support is required when: telemetry_entry_present block: - - name: Enable telemetry on the control plane + - name: Enable telemetry on the Omnia Infrastructure Manager when: omnia_telemetry_support block: - name: Update init files for omnia telemetry diff --git a/telemetry/roles/omnia_telemetry_prepare_cp/vars/main.yml b/telemetry/roles/omnia_telemetry_prepare_oim/vars/main.yml similarity index 96% rename from telemetry/roles/omnia_telemetry_prepare_cp/vars/main.yml rename to telemetry/roles/omnia_telemetry_prepare_oim/vars/main.yml index f7bcb9462..f095e1b3d 100644 --- a/telemetry/roles/omnia_telemetry_prepare_cp/vars/main.yml +++ b/telemetry/roles/omnia_telemetry_prepare_oim/vars/main.yml @@ -34,4 +34,4 @@ config_encryption_utility: "{{ role_path }}/files/encrypt_config.py" # Usage: create_omnia_telemetry_schema.yml timescaledb_vars_filepath: "{{ role_path }}/../timescaledb/vars/main.yml" db_schema_utility: "{{ role_path }}/files/omnia_telemetry_schema_creation.py" -python_version: python3.9 +python_version: "{{ ansible_python_interpreter }}" diff --git a/telemetry/roles/orchestrator/tasks/container_runtime.yml b/telemetry/roles/orchestrator/tasks/container_runtime.yml index 9775e6fad..76de37ee8 100644 --- a/telemetry/roles/orchestrator/tasks/container_runtime.yml +++ b/telemetry/roles/orchestrator/tasks/container_runtime.yml @@ -40,7 +40,7 @@ owner: root mode: "{{ permission_644 }}" -# Containerd configuration required by K8s on Control plane installation with deploy_container_engine value as false +# Containerd configuration required by K8s on Omnia Infrastructure Manager installation with deploy_container_engine value as false - name: Containerd | Generate default base_runtime_spec register: ctr_oci_spec ansible.builtin.command: "{{ containerd_bin_dir }}/ctr oci spec" @@ -66,6 +66,25 @@ owner: "root" mode: "{{ permission_644 }}" +- name: Configure proxy environment variables for containerd + when: proxy_status + block: + - name: Create containerd service directory + ansible.builtin.file: + path: "{{ containerd_service_dest }}" + state: directory + mode: "{{ dir_mode }}" + + - name: Copy http-proxy.conf to containerd service directory + ansible.builtin.template: + src: "{{ containerd_http_proxy_conf_src }}" + dest: "{{ containerd_service_dest }}/http-proxy.conf" + mode: "{{ permission_644 
}}" + + - name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true + - name: Restart containerd ansible.builtin.systemd: name: containerd diff --git a/telemetry/roles/orchestrator/tasks/deploy_k8s.yml b/telemetry/roles/orchestrator/tasks/deploy_k8s.yml index 30194a489..7891518e5 100644 --- a/telemetry/roles/orchestrator/tasks/deploy_k8s.yml +++ b/telemetry/roles/orchestrator/tasks/deploy_k8s.yml @@ -1,4 +1,4 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright © 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. --- - - name: Create kubespray directory "{{ kubespray_path }}" ansible.builtin.file: path: "{{ kubespray_path }}" @@ -23,7 +22,7 @@ block: - name: Get kubespray git repo tarball ansible.builtin.get_url: - url: "{{ offline_git_path }}/kubespray.tar.gz" + url: "{{ offline_git_path }}/{{ telemetry_package_map['kubespray'] }}.tar.gz" dest: "{{ kubespray_path }}" mode: "{{ permission_644 }}" failed_when: false @@ -34,22 +33,35 @@ - name: Untar kubespray git repo ansible.builtin.unarchive: - src: "{{ kubespray_path }}/kubespray.tar.gz" + src: "{{ kubespray_path }}/{{ telemetry_package_map['kubespray'] }}.tar.gz" dest: "{{ kubespray_path }}" +- name: Add collections path in kubespray ansible.cfg + community.general.ini_file: + path: "{{ kubespray_cfg_path }}" + section: defaults + option: collections_path + value: "$VIRTUAL_ENV" + mode: "{{ permission_644 }}" + backup: true + - name: Create k8s_var from template ansible.builtin.template: src: k8s_var.yml.j2 dest: "{{ role_path }}/files/k8s_var.yml" mode: "{{ permission_644 }}" -- name: K8s installation on control plane (Wait 10 mins for kubernetes installation to complete) +- name: K8s installation on Omnia Infrastructure Manager (Wait 10 mins for kubernetes installation to complete) ansible.builtin.command: > - ansible-playbook {{ kubespray_path }}/kubespray/cluster.yml \ + ansible-playbook {{ kubespray_path }}/{{ telemetry_package_map['kubespray'] }}/cluster.yml \ -i {{ role_path }}/files/k8s_inv.ini \ --extra-vars "@{{ role_path }}/files/k8s_var.yml" args: - chdir: "{{ kubespray_path }}/kubespray" + chdir: "{{ kubespray_path }}/{{ telemetry_package_map['kubespray'] }}" + environment: + HTTP_PROXY: "{{ proxy[0].http_proxy | default('', true) }}" + HTTPS_PROXY: "{{ proxy[0].https_proxy | default('', true) }}" + NO_PROXY: "localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }},{{ hostvars['localhost']['pod_external_ip_range'] }},{{ hostvars['localhost']['k8s_service_addresses'] }},{{ hostvars['localhost']['k8s_pod_network_cidr'] }}" # noqa: yaml[line-length] register: k8s_install changed_when: k8s_install.changed failed_when: false @@ -60,13 +72,16 @@ register: kubectl_command_status failed_when: false +- name: Kubespray error log if kubernetes installation not successful + ansible.builtin.lineinfile: + path: "{{ kubespray_log }}" + line: "{{ k8s_install.stdout_lines }}" + state: present + create: true + mode: "{{ permission_644 }}" + when: kubectl_command_status.rc != 0 + - name: Fail if Kubernetes installation not successful ansible.builtin.fail: msg: "{{ fail_msg_kubespray }}" when: kubectl_command_status.rc != 0 - -- name: Cleanup "{{ kubespray_path }}" - ansible.builtin.command: "rm -rf {{ 
kubespray_path }}" - register: command_result - changed_when: command_result.changed - failed_when: false diff --git a/telemetry/roles/orchestrator/tasks/docker_login.yml b/telemetry/roles/orchestrator/tasks/docker_login.yml new file mode 100644 index 000000000..a567753ef --- /dev/null +++ b/telemetry/roles/orchestrator/tasks/docker_login.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Docker login + ansible.builtin.command: nerdctl login -u {{ docker_username }} -p {{ docker_password }} + changed_when: true + register: docker_login_output + retries: "{{ retry_count }}" + delay: "{{ min_delay }}" + until: docker_login_output.rc == 0 + failed_when: false + no_log: true + +- name: Docker login check + ansible.builtin.fail: + msg: "{{ docker_login_fail_msg }} Error: {{ docker_login_output.stderr }}" + when: docker_login_output.rc != 0 diff --git a/telemetry/roles/orchestrator/tasks/fetch_oim_details.yml b/telemetry/roles/orchestrator/tasks/fetch_oim_details.yml new file mode 100644 index 000000000..0bc9b592c --- /dev/null +++ b/telemetry/roles/orchestrator/tasks/fetch_oim_details.yml @@ -0,0 +1,35 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Gather all IP addresses + ansible.builtin.command: ip -4 addr show + register: ip_output + changed_when: false + +- name: Read Omnia Infrastructure Manager hostname + ansible.builtin.command: hostname + changed_when: false + register: hostname_output + +- name: Read Omnia Infrastructure Manager domain name + ansible.builtin.command: hostname -d + changed_when: false + register: domain_name_output + +- name: Set oim details + ansible.builtin.set_fact: + oim_hostname: "{{ hostname_output.stdout }}" + oim_domain_name: "{{ domain_name_output.stdout }}" + oim_ip_addresses: "{{ ip_output.stdout | regex_findall('inet\\s([0-9.]+)') }}" diff --git a/telemetry/roles/orchestrator/tasks/firewalld_config.yml b/telemetry/roles/orchestrator/tasks/firewalld_config.yml index e0e375b0d..b0d5dd1b3 100644 --- a/telemetry/roles/orchestrator/tasks/firewalld_config.yml +++ b/telemetry/roles/orchestrator/tasks/firewalld_config.yml @@ -20,10 +20,10 @@ - name: Firewall config for non ubuntu os when: ubuntu_os not in os_distribution block: - - name: Configure firewall on master nodes + - name: Open k8s ports in the Omnia Infrastructure Manager ansible.builtin.command: "firewall-cmd --permanent --add-port={{ item }}" changed_when: true - with_items: '{{ k8s_master_ports }}' + with_items: '{{ k8s_oim_ports }}' - name: Open calico UDP ports on the firewall ansible.builtin.command: "firewall-cmd --permanent --add-port={{ item }}/udp" diff --git a/telemetry/roles/orchestrator/tasks/main.yml b/telemetry/roles/orchestrator/tasks/main.yml index 227236c63..8bc161d16 100644 --- a/telemetry/roles/orchestrator/tasks/main.yml +++ b/telemetry/roles/orchestrator/tasks/main.yml @@ -13,19 +13,29 @@ # limitations under the License. --- -# Install kubernetes on control plane depending on the flag values of telemetry_entry_present and k8s_cp_installation_flag. +# Install kubernetes on Omnia Infrastructure Manager depending on the flag values of telemetry_entry_present and k8s_oim_installation_flag. 
- name: Check if telemetry feature need to be deployed when: telemetry_entry_present block: - name: Get kubernetes installation status ansible.builtin.include_tasks: precheck.yml - - name: Install K8s on Control plane - when: k8s_cp_installation_flag + - name: Fetch oim details + ansible.builtin.include_tasks: fetch_oim_details.yml + + - name: Fetch k8s package names + ansible.builtin.include_tasks: read_telemetry_packages.yml + + - name: Install K8s on Omnia Infrastructure Manager + when: k8s_oim_installation_flag block: - - name: Configure kubernetes on control plane if atleast one support value is true + - name: Configure kubernetes on Omnia Infrastructure Manager if at least one support value is true when: idrac_telemetry_support or omnia_telemetry_support or visualization_support block: + - name: Login to docker when credentials are given + ansible.builtin.include_tasks: docker_login.yml + when: docker_login + - name: Configure container runtime ansible.builtin.import_tasks: container_runtime.yml diff --git a/telemetry/roles/orchestrator/tasks/precheck.yml b/telemetry/roles/orchestrator/tasks/precheck.yml index 300d644f5..145377a41 100644 --- a/telemetry/roles/orchestrator/tasks/precheck.yml +++ b/telemetry/roles/orchestrator/tasks/precheck.yml @@ -15,16 +15,16 @@ - name: Set kubernetes install flag to false ansible.builtin.set_fact: - k8s_cp_installation_flag: false + k8s_oim_installation_flag: false -- name: Check if kubernetes on control plane is already installed +- name: Check if kubernetes on Omnia Infrastructure Manager is already installed ansible.builtin.command: kubectl get pod -A changed_when: false register: kubectl_command_status failed_when: false -# k8s_cp_installation_flag value true denotes kubernetes is not installed and need to be installed. +# k8s_oim_installation_flag value true denotes kubernetes is not installed and needs to be installed. - name: Set kubernetes install flag to true ansible.builtin.set_fact: - k8s_cp_installation_flag: true + k8s_oim_installation_flag: true when: kubectl_command_status.rc != 0 diff --git a/telemetry/roles/orchestrator/tasks/read_telemetry_packages.yml b/telemetry/roles/orchestrator/tasks/read_telemetry_packages.yml new file mode 100644 index 000000000..2104ad812 --- /dev/null +++ b/telemetry/roles/orchestrator/tasks/read_telemetry_packages.yml @@ -0,0 +1,40 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Read software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_file }}" + name: software_config + +- name: Load telemetry.json + ansible.builtin.set_fact: + telemetry_json: "{{ lookup('file', telemetry_json_file) | from_json }}" + +# This task creates a mapping of package names and their corresponding versions. +# The mapping is used in the k8s_var.yml.j2 template. +# The mapping is in the format: +# { +# "package_name": "package_name-version", +# ...
+# } + +- name: Create a mapping of package name and version + ansible.builtin.set_fact: + telemetry_package_map: "{{ telemetry_package_map | default({}) | combine({ ((item.package).rsplit('-', 1)[0]): item.package}) }}" + loop: "{{ telemetry_json['telemetry']['cluster'] }}" + when: (item.type == "tarball" or item.type == "git") + +- name: Extract k8s version + ansible.builtin.set_fact: + k8s_version: "{{ (telemetry_package_map['kubectl']).rsplit('-', 1)[1] | default('1.26.12') }}" diff --git a/telemetry/roles/orchestrator/templates/containerd_http_proxy_conf.j2 b/telemetry/roles/orchestrator/templates/containerd_http_proxy_conf.j2 new file mode 100644 index 000000000..9a7a402d4 --- /dev/null +++ b/telemetry/roles/orchestrator/templates/containerd_http_proxy_conf.j2 @@ -0,0 +1,4 @@ +[Service] +Environment="HTTP_PROXY={{ proxy[0].http_proxy | default('', true) }}" +Environment="HTTPS_PROXY={{ proxy[0].https_proxy | default('', true) }}" +Environment="NO_PROXY=localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }},{{ pod_external_ip_range }},{{ k8s_service_addresses }},{{ k8s_pod_network_cidr }}{% if no_proxy_input_status %},{{ proxy[0].no_proxy }}{% endif %}" diff --git a/telemetry/roles/orchestrator/templates/k8s_var.yml.j2 b/telemetry/roles/orchestrator/templates/k8s_var.yml.j2 index 5193a4535..458e373c0 100644 --- a/telemetry/roles/orchestrator/templates/k8s_var.yml.j2 +++ b/telemetry/roles/orchestrator/templates/k8s_var.yml.j2 @@ -1,12 +1,13 @@ -kube_version: "v1.26.12" +kube_version: "v{{ k8s_version }}" deploy_container_engine: false -skip_http_proxy_on_os_packages: true +skip_http_proxy_on_os_packages: {{ not proxy_status }} dashboard_enabled: false helm_enabled: true kube_network_plugin: "{{ hostvars['localhost']['k8s_cni'] }}" kube_service_addresses: "{{ hostvars['localhost']['k8s_service_addresses'] }}" kube_pods_subnet: "{{ hostvars['localhost']['k8s_pod_network_cidr'] }}" metallb_enabled: true +metallb_namespace: "metallb-system" metallb_speaker_enabled: true kube_proxy_strict_arp: true kube_proxy_mode: 'iptables' @@ -22,16 +23,22 @@ override_system_hostname: false populate_inventory_to_hosts_file: false enable_nodelocaldns: false unsafe_show_logs: true +preinstall_selinux_state: disabled kube_image_repo: "registry.k8s.io" docker_image_repo: "docker.io" quay_image_repo: "quay.io" -kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubeadm.tar.gz" -kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubectl.tar.gz" -kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubelet.tar.gz" -calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicoctl-v3.25.2.tar.gz" -calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicocrds-v3.25.2.tar.gz" -cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cni-plugins-v1.3.0.tar.gz" +kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ telemetry_package_map['kubeadm'] }}.tar.gz" +kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ telemetry_package_map['kubectl'] }}.tar.gz" +kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ telemetry_package_map['kubelet'] }}.tar.gz" +calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ telemetry_package_map['calicoctl'] }}.tar.gz" +calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{
telemetry_package_map['calicocrds'] }}.tar.gz" +cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ telemetry_package_map['cni-plugins'] }}.tar.gz" docker_rh_repo_base_url: "" docker_rh_repo_gpgkey: "" -etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/etcd-v3.5.10.tar.gz" -helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/helm-v3.12.3.tar.gz" \ No newline at end of file +etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ telemetry_package_map['etcd'] }}.tar.gz" +helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/{{ telemetry_package_map['helm'] }}.tar.gz" +bin_dir: /usr/bin +http_proxy: "{{ proxy[0].http_proxy | default('', true) }}" +https_proxy: "{{ proxy[0].https_proxy | default('', true) }}" +no_proxy: "localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }},{{ hostvars['localhost']['pod_external_ip_range'] }},{{ hostvars['localhost']['k8s_service_addresses'] }},{{ hostvars['localhost']['k8s_pod_network_cidr'] }}" # noqa: yaml[line-length] +additional_no_proxy: "{{ hostvars['localhost']['user_no_proxy'] if hostvars['localhost']['no_proxy_input_status'] else '' }}" # noqa: yaml[line-length] diff --git a/telemetry/roles/orchestrator/vars/main.yml b/telemetry/roles/orchestrator/vars/main.yml index 903505e47..a72ff8125 100644 --- a/telemetry/roles/orchestrator/vars/main.yml +++ b/telemetry/roles/orchestrator/vars/main.yml @@ -17,21 +17,29 @@ min_retries: 3 max_retries: 10 min_delay: 10 wait_time: 30 -retry_count: 3 +retry_count: 5 ubuntu_os: "ubuntu" # Container runtime configuration -runc_url: "{{ offline_tarball_path }}/runc.amd64.tar.gz" +runc_url: "{{ offline_tarball_path }}/{{ telemetry_package_map['runc.amd64'] }}.tar.gz" runc_dest: "/usr/local/bin/runc" runc_permission: "+x" -crictl_url: "{{ offline_tarball_path }}/cri-tools-v1.26.1.tar.gz" -crictl_archive_dest: /tmp/cri-tools-v1.26.1.tar.gz -crictl_dest: /usr/local/bin/ +software_config_file: "{{ role_path }}/../../../input/software_config.json" +telemetry_json_file: "{{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/telemetry.json" +crictl_url: "{{ offline_tarball_path }}/{{ telemetry_package_map['cri-tools'] }}.tar.gz" +crictl_archive_dest: "/tmp/{{ telemetry_package_map['cri-tools'] }}.tar.gz" +crictl_dest: /usr/bin/ crictl_permission: "+x" cri_socket: "unix:///var/run/containerd/containerd.sock" permission_644: "0644" -kubespray_path: "/opt/omnia/kubespray" +kubespray_path: "/opt/omnia/{{ telemetry_package_map['kubespray'] }}" +kubespray_cfg_path: "{{ kubespray_path }}/{{ telemetry_package_map['kubespray'] }}/ansible.cfg" +kubespray_log: "/var/log/omnia/kubespray_omnia.log" + +dir_mode: "755" +containerd_service_dest: "/etc/systemd/system/containerd.service.d" +containerd_http_proxy_conf_src: "{{ role_path }}/templates/containerd_http_proxy_conf.j2" # Usage: k8s_secrets.yml namespace: telemetry-and-visualizations @@ -39,7 +47,7 @@ secrets_name: credentials mysqldb_secrets_name: mysqldb-credentials # Usage: firewalld_config.yml -k8s_master_ports: +k8s_oim_ports: - 6443/tcp - 2379-2380/tcp - 10250/tcp @@ -62,13 +70,17 @@ nerdctl_registry_restart_fail_msg: "nerdctl-registry service failed to restart" # helm helm_stable_repo_url: https://charts.helm.sh/stable # dashboard -k8s_dashboard_yaml_url: https://raw.githubusercontent.com/kubernetes/dashboard/v2.2.0/aio/deploy/recommended.yaml +k8s_dashboard_yaml_url:
https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml k8s_dashboard_admin_file_dest: /opt/omnia/k8s_dashboard_admin.yaml k8s_dashboard_admin_file_mode: "0655" fail_msg_kubespray_not_found: "Kubespray git tar file not found in local repo." -fail_msg_kubespray: "kubernetes installation on control plane failed." -# Containerd configuration required by K8s on Control plane installation with deploy_container_engine value as false +fail_msg_kubespray: "Kubernetes installation on the Omnia Infrastructure Manager failed. +This can be due to a Docker pull issue or any other issues that occurred during the execution of the local_repo.yml playbook. +For detailed error logs, refer to the {{ kubespray_log }} file. +To resolve the issue, review the error logs and address any pull or execution issues. Then, re-run the playbook after verifying the error." + +# Containerd configuration required by K8s on Omnia Infrastructure Manager installation with deploy_container_engine value as false containerd_bin_dir: "/usr/bin" bin_dir: "/usr/local/bin" containerd_storage_dir: "/var/lib/containerd" @@ -107,3 +119,7 @@ containerd_runc_runtime: options: systemdCgroup: "{{ containerd_use_systemd_cgroup | ternary('true', 'false') }}" binaryName: "{{ bin_dir }}/runc" + +# Usage: docker_login.yml +docker_login_fail_msg: "Docker login failed. Please ensure the docker login credentials in the input/provision_config_credentials.yml are valid. +If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while." diff --git a/telemetry/roles/prometheus/tasks/configure_k8s_prom_grafana.yml b/telemetry/roles/prometheus/tasks/configure_k8s_prom_grafana.yml index 384bf9bee..b9f28ed63 100644 --- a/telemetry/roles/prometheus/tasks/configure_k8s_prom_grafana.yml +++ b/telemetry/roles/prometheus/tasks/configure_k8s_prom_grafana.yml @@ -27,7 +27,7 @@ block: - name: Create prometheus datasource in grafana community.grafana.grafana_datasource: - name: control-plane-prometheus + name: oim-node-prometheus grafana_url: "http://{{ grafana_svc_ip.stdout }}:{{ grafana_svc_port.stdout }}" grafana_user: "{{ grafana_username }}" grafana_password: "{{ grafana_password }}" diff --git a/telemetry/roles/prometheus_gaudi/tasks/deploy_gaudi_metric_exporter.yml b/telemetry/roles/prometheus_gaudi/tasks/deploy_gaudi_metric_exporter.yml new file mode 100644 index 000000000..e12bc2979 --- /dev/null +++ b/telemetry/roles/prometheus_gaudi/tasks/deploy_gaudi_metric_exporter.yml @@ -0,0 +1,53 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Get K8s pods + ansible.builtin.command: kubectl get pods --all-namespaces + changed_when: false + register: k8s_pods + +- name: Deploy Gaudi metric exporter daemonset + ansible.builtin.command: "kubectl apply -f {{ gaudi_metric_exporter_daemonset_url }}" + changed_when: true + when: "'metric-exporter-ds' not in k8s_pods.stdout" + +- name: Get K8s services + ansible.builtin.command: kubectl get service --all-namespaces + changed_when: false + register: k8s_services + +- name: Deploy Gaudi metric exporter service + ansible.builtin.command: "kubectl apply -f {{ gaudi_metric_exporter_service_url }}" + changed_when: true + when: "'metric-exporter' not in k8s_services.stdout" + +- name: Get K8s servicemonitors + ansible.builtin.command: kubectl get servicemonitor --all-namespaces + changed_when: false + register: k8s_servicemonitors + +- name: Deploy Gaudi metric exporter service monitor + ansible.builtin.command: "kubectl apply -f {{ gaudi_metric_exporter_servicemonitor_url }}" + changed_when: true + when: "'metric-exporter' not in k8s_servicemonitors.stdout" + +- name: Add label to the ServiceMonitor so it will be scraped by Prometheus + ansible.builtin.command: > + kubectl patch servicemonitor metric-exporter + -n monitoring + --type='merge' + -p '{"metadata": {"labels": {"release": "prometheus"}}}' + changed_when: true + when: "'metric-exporter' not in k8s_servicemonitors.stdout" diff --git a/telemetry/roles/prometheus_gaudi/tasks/main.yml b/telemetry/roles/prometheus_gaudi/tasks/main.yml new file mode 100644 index 000000000..87c6302c0 --- /dev/null +++ b/telemetry/roles/prometheus_gaudi/tasks/main.yml @@ -0,0 +1,23 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if telemetry support is required + when: hostvars['127.0.0.1']['telemetry_entry_present'] + block: + - name: Check if Gaudi Prometheus metric exporter needs to be deployed + when: hostvars['127.0.0.1']['prometheus_gaudi_support'] + block: + - name: Deploy Gaudi metric exporter + ansible.builtin.import_tasks: deploy_gaudi_metric_exporter.yml diff --git a/server_spec_update/roles/metadata_creation/vars/main.yml b/telemetry/roles/prometheus_gaudi/vars/main.yml similarity index 57% rename from server_spec_update/roles/metadata_creation/vars/main.yml rename to telemetry/roles/prometheus_gaudi/vars/main.yml index cbf50fdde..4621d0cf6 100644 --- a/server_spec_update/roles/metadata_creation/vars/main.yml +++ b/telemetry/roles/prometheus_gaudi/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Intel Corporation. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,6 @@ # limitations under the License.
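Once the ServiceMonitor carries the release: prometheus label, the kube-prometheus stack should begin scraping the exporter on its next configuration reload. A hedged spot-check (the habanalabs_* metric name is illustrative; actual metric names depend on the exporter version):

    # Confirm the label patch took effect:
    kubectl get servicemonitor metric-exporter -n monitoring --show-labels
    # Gaudi series should then appear in the Prometheus UI, e.g.:
    #   habanalabs_temperature_onchip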
--- -# Usage: create_nic_metadata.yml -meta_path: "/opt/omnia/.data/nic_metadata.yml" -meta_dest: "/opt/omnia/.data/" -conf_file_mode: "0644" -mount_dir_perm: "0775" -meta_user: "root" -meta_group: "root" - -# Usage: validate_metadata_params.yml -python_version: "python3.9" -validate_nic_metadata_py: "{{ role_path }}/files/nic_metadata_validation.py" +gaudi_metric_exporter_daemonset_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/metric-exporter-daemonset.yaml" +gaudi_metric_exporter_service_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/metric-exporter-service.yaml" +gaudi_metric_exporter_servicemonitor_url: "{{ hostvars['localhost']['offline_manifest_path'] }}/metric-exporter-serviceMonitor.yaml" diff --git a/telemetry/roles/telemetry_validation/tasks/create_idrac_inventory.yml b/telemetry/roles/telemetry_validation/tasks/create_idrac_inventory.yml deleted file mode 100644 index c725d2829..000000000 --- a/telemetry/roles/telemetry_validation/tasks/create_idrac_inventory.yml +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Initialize variables for idrac inventory - ansible.builtin.set_fact: - create_idrac_inventory: false - -- name: Validate idrac inventory - when: - - groups['idrac'] is not defined or - groups['idrac'] | length | int < 1 - block: - - name: Initialize variables for idrac - ansible.builtin.set_fact: - create_idrac_inventory: true - - - name: Idrac inventory status- Pausing 2 seconds - ansible.builtin.pause: - seconds: 2 - prompt: "{{ idrac_inventory_msg }}" - -- name: Create internal idrac inventory - when: create_idrac_inventory - block: - - name: Check for idrac inventory existence - ansible.builtin.stat: - path: "{{ idarc_inventory_path }}" - register: idrac_file_state - - - name: Fail if idrac inventory file doesn't exist - ansible.builtin.fail: - msg: "{{ idrac_file_status }}" - when: not idrac_file_state.stat.exists - - - name: Fetch IP's from idrac_inventory - ansible.builtin.command: cat {{ idarc_inventory_path }} - changed_when: false - register: idrac_ips - when: idrac_file_state.stat.exists - - - name: Create idrac inventory - ansible.builtin.add_host: - name: "{{ item }}" - groups: "idrac" - with_items: "{{ idrac_ips.stdout_lines }}" - when: - - idrac_file_state.stat.exists - - "'[idrac]' not in item" - - item | trim | length > 1 diff --git a/telemetry/roles/telemetry_validation/tasks/main.yml b/telemetry/roles/telemetry_validation/tasks/main.yml index a5d3970bf..0e720b113 100644 --- a/telemetry/roles/telemetry_validation/tasks/main.yml +++ b/telemetry/roles/telemetry_validation/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,6 +17,10 @@ ansible.builtin.set_fact: telemetry_validation_status: true +- name: Set oim_os + ansible.builtin.set_fact: + oim_os: "{{ ansible_distribution | lower }}" + - name: Read software config file ansible.builtin.include_tasks: read_software_config.yml @@ -26,16 +30,23 @@ - name: Validate telemetry_config.yml ansible.builtin.include_tasks: validate_telemetry_config.yml - - name: Include provision_config.yml for timezone - ansible.builtin.include_tasks: include_provision_config.yml + - name: Include variables for telemetry when: idrac_telemetry_support or omnia_telemetry_support or visualization_support + block: + - name: Validate site_config.yml + ansible.builtin.include_tasks: validate_site_config.yml - - name: Check pre-requisites for telemetry and visualizations - ansible.builtin.include_tasks: pre-requisites.yml - when: idrac_telemetry_support or omnia_telemetry_support or visualization_support + - name: Include provision_config.yml for timezone + ansible.builtin.include_tasks: include_provision_config.yml + + - name: Validate provision_config_credentials.yml + ansible.builtin.include_tasks: validate_provision_config_credentials.yml + + - name: Check pre-requisites for telemetry and visualizations + ansible.builtin.include_tasks: pre-requisites.yml - - name: Create iDRAC inventory - ansible.builtin.include_tasks: create_idrac_inventory.yml + - name: Validate iDRAC inventory + ansible.builtin.include_tasks: validate_idrac_inventory.yml when: idrac_telemetry_support - name: Set the docker version for slurm telemetry diff --git a/telemetry/roles/telemetry_validation/tasks/pre-requisites.yml b/telemetry/roles/telemetry_validation/tasks/pre-requisites.yml index 513e24ff4..0e788a305 100644 --- a/telemetry/roles/telemetry_validation/tasks/pre-requisites.yml +++ b/telemetry/roles/telemetry_validation/tasks/pre-requisites.yml @@ -17,32 +17,16 @@ ansible.builtin.set_fact: mgmt_os: "{{ ansible_facts['distribution'] | lower }}" -- name: Check SELinux Status - when: os_supported_ubuntu not in mgmt_os - block: - - name: Fetch SElinux mode - ansible.builtin.command: sestatus - register: sestatus_current - changed_when: false - - - name: Check SELinux status - ansible.builtin.debug: - msg: "{{ selinux_warning }}" - when: '"SELinux status: disabled" in sestatus_current.stdout_lines' - - - name: Set SElinux to permissive mode - ansible.builtin.command: setenforce 0 - when: '"SELinux status: enabled" in sestatus_current.stdout_lines' - changed_when: true +- name: Install PyYAML using pip3 + ansible.builtin.pip: + name: PyYAML + state: present + executable: pip3 + extra_args: --ignore-installed -- name: Install openshift using pip3 +- name: Install python packages using pip3 ansible.builtin.pip: name: "{{ item }}" state: present executable: pip3 with_items: "{{ pip_packages }}" - -- name: Install sqldb collection - ansible.builtin.command: ansible-galaxy collection install "{{ mysqldb_collection_name }}" - changed_when: false - retries: "{{ retry_count }}" diff --git a/telemetry/roles/telemetry_validation/tasks/read_software_config.yml b/telemetry/roles/telemetry_validation/tasks/read_software_config.yml index 21640ae6b..f179b65fa 100644 --- a/telemetry/roles/telemetry_validation/tasks/read_software_config.yml +++ b/telemetry/roles/telemetry_validation/tasks/read_software_config.yml @@ -40,6 +40,10 @@ ansible.builtin.set_fact: cluster_os_type: "{{ software_config['cluster_os_type'] }}" + - name: Get cluster_os_version from software_config.json + ansible.builtin.set_fact: + cluster_os_version: 
"{{ software_config['cluster_os_version'] }}" + - name: Load telemetry.json ansible.builtin.set_fact: telemetry_packages: "{{ lookup('file', telemetry_packages_file) | from_json }}" diff --git a/server_spec_update/roles/metadata_update/tasks/main.yml b/telemetry/roles/telemetry_validation/tasks/validate_idrac_inventory.yml similarity index 69% rename from server_spec_update/roles/metadata_update/tasks/main.yml rename to telemetry/roles/telemetry_validation/tasks/validate_idrac_inventory.yml index 2c3ee6288..642b92b8b 100644 --- a/server_spec_update/roles/metadata_update/tasks/main.yml +++ b/telemetry/roles/telemetry_validation/tasks/validate_idrac_inventory.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,9 @@ # limitations under the License. --- -- name: Update metadata - when: add_network_status - block: - - name: Update nic metadata.yml - ansible.builtin.include_tasks: update_nic_metadata.yml +- name: Validate idrac inventory + when: + - groups['idrac'] is not defined or + groups['idrac'] | length | int < 1 + ansible.builtin.fail: + msg: "{{ idrac_inventory_msg }}" diff --git a/telemetry/roles/telemetry_validation/tasks/validate_idrac_telemetry.yml b/telemetry/roles/telemetry_validation/tasks/validate_idrac_telemetry.yml index 4fadb309c..4177d8ab3 100644 --- a/telemetry/roles/telemetry_validation/tasks/validate_idrac_telemetry.yml +++ b/telemetry/roles/telemetry_validation/tasks/validate_idrac_telemetry.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus_prometheus_gaudi.yml b/telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus_prometheus_gaudi.yml new file mode 100644 index 000000000..4196677c9 --- /dev/null +++ b/telemetry/roles/telemetry_validation/tasks/validate_k8s_prometheus_prometheus_gaudi.yml @@ -0,0 +1,49 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+---
+
+- name: Validate omnia inventory
+  ansible.builtin.assert:
+    that:
+      - groups['kube_control_plane'] is defined
+      - groups['kube_node'] is defined
+      - groups['etcd'] is defined
+    fail_msg: "{{ k8s_prom_gaudi_inventory_fail_msg }}"
+
+- name: Validate kube_control_plane group
+  ansible.builtin.assert:
+    that: "groups['kube_control_plane'] | length | int == 1"
+    fail_msg: "{{ kube_control_plane_group_fail_msg }}"
+
+- name: Validate kube_node group
+  ansible.builtin.assert:
+    that: "groups['kube_node'] | length | int >= 1"
+    fail_msg: "{{ kube_node_group_fail_msg }}"
+
+- name: Validate etcd group
+  ansible.builtin.assert:
+    that: "groups['etcd'] | length | int >= 1"
+    fail_msg: "{{ etcd_group_fail_msg }}"
+
+- name: Validate etcd group has an odd number of nodes
+  ansible.builtin.assert:
+    that: "groups['etcd'] | length | int % 2 == 1"
+    fail_msg: "{{ etcd_odd_entry_fail_msg }}"
+
+- name: Assert prometheus_scrape_interval
+  ansible.builtin.assert:
+    that:
+      - prometheus_scrape_interval is integer
+      - prometheus_scrape_interval > 0
+    fail_msg: "{{ prometheus_scrape_interval_fail_msg }}"
diff --git a/telemetry/roles/telemetry_validation/tasks/validate_k8s_setup.yml b/telemetry/roles/telemetry_validation/tasks/validate_k8s_setup.yml
new file mode 100644
index 000000000..dac76ba9a
--- /dev/null
+++ b/telemetry/roles/telemetry_validation/tasks/validate_k8s_setup.yml
@@ -0,0 +1,40 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if telemetry support is required
+  when: hostvars['127.0.0.1']['telemetry_entry_present']
+  block:
+    - name: Validate k8s cluster
+      when: hostvars['localhost']['k8s_prometheus_support'] or hostvars['localhost']['prometheus_gaudi_support']
+      block:
+        - name: Set fact for k8s installation status
+          ansible.builtin.set_fact:
+            k8s_installation_status: false
+
+        - name: Check whether k8s is installed
+          ansible.builtin.command: kubectl get nodes -o=name
+          register: kubectl_status
+          changed_when: false
+          failed_when: false
+
+        - name: Set the k8s installation status
+          ansible.builtin.set_fact:
+            k8s_installation_status: true
+          when: k8s_error_message not in kubectl_status.msg
+
+        - name: Fail when K8S cluster is not set up
+          ansible.builtin.fail:
+            msg: "{{ k8s_cluster_fail_msg }}"
+          when: not k8s_installation_status
diff --git a/telemetry/roles/telemetry_validation/tasks/validate_prometheus_gaudi.yml b/telemetry/roles/telemetry_validation/tasks/validate_prometheus_gaudi.yml
new file mode 100644
index 000000000..8880006f1
--- /dev/null
+++ b/telemetry/roles/telemetry_validation/tasks/validate_prometheus_gaudi.yml
@@ -0,0 +1,25 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Fail when prometheus_gaudi_support is true and k8s_prometheus_support is false + ansible.builtin.fail: + msg: "{{ fail_msg_k8s_prometheus_support_false }}" + when: not k8s_prometheus_support + +- name: Fail when prometheus_gaudi_support is true and not Ubuntu 22.04 OS + ansible.builtin.fail: + msg: "{{ fail_msg_prometheus_gaudi_support }}" + when: + - not ( cluster_os_type == 'ubuntu' and cluster_os_version == '22.04' ) diff --git a/telemetry/roles/telemetry_validation/tasks/validate_provision_config_credentials.yml b/telemetry/roles/telemetry_validation/tasks/validate_provision_config_credentials.yml new file mode 100644 index 000000000..ebde47525 --- /dev/null +++ b/telemetry/roles/telemetry_validation/tasks/validate_provision_config_credentials.yml @@ -0,0 +1,84 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
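The tasks that follow keep provision_config_credentials.yml vaulted across runs: the file is decrypted for reading, a vault key is generated on first use, and the file is re-encrypted on exit. To inspect it manually between runs, something along the lines of "ansible-vault view input/provision_config_credentials.yml --vault-password-file input/.provision_credential_vault_key" should work, assuming the default input paths defined in this role's vars.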
+---
+
+- name: Check if provision_config_credentials.yml is encrypted
+  ansible.builtin.command: cat {{ provision_config_credentials_filename }}
+  changed_when: false
+  register: provision_config_content
+  no_log: true
+
+- name: Decrypt provision_config_credentials.yml
+  ansible.builtin.command: >-
+    ansible-vault decrypt {{ provision_config_credentials_filename }}
+    --vault-password-file {{ provision_credentials_vault_path }}
+  changed_when: false
+  when: ansible_vault_search_key in provision_config_content.stdout
+
+- name: Include provision_config_credentials.yml
+  block:
+    - name: Include provision_config_credentials.yml
+      ansible.builtin.include_vars: "{{ provision_config_credentials_filename }}"
+      register: include_provision_config
+      no_log: true
+  rescue:
+    - name: Failed to include provision_config_credentials.yml
+      ansible.builtin.fail:
+        msg: "{{ provision_config_credentials_syntax_fail_msg }} Error: {{ include_provision_config.message }}"
+
+- name: Create ansible vault key
+  ansible.builtin.set_fact:
+    provision_vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}"
+  when: ansible_vault_search_key not in provision_config_content.stdout
+
+- name: Save vault key to provision_vault_path
+  ansible.builtin.lineinfile:
+    path: "{{ provision_credentials_vault_path }}"
+    line: "{{ provision_vault_key }}"
+    mode: "{{ conf_file_mode }}"
+    owner: root
+    create: true
+  when: ansible_vault_search_key not in provision_config_content.stdout
+
+- name: Set default docker_login value
+  ansible.builtin.set_fact:
+    docker_login: false
+
+- name: Assert docker_username and docker_password
+  ansible.builtin.assert:
+    that: docker_password | length > 1
+    fail_msg: "{{ docker_password_fail_msg }}"
+  when: docker_username | length > 1
+
+- name: Set docker_login to true
+  ansible.builtin.set_fact:
+    docker_login: true
+  when: docker_username | length > 1
+
+- name: Warning - docker_username and docker_password credentials not provided
+  ansible.builtin.pause:
+    seconds: "{{ warning_wait_time }}"
+    prompt: "{{ warning_msg_docker_username_password_incomplete }}"
+  when: docker_login is false
+
+- name: Encrypt provision_config_credentials.yml
+  ansible.builtin.command: >-
+    ansible-vault encrypt {{ provision_config_credentials_filename }}
+    --vault-password-file {{ provision_credentials_vault_path }}
+  changed_when: false
+
+- name: Update provision_config_credentials.yml permission
+  ansible.builtin.file:
+    path: "{{ provision_config_credentials_filename }}"
+    mode: "{{ conf_file_mode }}"
diff --git a/telemetry/roles/telemetry_validation/tasks/validate_site_config.yml b/telemetry/roles/telemetry_validation/tasks/validate_site_config.yml
new file mode 100644
index 000000000..95c2f0f37
--- /dev/null
+++ b/telemetry/roles/telemetry_validation/tasks/validate_site_config.yml
@@ -0,0 +1,115 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
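The site_config validation that follows accepts either no proxy at all or both http_proxy and https_proxy together, mirrored into the environment of the Omnia Infrastructure Manager. A hypothetical input/site_config.yml fragment that would pass it (host name, port, and IP are placeholders):

proxy:
  - http_proxy: "http://proxy.example.com:3128"
    https_proxy: "http://proxy.example.com:3128"
    no_proxy: "oim-hostname,10.5.255.254"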
+---
+
+- name: Initialize variables
+  ansible.builtin.set_fact:
+    http_proxy_input_status: false
+    https_proxy_input_status: false
+    no_proxy_input_status: false
+    proxy_status: false
+
+- name: Include site_config.yml
+  ansible.builtin.include_vars: "{{ site_config_file }}"
+
+- name: Validate http_proxy variable provided
+  ansible.builtin.set_fact:
+    http_proxy_input_status: true
+  when:
+    - proxy[0].http_proxy is defined
+    - proxy[0].http_proxy | default("", true) | length > 1
+
+- name: Validate https_proxy variable provided
+  ansible.builtin.set_fact:
+    https_proxy_input_status: true
+  when:
+    - proxy[0].https_proxy is defined
+    - proxy[0].https_proxy | default("", true) | length > 1
+
+- name: Validate no_proxy variable provided
+  ansible.builtin.set_fact:
+    no_proxy_input_status: true
+  when:
+    - proxy[0].no_proxy is defined
+    - proxy[0].no_proxy | default("", true) | length > 1
+
+- name: Validate both http_proxy and https_proxy input provided
+  ansible.builtin.fail:
+    msg: "{{ invalid_proxy_failure_msg }}"
+  when:
+    - not https_proxy_input_status and http_proxy_input_status or
+      not http_proxy_input_status and https_proxy_input_status
+
+- name: Validate proxy
+  when:
+    - http_proxy_input_status
+    - https_proxy_input_status
+  block:
+    - name: Validate http_proxy, https_proxy and no_proxy configured as environment variables
+      ansible.builtin.assert:
+        that:
+          - lookup('env', 'http_proxy') | length > 1
+          - lookup('env', 'https_proxy') | length > 1
+          - lookup('env', 'no_proxy') | length > 1
+          - lookup('env', 'http_proxy') == proxy[0].http_proxy
+          - lookup('env', 'https_proxy') == proxy[0].https_proxy
+          - oim_hostname in lookup('env', 'no_proxy')
+          - admin_nic_ip in lookup('env', 'no_proxy')
+        fail_msg: "{{ proxy_env_fail_msg }}"
+
+    - name: Try updating repos in Ubuntu
+      when: oim_os in oim_os_ubuntu
+      block:
+        - name: Update repos in Ubuntu
+          ansible.builtin.apt:
+            update_cache: true
+          register: update_repos
+          until: update_repos is not failed
+          retries: "{{ repo_retries }}"
+          delay: "{{ repo_delay }}"
+      rescue:
+        - name: Failed to update repos
+          ansible.builtin.fail:
+            msg: "{{ update_repos_fail_msg }}"
+
+    - name: Try updating repos in RHEL/Rocky
+      when:
+        - oim_os in oim_os_redhat or
+          oim_os in oim_os_rocky
+      block:
+        - name: Add proxy to dnf.conf in RHEL/Rocky if http_proxy is defined
+          community.general.ini_file:
+            path: "{{ dnf_conf_path }}"
+            section: main
+            option: proxy
+            value: "{{ proxy[0].http_proxy | default(omit) }}"
+            state: "{{ proxy[0].http_proxy | default(False) | ternary('present', 'absent') }}"
+            no_extra_spaces: true
+            mode: "{{ conf_file_mode }}"
+
+        - name: Update repos in RHEL/Rocky
+          ansible.builtin.dnf:
+            update_cache: true
+          register: update_repos
+          until: update_repos is not failed
+          retries: "{{ repo_retries }}"
+          delay: "{{ repo_delay }}"
+      rescue:
+        - name: Failed to update repos
+          ansible.builtin.fail:
+            msg: "{{ update_repos_fail_msg }}"
+
+    - name: Set proxy_status to true
+      ansible.builtin.set_fact:
+        proxy_status: true
diff --git a/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml b/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml
index 6a570f767..2a4799aa8 100644
--- a/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml
+++ b/telemetry/roles/telemetry_validation/tasks/validate_telemetry_config.yml
@@ -53,6 +53,8 @@
     idrac_telemetry_support: "{{ idrac_telemetry_support | lower }}"
     omnia_telemetry_support: "{{ omnia_telemetry_support | lower }}"
     visualization_support: "{{ 
visualization_support | lower }}" + k8s_prometheus_support: "{{ k8s_prometheus_support | lower }}" + prometheus_gaudi_support: "{{ prometheus_gaudi_support | lower }}" - name: Assert idrac telemetry support ansible.builtin.assert: @@ -74,17 +76,40 @@ success_msg: "{{ visualization_support_success_msg }}" fail_msg: "{{ visualization_support_fail_msg }}" +- name: Assert k8s prometheus support + ansible.builtin.assert: + that: + - k8s_prometheus_support == true or k8s_prometheus_support == false + success_msg: "{{ k8s_prometheus_support_success_msg }}" + fail_msg: "{{ k8s_prometheus_support_fail_msg }}" + +- name: Assert prometheus gaudi support + ansible.builtin.assert: + that: + - prometheus_gaudi_support == true or prometheus_gaudi_support == false + success_msg: "{{ prometheus_gaudi_support_success_msg }}" + fail_msg: "{{ prometheus_gaudi_support_fail_msg }}" + - name: Warning for all telemetry support category values set as false ansible.builtin.pause: seconds: "{{ pause_time_15 }}" prompt: "{{ warning_telemetry_support_false }}" - when: not idrac_telemetry_support and not omnia_telemetry_support and not visualization_support + when: + - not idrac_telemetry_support + - not omnia_telemetry_support + - not visualization_support + - not k8s_prometheus_support + - not prometheus_gaudi_support - name: Please wait, This task will take few seconds ansible.builtin.pause: seconds: "{{ idrac_omnia_telemetry_support_false_warn_time }}" prompt: "{{ idrac_omnia_telemetry_support_warn_msg }}" - when: not idrac_telemetry_support and not omnia_telemetry_support + when: + - not idrac_telemetry_support + - not omnia_telemetry_support + - not k8s_prometheus_support + - not prometheus_gaudi_support - name: Assert usernames and passwords in telemetry_login_vars.yml when: idrac_telemetry_support or omnia_telemetry_support @@ -129,6 +154,14 @@ ansible.builtin.include_tasks: validate_grafana_params.yml when: visualization_support +- name: Validate k8s prometheus, scrape interval and prometheus gaudi + ansible.builtin.include_tasks: validate_k8s_prometheus_prometheus_gaudi.yml + when: k8s_prometheus_support or prometheus_gaudi_support + +- name: Validate k8s prometheus when prometheus gaudi is true + ansible.builtin.include_tasks: validate_prometheus_gaudi.yml + when: prometheus_gaudi_support + # Validate k8s inputs - name: Assert k8s_cni ansible.builtin.assert: @@ -142,6 +175,7 @@ - pod_external_ip_range | default("", true) | length > 9 - ("'/' in pod_external_ip_range") or ("'-' in pod_external_ip_range") fail_msg: "{{ invalid_pod_external_ip_range }}" + when: idrac_telemetry_support or omnia_telemetry_support or visualization_support - name: Assert kubernetes service addresses ansible.builtin.assert: diff --git a/telemetry/roles/telemetry_validation/vars/main.yml b/telemetry/roles/telemetry_validation/vars/main.yml index 23eb1c17a..f32297f70 100644 --- a/telemetry/roles/telemetry_validation/vars/main.yml +++ b/telemetry/roles/telemetry_validation/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,10 +30,10 @@ selinux_warning: "Warning! SELinux status is disabled by user. No SELinux policy telemetry_config_file: "{{ role_path }}/../../../input/telemetry_config.yml" fail_msg_telemetry_config_file: "telemetry_config.yml file doesn't exist." 
pip_packages:
-  - openshift
-  - omsdk
-  - PyMySQL
-mysqldb_collection_name: community.mysql:3.7.2
+  - omsdk==1.2.513
+  - PyMySQL==1.1.1
+  - pysnmp==6.1.3
+  - kubernetes==30.1.0
 retry_count: 3
 rhel_epel_repo8: https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
 max_retries: 20
@@ -51,24 +51,32 @@ idrac_telemetry_support_fail_msg: "Failed. idrac_telemetry_support only accepts
 omnia_telemetry_support_fail_msg: "Failed. omnia_telemetry_support accepts boolean value true or false"
 visualization_support_success_msg: "visualization_support_ validated"
 visualization_support_fail_msg: "Failed. visualization_support accepts boolean value true or false"
+k8s_prometheus_support_success_msg: "k8s_prometheus_support validated"
+k8s_prometheus_support_fail_msg: "Failed. k8s_prometheus_support accepts boolean value true or false"
+prometheus_gaudi_support_success_msg: "prometheus_gaudi_support validated"
+prometheus_gaudi_support_fail_msg: "Failed. prometheus_gaudi_support accepts boolean value true or false"
 idrac_omnia_telemetry_support_false_warn_time: 10
 pause_time_15: 15
 warning_telemetry_support_false: "Warning. telemetry entry is present in software_config.json, but all telemetry support categories \
-(idrac_telemetry_support, omnia_telemetry_support and visualization_support) are false in input/telemetry_config.yml. \
-omnia does not deploy telemetry feature if none of the support category is true."
-idrac_omnia_telemetry_support_warn_msg: "Warning. Both idrac_telemetry_support and omnia_telemetry_support are false"
+(idrac_telemetry_support, omnia_telemetry_support, visualization_support, k8s_prometheus_support and prometheus_gaudi_support) are false \
+in input/telemetry_config.yml. omnia does not deploy the telemetry feature if none of the support categories is true."
+idrac_omnia_telemetry_support_warn_msg: "Warning. idrac_telemetry_support, omnia_telemetry_support, \
+k8s_prometheus_support and prometheus_gaudi_support are set to false"
 vault_filename: "{{ role_path }}/../../../input/.telemetry_vault_key"
 login_vars_fail_msg: "Username/password validation in telemetry_config.yml failed. Please check the requirements."
+# K8s Input validation
+invalid_k8s_cni: "Invalid k8s_cni entry in telemetry_config.yml. Enter either calico or flannel"
+invalid_pod_external_ip_range: "Invalid pod_external_ip_range in telemetry_config.yml."
+invalid_k8s_service_addresses: "Invalid k8s_service_addresses in telemetry_config.yml."
+invalid_k8s_pod_network_cidr: "Invalid k8s_pod_network_cidr in telemetry_config.yml."
 # Usage: validate_idrac_telemetry.yml
 min_length_idrac: 3
 max_length: 30
-fail_timezone_msg: "Failed. Incorrect timezone provided. Please check the file timezone.txt in provision/roles/provision_validation/files/timezone.txt folder"
+fail_timezone_msg: "Failed. Incorrect timezone provided. Please check the file timezone.txt in discovery/roles/discovery_validations/common/files/ folder."
 idrac_credentials_fail_msg: "idrac_username and idrac_password must be provided in telemetry_config.yml."
-idrac_inventory_msg: "idrac group is not present in inventory file. Playbook will check for /opt/omnia/provisioned_idrac_inventory"
-idarc_inventory_path: "/opt/omnia/provisioned_idrac_inventory"
-idrac_file_status: "There is no inventory present under /opt/omnia/provisioned_idrac_inventory.
-  Please provide inventory with idrac group or create an inventory /opt/omnia/provisioned_idrac_inventory"
+idrac_inventory_msg: "Failed. 
The inventory containing idrac groups must be provided when idrac_telemetry_support is set to true \
+ in telemetry_config.yml."
 idrac_exec_msg: "Since the absence of idrac group in inventory and existence of /opt/omnia/provisioned_idrac_inventory file.
  iDRAC telemetry execution will be skipped"
 advanced_vars_fail_msg: "Please give correct inputs for advanced configurations (mysqldb credentials) for idrac telemetry in telemetry_config.yml.
@@ -99,6 +107,21 @@ software_config.json and omnia_telemetry_support is true."
 slurm_control_node_group_fail_msg: "slurm_control_node group should contain exactly 1 node in inventory"
 slurm_node_group_fail_msg: "slurm_node group should contain atleast 1 node in inventory"
+
+# Usage: validate_k8s_prometheus_prometheus_gaudi.yml
+k8s_prom_gaudi_inventory_fail_msg: "Inventory comprising kube_control_plane, kube_node and etcd groups should be passed \
+when k8s_prometheus_support or prometheus_gaudi_support is true in telemetry_config.yml."
+prometheus_scrape_interval_fail_msg: "Failed. prometheus_scrape_interval accepts integer values greater than 0"
+
+# Usage: validate_prometheus_gaudi.yml
+fail_msg_k8s_prometheus_support_false: "Failed. k8s_prometheus_support must be true when prometheus_gaudi_support is true in telemetry_config.yml."
+fail_msg_prometheus_gaudi_support: "Failed. prometheus_gaudi_support is only available for cluster_os_type: ubuntu and cluster_os_version: 22.04. \
+Please update prometheus_gaudi_support to false in telemetry_config.yml."
+
+# Usage: validate_k8s_setup.yml
+k8s_error_message: "No such file or directory"
+k8s_cluster_fail_msg: "Failed. k8s cluster setup not found. Hence k8s prometheus or prometheus gaudi will not be deployed. \
+Please run scheduler/scheduler.yml to set up the k8s cluster"
+
 # Usage: set_docker_os.yml
 docker_rocky_os: "8.5"
 slurm_telemetry_dockerfile_path: "{{ playbook_dir }}/roles/slurm_telemetry/files/Dockerfile"
@@ -108,8 +131,27 @@ fail_msg_grafana_credentials: "Failed. Incorrect grafana_username or grafana_pas
 mount_validation_msg: "Make sure mount location value is not null"
 min_length_grafana: 5
-# K8s Input validation
-invalid_k8s_cni: "Invalid k8s_cni entry in telemetry_config.yml. Enter either calico or flannel"
-invalid_pod_external_ip_range: "Invalid pod_external_ip_range in telemetry_config.yml."
-invalid_k8s_service_addresses: "Invalid k8s_service_addresses in telemetry_config.yml."
-invalid_k8s_pod_network_cidr: "Invalid k8s_pod_network_cidr in telemetry_config.yml."
+# Usage: validate_site_config.yml
+site_config_file: "{{ role_path }}/../../../input/site_config.yml"
+invalid_proxy_failure_msg: "Failed. Both http_proxy and https_proxy should be set for proxy variable provided in site_config.yml"
+proxy_env_fail_msg: "Failed. The values for http_proxy {{ proxy[0].http_proxy }} and https_proxy {{ proxy[0].https_proxy }} in the
+proxy variable of the site_config.yml should be set as environment variables http_proxy and https_proxy in the Omnia Infrastructure Manager.
+The no_proxy environment variable should include the Omnia Infrastructure Manager hostname and the admin network IP address."
+update_repos_fail_msg: "Failed to update repos. Verify proxy configuration in Omnia Infrastructure Manager for accessing internet." 
+oim_os_redhat: "redhat"
+oim_os_rocky: "rocky"
+oim_os_ubuntu: "ubuntu"
+repo_retries: 5
+repo_delay: 10
+dnf_conf_path: "/etc/dnf/dnf.conf"
+
+# Usage: validate_provision_config_credentials.yml
+docker_password_fail_msg: "docker password must be mentioned when docker_username is defined."
+warning_wait_time: 30
+warning_msg_docker_username_password_incomplete: "[WARNING] Docker credentials not provided in provision_config_credentials.yml.
+Proceeding without docker credentials."
+provision_config_credentials_filename: "{{ role_path }}/../../../input/provision_config_credentials.yml"
+provision_credentials_vault_path: "{{ role_path }}/../../../input/.provision_credential_vault_key"
+ansible_vault_search_key: "$ANSIBLE_VAULT;"
+provision_config_credentials_syntax_fail_msg: "Failed. Syntax errors present in provision_config_credentials.yml. Fix errors and re-run the playbook."
+conf_file_mode: "0644"
diff --git a/telemetry/roles/timescaledb/tasks/initialize_db.yml b/telemetry/roles/timescaledb/tasks/initialize_db.yml
index a76e421ab..563e49e64 100644
--- a/telemetry/roles/timescaledb/tasks/initialize_db.yml
+++ b/telemetry/roles/timescaledb/tasks/initialize_db.yml
@@ -1,4 +1,4 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,9 +14,15 @@
 ---
 
-- name: Wait for timescale pod to come to ready state
-  ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ timescaledb_k8s_name }}"
-  changed_when: false
+- name: Wait for timescaledb pod to come to ready state
+  block:
+    - name: Wait for timescale pod to come to ready state
+      ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ timescaledb_k8s_name }}"
+      changed_when: false
+  rescue:
+    - name: Failed - timescaledb pod is not running
+      ansible.builtin.fail:
+        msg: "{{ timescaledb_pod_wait_fail_msg }}"
 
 - name: Get timescaledb pod name
   ansible.builtin.command: kubectl get pod -n "{{ namespace }}" -l app="{{ timescaledb_k8s_name }}" -o jsonpath="{.items[0].metadata.name}"
@@ -24,7 +30,7 @@
   changed_when: false
 
 - name: Initialize database
-  ansible.builtin.command: kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ namespace }}" ./cmd/initialize_timescaledb.sh
+  ansible.builtin.command: kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ namespace }}" ./cmd/timescalepump/initialize_timescaledb.sh
   changed_when: false
   register: status
   until: status is not failed
diff --git a/telemetry/roles/timescaledb/tasks/main.yml b/telemetry/roles/timescaledb/tasks/main.yml
index cb01c13ed..c4b80dcab 100644
--- a/telemetry/roles/timescaledb/tasks/main.yml
+++ b/telemetry/roles/timescaledb/tasks/main.yml
@@ -30,6 +30,10 @@
     repo: "{{ idrac_telemetry_github }}"
     dest: "{{ mount_location + idrac_telemetry_folder_name }}"
     version: "{{ reference_tools_stable_commit }}"
+    update: false
+  register: clone_idrac_telemetry
+  until: clone_idrac_telemetry is not failed
+  retries: "{{ max_retries }}"
 
 - name: Create timescaledb pod
   ansible.builtin.include_tasks: timescaledb_pod.yml
diff --git a/telemetry/roles/timescaledb/tasks/timescaledb_pod.yml b/telemetry/roles/timescaledb/tasks/timescaledb_pod.yml
index 271f2053f..f830f9d68 100644
--- a/telemetry/roles/timescaledb/tasks/timescaledb_pod.yml
+++ 
b/telemetry/roles/timescaledb/tasks/timescaledb_pod.yml
@@ -56,7 +56,7 @@
             volumeMounts:
               - mountPath: /go/src/github.com/telemetry-reference-tools
                 name: telemetry-reference-tools
-              - mountPath: /var/lib/postgresql/
+              - mountPath: /var/lib/postgresql/data
                 name: timescaledb-pvc
               - mountPath: /etc/localtime
                 name: timezone
@@ -78,7 +78,16 @@
                   secretKeyRef:
                     name: "{{ secrets_name }}"
                     key: timescaledb_password
+              - name: POSTGRES_PASS
+                valueFrom:
+                  secretKeyRef:
+                    name: "{{ secrets_name }}"
+                    key: timescaledb_password
               - name: TIMESCALE_DB
                 value: "{{ timescaledb_name }}"
+              - name: POSTGRES_HOST
+                value: "{{ timescaledb_k8s_name }}"
+              - name: POSTGRES_PORT
+                value: "{{ timescaledb_port }}"
             ports:
               - containerPort: "{{ timescaledb_container_port }}"
diff --git a/telemetry/roles/timescaledb/vars/main.yml b/telemetry/roles/timescaledb/vars/main.yml
index a62cc4dfe..7b603a23d 100644
--- a/telemetry/roles/timescaledb/vars/main.yml
+++ b/telemetry/roles/timescaledb/vars/main.yml
@@ -1,4 +1,4 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ secrets_name: credentials
 pv_name: timescaledb-storage
 timescaledb_storage: 30Gi
 timescaledb_name: "telemetry_metrics"
+timescaledb_port: "5432"
 pvc_name: timescaledb-storage-claim
 idrac_telemetry_github: https://github.com/dell/iDRAC-Telemetry-Reference-Tools.git
 idrac_telemetry_folder_name: iDRAC-Telemetry-Reference-Tools
@@ -26,5 +27,7 @@ timescaledb_k8s_name: timescaledb
 timescaledb_container_port: 5432
 max_retries: 10
 max_delay: 10
-reference_tools_stable_commit: "0016fcb"
+reference_tools_stable_commit: "9f9c5ef"
 zoneinfo_dir: "/usr/share/zoneinfo/"
+timescaledb_pod_wait_fail_msg: "Execution failed as the timescaledb pods did not start within the expected time.
+Please re-run the playbook after verifying that the timescaledb pods are in running state by executing the command 'kubectl get pods -A'."
diff --git a/telemetry/telemetry.yml b/telemetry/telemetry.yml
index 158cbe6c7..0be0a26bb 100644
--- a/telemetry/telemetry.yml
+++ b/telemetry/telemetry.yml
@@ -1,4 +1,4 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
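With POSTGRES_HOST and POSTGRES_PORT now injected into the container above, connectivity can be smoke-tested from inside the pod. A sketch only, not part of this changeset, assuming pg_isready ships in the timescaledb image and reusing the pod name registered in initialize_db.yml:

- name: Smoke-test TimescaleDB connectivity (hypothetical)
  ansible.builtin.command: >
    kubectl exec "{{ timescaledb_pod_name.stdout }}" -n "{{ namespace }}"
    -- pg_isready -h "{{ timescaledb_k8s_name }}" -p "{{ timescaledb_port }}"
  changed_when: false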
@@ -17,6 +17,10 @@ ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) +- name: Check if package manager is not locked + ansible.builtin.import_playbook: ../utils/check_package_lock.yml + when: not ( hostvars['127.0.0.1']['apt_lock_status'] | default(false) | bool ) + - name: Validate telemetry input parameters hosts: localhost connection: local @@ -27,6 +31,16 @@ name: telemetry_validation tasks_from: validation_status_check.yml +- name: Validate kubernetes cluster + hosts: kube_control_plane + gather_facts: false + any_errors_fatal: true + tasks: + - name: Validate k8s cluster + ansible.builtin.include_role: + name: telemetry_validation + tasks_from: validate_k8s_setup.yml + - name: Update Repositories/Registries on nodes ansible.builtin.import_playbook: ../utils/update_user_repo.yml when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool ) @@ -35,8 +49,6 @@ hosts: localhost connection: local gather_facts: true - vars: - ansible_python_interpreter: /usr/bin/python3.9 roles: - orchestrator - grafana @@ -45,12 +57,12 @@ - idrac_telemetry - grafana_config -- name: Prepare CP for Telemetry +- name: Prepare OIM for Telemetry hosts: localhost connection: local gather_facts: false roles: - - omnia_telemetry_prepare_cp + - omnia_telemetry_prepare_oim tags: omnia_telemetry - name: Install Omnia Telemetry @@ -59,3 +71,24 @@ roles: - omnia_telemetry_acquisition tags: omnia_telemetry + +- name: Pull Kube Prometheus stack images for compute clusters + hosts: kube_control_plane, kube_node, etcd + gather_facts: false + tasks: + - name: Pull images to nodes + ansible.builtin.include_role: + name: k8s_prometheus + tasks_from: download_images.yml + +- name: Install Kube Prometheus stack for compute clusters + hosts: kube_control_plane + gather_facts: false + roles: + - k8s_prometheus + +- name: Install Gaudi Prometheus metric exporter for compute clusters + hosts: kube_control_plane + gather_facts: false + roles: + - prometheus_gaudi diff --git a/tools/ansible.cfg b/tools/ansible.cfg index ea1f96787..444402278 100644 --- a/tools/ansible.cfg +++ b/tools/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/tools/configure_mpi_operator.yml b/tools/configure_mpi_operator.yml new file mode 100644 index 000000000..ced83a12e --- /dev/null +++ b/tools/configure_mpi_operator.yml @@ -0,0 +1,65 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
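The new configure_mpi_operator.yml below gates the v1 and v2beta1 flows behind the mpiv1 and mpiv2beta1 tags, so either operator version can be set up in isolation, e.g. "ansible-playbook configure_mpi_operator.yml -i inventory --tags mpiv2beta1" (the inventory path is a placeholder).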
+--- + +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + +- name: Inventory Check + hosts: localhost + tasks: + - name: Check inventory format + ansible.builtin.include_role: + name: mpijob + tasks_from: inv_check.yml + +- name: Running pre-requisites + hosts: kube_control_plane + gather_facts: false + tags: mpiv1,mpiv2beta1 + tasks: + - name: Check if kubeflow is installed + ansible.builtin.include_role: + name: mpijob + tasks_from: verify_kubeflow.yml + +- name: Configure MPI operator v1 + hosts: kube_control_plane + gather_facts: false + tags: mpiv1 + tasks: + - name: Initialize mpi operator v1 + ansible.builtin.include_role: + name: mpijob + tasks_from: mpi_job_v1.yml + +- name: Fetch MPI operator package + hosts: localhost + tags: mpiv2beta1 + tasks: + - name: Fetch mpi-operator package from k8s.json + ansible.builtin.include_role: + name: mpijob + tasks_from: fetch_software_config.yml + +- name: Configure MPI operator v2beta1 + hosts: kube_control_plane + tags: mpiv2beta1 + gather_facts: true + tasks: + - name: Initialize mpi-operator v2beta1 + ansible.builtin.include_role: + name: mpijob + tasks_from: mpi_job_v2beta1.yml diff --git a/tools/jupyterhub.yml b/tools/jupyterhub.yml index 4554f6097..79248071e 100644 --- a/tools/jupyterhub.yml +++ b/tools/jupyterhub.yml @@ -20,6 +20,10 @@ - name: Inventory Check hosts: localhost tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + - name: Check inventory format ansible.builtin.include_role: name: jupyterhub diff --git a/tools/kserve.yml b/tools/kserve.yml index 0c8c36c63..a06d117b8 100644 --- a/tools/kserve.yml +++ b/tools/kserve.yml @@ -20,6 +20,10 @@ - name: Inventory Check hosts: localhost tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + - name: Check inventory format ansible.builtin.include_role: name: kserve diff --git a/tools/kubeflow.yml b/tools/kubeflow.yml index 97130e952..248816d49 100644 --- a/tools/kubeflow.yml +++ b/tools/kubeflow.yml @@ -16,6 +16,20 @@ ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) +- name: Validate Inventory for Kubeflow + hosts: localhost + connection: local + gather_facts: false + tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + + - name: Validate Inventory for Kubeflow + ansible.builtin.include_role: + name: kubeflow + tasks_from: validate_inventory.yml + - name: Initialize Variables gather_facts: true hosts: kube_control_plane, kube_node diff --git a/tools/pytorch.yml b/tools/pytorch.yml index a6bbe5e11..1bd921b2a 100644 --- a/tools/pytorch.yml +++ b/tools/pytorch.yml @@ -17,12 +17,29 @@ ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) +- name: Inventory Check + hosts: localhost + tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + + - name: Check entry in software_config json + ansible.builtin.include_role: + name: pytorch + tasks_from: check_software_config_file.yml + + - name: Check inventory format + 
ansible.builtin.include_role:
+        name: pytorch
+        tasks_from: inv_check.yml
+
 - name: Update Repositories/Registries on nodes
   ansible.builtin.import_playbook: ../utils/update_user_repo.yml
   when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool )
 
 - name: Installing pytorch
-  hosts: kube_node, kube_control_node
+  hosts: kube_node, kube_control_plane
   gather_facts: false
   roles:
     - pytorch
diff --git a/tools/roles/jupyterhub/tasks/image_pulling.yml b/tools/roles/jupyterhub/tasks/image_pulling.yml
index 182758e75..c3253d5aa 100644
--- a/tools/roles/jupyterhub/tasks/image_pulling.yml
+++ b/tools/roles/jupyterhub/tasks/image_pulling.yml
@@ -17,6 +17,7 @@
   environment:
     http_proxy: "{{ http_proxy }}"
     https_proxy: "{{ https_proxy }}"
+    no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}"
   block:
     - name: Set empty image list
       ansible.builtin.set_fact:
diff --git a/tools/roles/jupyterhub/tasks/inv_check.yml b/tools/roles/jupyterhub/tasks/inv_check.yml
index 9b6a01c5e..7e6939bc7 100644
--- a/tools/roles/jupyterhub/tasks/inv_check.yml
+++ b/tools/roles/jupyterhub/tasks/inv_check.yml
@@ -14,6 +14,13 @@
 ---
 
+- name: Inventory not provided
+  ansible.builtin.fail:
+    msg: "{{ jupyterhub_empty_inventory_fail_msg }}"
+  when:
+    - groups['all'] is defined
+    - (groups['all'] | length == 0)
+
 - name: Validate kube_control_plane and kube_node group in inventory
   ansible.builtin.assert:
     that:
diff --git a/tools/roles/jupyterhub/vars/main.yml b/tools/roles/jupyterhub/vars/main.yml
index 4f18b0fa9..b6fdc9e3d 100644
--- a/tools/roles/jupyterhub/vars/main.yml
+++ b/tools/roles/jupyterhub/vars/main.yml
@@ -45,3 +45,6 @@ fail_msg_jupyter_software_config: "jupyter entry is not present in software_conf
 fail_inv_format: "Both 'kube_control_plane' and 'kube_node' groups should be defined in inventory"
 fail_node_kube_control_plane: "Single node should be part of kube_control_plane group in inventory"
 fail_no_node_kube_node: "No node is part of kube_node group in inventory"
+jupyterhub_empty_inventory_fail_msg: >
+  "Failed. Inventory not provided. 
+ Please re-run the playbook with an inventory that includes the groups 'kube_control_plane' and 'kube_node' by using the -i inventory option" diff --git a/tools/roles/kserve/tasks/image_pulling.yml b/tools/roles/kserve/tasks/image_pulling.yml index 38bc2e52d..52baeb5b9 100644 --- a/tools/roles/kserve/tasks/image_pulling.yml +++ b/tools/roles/kserve/tasks/image_pulling.yml @@ -17,6 +17,7 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" block: - name: Set empty image list ansible.builtin.set_fact: diff --git a/tools/roles/kserve/tasks/install_istio.yml b/tools/roles/kserve/tasks/install_istio.yml index 79c288b9d..739c43125 100644 --- a/tools/roles/kserve/tasks/install_istio.yml +++ b/tools/roles/kserve/tasks/install_istio.yml @@ -35,11 +35,14 @@ copy: false src: "{{ istio_tar_file_location }}" dest: /tmp/ + extra_opts: + - "--one-top-level={{ istio_tar_file_name }}" + - "--strip-components=1" register: istio_unarchive_status changed_when: istio_unarchive_status.changed - name: Install Istio using istioctl - ansible.builtin.command: "/tmp/{{ istio_tar_file_name }}-{{ istio_version }}/bin/istioctl install --set profile=default -y" + ansible.builtin.command: "/tmp/{{ istio_tar_file_name }}/bin/istioctl install --set profile=default -y" register: istio_install_status changed_when: istio_install_status.changed failed_when: istio_install_status.rc != 0 @@ -78,7 +81,7 @@ # Clean up istio local tar file and extracted folder - name: Cleanup istio Extracted folder - ansible.builtin.command: "rm -rf /tmp/{{ istio_tar_file_name }}-{{ istio_version }}" + ansible.builtin.command: "rm -rf /tmp/{{ istio_tar_file_name }}" register: command_result changed_when: command_result.changed failed_when: false diff --git a/tools/roles/kserve/tasks/install_knative.yml b/tools/roles/kserve/tasks/install_knative.yml index d453c6c36..0bee5dbd2 100644 --- a/tools/roles/kserve/tasks/install_knative.yml +++ b/tools/roles/kserve/tasks/install_knative.yml @@ -66,6 +66,16 @@ knative_namespace_flag: true when: knative_command_output.stdout | int >= 2 + - name: Wait for Knative webhook service to be ready + ansible.builtin.command: > + kubectl wait --for=condition=available --timeout=600s deployment/webhook -n knative-serving + register: knative_webhook_status + retries: "{{ max_attempts }}" + delay: "{{ wait_time }}" + until: knative_webhook_status.rc == 0 + changed_when: false + failed_when: knative_webhook_status.rc != 0 + # knative net-istio - name: Apply knative net-istio manifest ansible.builtin.command: kubectl apply -f {{ manifest_local_path }}/knative_net_istio_manifest.yaml diff --git a/tools/roles/kserve/tasks/inv_check.yml b/tools/roles/kserve/tasks/inv_check.yml index 9b6a01c5e..82898fe9a 100644 --- a/tools/roles/kserve/tasks/inv_check.yml +++ b/tools/roles/kserve/tasks/inv_check.yml @@ -14,6 +14,13 @@ --- +- name: Inventory not provided + ansible.builtin.fail: + msg: "{{ kserve_empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) + - name: Validate kube_control_plane and kube_node group in inventory ansible.builtin.assert: that: diff --git a/tools/roles/kserve/vars/main.yml b/tools/roles/kserve/vars/main.yml index df1021e25..21451e039 100644 --- a/tools/roles/kserve/vars/main.yml +++ b/tools/roles/kserve/vars/main.yml @@ -15,7 +15,6 @@ kserve_directory: "/opt/omnia/kserve" istio_tar_file_name: "istio" -istio_version: "1.17.0" istio_tar_file_location: "{{ kserve_directory }}/{{ 
istio_tar_file_name }}.tar.gz" manifest_folder_name: "kserve_manifest" @@ -66,6 +65,9 @@ wait_msg_kserve_pods_bringing_up: "Waiting for pods to come to active state" fail_inv_format: "Both 'kube_control_plane' and 'kube_node' groups should be defined in inventory" fail_node_kube_control_plane: "Single node should be part of kube_control_plane group in inventory" fail_no_node_kube_node: "No node is part of kube_node group in inventory" +kserve_empty_inventory_fail_msg: > + "Failed. Inventory not provided. + Please re-run the playbook with an inventory that includes the groups 'kube_control_plane' and 'kube_node' by using the -i inventory option" kserve_deployment_warning: "Warning! Please review the deployment. There are non-running pods under namespace :" kserve_deployment_success: "kserve successfully deployed with all pods in running state." diff --git a/tools/roles/kubeflow/files/crd_mpijobs_kubeflow.yml b/tools/roles/kubeflow/files/crd_mpijobs_kubeflow.yml deleted file mode 100644 index 489c9a224..000000000 --- a/tools/roles/kubeflow/files/crd_mpijobs_kubeflow.yml +++ /dev/null @@ -1,7863 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.12.0 - name: mpijobs.kubeflow.org -spec: - group: kubeflow.org - names: - kind: MPIJob - listKind: MPIJobList - plural: mpijobs - singular: mpijob - scope: Namespaced - versions: - - additionalPrinterColumns: - - jsonPath: .metadata.creationTimestamp - name: Age - type: date - - jsonPath: .status.conditions[-1:].type - name: State - type: string - name: v1 - schema: - openAPIV3Schema: - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - properties: - cleanPodPolicy: - description: CleanPodPolicy defines the policy that whether to kill - pods after the job completes. Defaults to None. - type: string - mainContainer: - description: MainContainer specifies name of the main container which - executes the MPI code. - type: string - mpiReplicaSpecs: - additionalProperties: - description: ReplicaSpec is a description of the replica - properties: - replicas: - description: Replicas is the desired number of replicas of the - given template. If unspecified, defaults to 1. - format: int32 - type: integer - restartPolicy: - description: Restart policy for all replicas within the job. - One of Always, OnFailure, Never and ExitCode. Default to Never. - type: string - template: - description: Template is the object that describes the pod that - will be created for this replica. RestartPolicy in PodTemplateSpec - will be overide by RestartPolicy in ReplicaSpec - properties: - metadata: - description: 'Standard object''s metadata. 
[... hunk continues: wholesale deletion of the remainder of the auto-generated OpenAPI v3 schema embedded in this CRD — the standard Kubernetes PodSpec definition. Every deleted line in this span is controller-gen output covering: metadata (annotations, finalizers, labels, name, namespace); spec.activeDeadlineSeconds; affinity (nodeAffinity, podAffinity, podAntiAffinity, each with preferredDuringSchedulingIgnoredDuringExecution and requiredDuringSchedulingIgnoredDuringExecution terms, node/label selector requirements, namespaceSelector, topologyKey); automountServiceAccountToken; containers (args, command, env, envFrom, image, imagePullPolicy, lifecycle postStart/preStop handlers with exec/httpGet/tcpSocket actions, livenessProbe, name, ports, readinessProbe, resizePolicy, resources with claims/limits/requests, securityContext with capabilities/seLinuxOptions/seccompProfile/windowsOptions, startupProbe, stdin, stdinOnce, terminationMessagePath, terminationMessagePolicy, tty, volumeDevices, volumeMounts, workingDir); dnsConfig; dnsPolicy; enableServiceLinks; and the opening of ephemeralContainers. ...]
- type: string - required: - - fieldPath - type: object - x-kubernetes-map-type: atomic - resourceFieldRef: - description: 'Selects a resource of - the container: only resources limits - and requests (limits.cpu, limits.memory, - limits.ephemeral-storage, requests.cpu, - requests.memory and requests.ephemeral-storage) - are currently supported.' - properties: - containerName: - description: 'Container name: required - for volumes, optional for env - vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies the output - format of the exposed resources, - defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource - to select' - type: string - required: - - resource - type: object - x-kubernetes-map-type: atomic - secretKeyRef: - description: Selects a key of a secret - in the pod's namespace - properties: - key: - description: The key of the secret - to select from. Must be a valid - secret key. - type: string - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' - type: string - optional: - description: Specify whether the - Secret or its key must be defined - type: boolean - required: - - key - type: object - x-kubernetes-map-type: atomic - type: object - required: - - name - type: object - type: array - envFrom: - description: List of sources to populate environment - variables in the container. The keys defined - within a source must be a C_IDENTIFIER. All - invalid keys will be reported as an event when - the container is starting. When a key exists - in multiple sources, the value associated with - the last source will take precedence. Values - defined by an Env with a duplicate key will - take precedence. Cannot be updated. - items: - description: EnvFromSource represents the source - of a set of ConfigMaps - properties: - configMapRef: - description: The ConfigMap to select from - properties: - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the ConfigMap - must be defined - type: boolean - type: object - x-kubernetes-map-type: atomic - prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be - a C_IDENTIFIER. - type: string - secretRef: - description: The Secret to select from - properties: - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the Secret - must be defined - type: boolean - type: object - x-kubernetes-map-type: atomic - type: object - type: array - image: - description: 'Container image name. More info: - https://kubernetes.io/docs/concepts/containers/images' - type: string - imagePullPolicy: - description: 'Image pull policy. One of Always, - Never, IfNotPresent. Defaults to Always if :latest - tag is specified, or IfNotPresent otherwise. - Cannot be updated. 
More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' - type: string - lifecycle: - description: Lifecycle is not allowed for ephemeral - containers. - properties: - postStart: - description: 'PostStart is called immediately - after a container is created. If the handler - fails, the container is terminated and restarted - according to its restart policy. Other management - of the container blocks until the hook completes. - More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' - properties: - exec: - description: Exec specifies the action - to take. - properties: - command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. - items: - type: string - type: array - type: object - httpGet: - description: HTTPGet specifies the http - request to perform. - properties: - host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. - type: string - httpHeaders: - description: Custom headers to set - in the request. HTTP allows repeated - headers. - items: - description: HTTPHeader describes - a custom header to be used in - HTTP probes - properties: - name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. - type: string - value: - description: The header field - value - type: string - required: - - name - - value - type: object - type: array - path: - description: Path to access on the - HTTP server. - type: string - port: - anyOf: - - type: integer - - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. - type: string - required: - - port - type: object - tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. - properties: - host: - description: 'Optional: Host name - to connect to, defaults to the pod - IP.' - type: string - port: - anyOf: - - type: integer - - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - required: - - port - type: object - type: object - preStop: - description: PreStop is called immediately - before a container is terminated due to - an API request or management event such - as liveness/startup probe failure, preemption, - resource contention, etc. The handler is - not called if the container crashes or exits. - The Pod's termination grace period countdown - begins before the PreStop hook is executed. - properties: - exec: - description: Exec specifies the action - to take. 
- properties: - command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. - items: - type: string - type: array - type: object - httpGet: - description: HTTPGet specifies the http - request to perform. - properties: - host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. - type: string - httpHeaders: - description: Custom headers to set - in the request. HTTP allows repeated - headers. - items: - description: HTTPHeader describes - a custom header to be used in - HTTP probes - properties: - name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. - type: string - value: - description: The header field - value - type: string - required: - - name - - value - type: object - type: array - path: - description: Path to access on the - HTTP server. - type: string - port: - anyOf: - - type: integer - - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. - type: string - required: - - port - type: object - tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. - properties: - host: - description: 'Optional: Host name - to connect to, defaults to the pod - IP.' - type: string - port: - anyOf: - - type: integer - - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - required: - - port - type: object - type: object - type: object - livenessProbe: - description: Probes are not allowed for ephemeral - containers. - properties: - exec: - description: Exec specifies the action to - take. - properties: - command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. - items: - type: string - type: array - type: object - failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. - format: int32 - type: integer - grpc: - description: GRPC specifies an action involving - a GRPC port. - properties: - port: - description: Port number of the gRPC service. - Number must be in the range 1 to 65535. 
- format: int32 - type: integer - service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." - type: string - required: - - port - type: object - httpGet: - description: HTTPGet specifies the http request - to perform. - properties: - host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in - the request. HTTP allows repeated headers. - items: - description: HTTPHeader describes a - custom header to be used in HTTP probes - properties: - name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. - type: string - value: - description: The header field value - type: string - required: - - name - - value - type: object - type: array - path: - description: Path to access on the HTTP - server. - type: string - port: - anyOf: - - type: integer - - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. - type: string - required: - - port - type: object - initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. - format: int32 - type: integer - tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. - properties: - host: - description: 'Optional: Host name to connect - to, defaults to the pod IP.' - type: string - port: - anyOf: - - type: integer - - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - required: - - port - type: object - terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - type: object - name: - description: Name of the ephemeral container specified - as a DNS_LABEL. This name must be unique among - all containers, init containers and ephemeral - containers. 
- type: string - ports: - description: Ports are not allowed for ephemeral - containers. - items: - description: ContainerPort represents a network - port in a single container. - properties: - containerPort: - description: Number of port to expose on - the pod's IP address. This must be a valid - port number, 0 < x < 65536. - format: int32 - type: integer - hostIP: - description: What host IP to bind the external - port to. - type: string - hostPort: - description: Number of port to expose on - the host. If specified, this must be a - valid port number, 0 < x < 65536. If HostNetwork - is specified, this must match ContainerPort. - Most containers do not need this. - format: int32 - type: integer - name: - description: If specified, this must be - an IANA_SVC_NAME and unique within the - pod. Each named port in a pod must have - a unique name. Name for the port that - can be referred to by services. - type: string - protocol: - default: TCP - description: Protocol for port. Must be - UDP, TCP, or SCTP. Defaults to "TCP". - type: string - required: - - containerPort - type: object - type: array - x-kubernetes-list-map-keys: - - containerPort - - protocol - x-kubernetes-list-type: map - readinessProbe: - description: Probes are not allowed for ephemeral - containers. - properties: - exec: - description: Exec specifies the action to - take. - properties: - command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. - items: - type: string - type: array - type: object - failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. - format: int32 - type: integer - grpc: - description: GRPC specifies an action involving - a GRPC port. - properties: - port: - description: Port number of the gRPC service. - Number must be in the range 1 to 65535. - format: int32 - type: integer - service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." - type: string - required: - - port - type: object - httpGet: - description: HTTPGet specifies the http request - to perform. - properties: - host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in - the request. HTTP allows repeated headers. - items: - description: HTTPHeader describes a - custom header to be used in HTTP probes - properties: - name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. - type: string - value: - description: The header field value - type: string - required: - - name - - value - type: object - type: array - path: - description: Path to access on the HTTP - server. - type: string - port: - anyOf: - - type: integer - - type: string - description: Name or number of the port - to access on the container. 
Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. - type: string - required: - - port - type: object - initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. Minimum - value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. - format: int32 - type: integer - tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. - properties: - host: - description: 'Optional: Host name to connect - to, defaults to the pod IP.' - type: string - port: - anyOf: - - type: integer - - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - required: - - port - type: object - terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - type: object - resizePolicy: - description: Resources resize policy for the container. - items: - description: ContainerResizePolicy represents - resource resize policy for the container. - properties: - resourceName: - description: 'Name of the resource to which - this resource resize policy applies. Supported - values: cpu, memory.' - type: string - restartPolicy: - description: Restart policy to apply when - specified resource is resized. If not - specified, it defaults to NotRequired. - type: string - required: - - resourceName - - restartPolicy - type: object - type: array - x-kubernetes-list-type: atomic - resources: - description: Resources are not allowed for ephemeral - containers. Ephemeral containers use spare resources - already allocated to the pod. - properties: - claims: - description: "Claims lists the names of resources, - defined in spec.resourceClaims, that are - used by this container. \n This is an alpha - field and requires enabling the DynamicResourceAllocation - feature gate. \n This field is immutable. - It can only be set for containers." - items: - description: ResourceClaim references one - entry in PodSpec.ResourceClaims. - properties: - name: - description: Name must match the name - of one entry in pod.spec.resourceClaims - of the Pod where this field is used. - It makes that resource available inside - a container. 
- type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: 'Limits describes the maximum - amount of compute resources allowed. More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: 'Requests describes the minimum - amount of compute resources required. If - Requests is omitted for a container, it - defaults to Limits if that is explicitly - specified, otherwise to an implementation-defined - value. Requests cannot exceed Limits. More - info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' - type: object - type: object - securityContext: - description: 'Optional: SecurityContext defines - the security options the ephemeral container - should be run with. If set, the fields of SecurityContext - override the equivalent fields of PodSecurityContext.' - properties: - allowPrivilegeEscalation: - description: 'AllowPrivilegeEscalation controls - whether a process can gain more privileges - than its parent process. This bool directly - controls if the no_new_privs flag will be - set on the container process. AllowPrivilegeEscalation - is true always when the container is: 1) - run as Privileged 2) has CAP_SYS_ADMIN Note - that this field cannot be set when spec.os.name - is windows.' - type: boolean - capabilities: - description: The capabilities to add/drop - when running containers. Defaults to the - default set of capabilities granted by the - container runtime. Note that this field - cannot be set when spec.os.name is windows. - properties: - add: - description: Added capabilities - items: - description: Capability represent POSIX - capabilities type - type: string - type: array - drop: - description: Removed capabilities - items: - description: Capability represent POSIX - capabilities type - type: string - type: array - type: object - privileged: - description: Run container in privileged mode. - Processes in privileged containers are essentially - equivalent to root on the host. Defaults - to false. Note that this field cannot be - set when spec.os.name is windows. - type: boolean - procMount: - description: procMount denotes the type of - proc mount to use for the containers. The - default is DefaultProcMount which uses the - container runtime defaults for readonly - paths and masked paths. This requires the - ProcMountType feature flag to be enabled. - Note that this field cannot be set when - spec.os.name is windows. - type: string - readOnlyRootFilesystem: - description: Whether this container has a - read-only root filesystem. Default is false. - Note that this field cannot be set when - spec.os.name is windows. - type: boolean - runAsGroup: - description: The GID to run the entrypoint - of the container process. Uses runtime default - if unset. May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. 
Note that this field cannot - be set when spec.os.name is windows. - format: int64 - type: integer - runAsNonRoot: - description: Indicates that the container - must run as a non-root user. If true, the - Kubelet will validate the image at runtime - to ensure that it does not run as UID 0 - (root) and fail to start the container if - it does. If unset or false, no such validation - will be performed. May also be set in PodSecurityContext. - type: boolean - runAsUser: - description: The UID to run the entrypoint - of the container process. Defaults to user - specified in image metadata if unspecified. - May also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext takes - precedence. Note that this field cannot - be set when spec.os.name is windows. - format: int64 - type: integer - seLinuxOptions: - description: The SELinux context to be applied - to the container. If unspecified, the container - runtime will allocate a random SELinux context - for each container. May also be set in - PodSecurityContext. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is windows. - properties: - level: - description: Level is SELinux level label - that applies to the container. - type: string - role: - description: Role is a SELinux role label - that applies to the container. - type: string - type: - description: Type is a SELinux type label - that applies to the container. - type: string - user: - description: User is a SELinux user label - that applies to the container. - type: string - type: object - seccompProfile: - description: The seccomp options to use by - this container. If seccomp options are provided - at both the pod & container level, the container - options override the pod options. Note that - this field cannot be set when spec.os.name - is windows. - properties: - localhostProfile: - description: localhostProfile indicates - a profile defined in a file on the node - should be used. The profile must be - preconfigured on the node to work. Must - be a descending path, relative to the - kubelet's configured seccomp profile - location. Must only be set if type is - "Localhost". - type: string - type: - description: "type indicates which kind - of seccomp profile will be applied. - Valid options are: \n Localhost - a - profile defined in a file on the node - should be used. RuntimeDefault - the - container runtime default profile should - be used. Unconfined - no profile should - be applied." - type: string - required: - - type - type: object - windowsOptions: - description: The Windows specific settings - applied to all containers. If unspecified, - the options from the PodSecurityContext - will be used. If set in both SecurityContext - and PodSecurityContext, the value specified - in SecurityContext takes precedence. Note - that this field cannot be set when spec.os.name - is linux. - properties: - gmsaCredentialSpec: - description: GMSACredentialSpec is where - the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) - inlines the contents of the GMSA credential - spec named by the GMSACredentialSpecName - field. - type: string - gmsaCredentialSpecName: - description: GMSACredentialSpecName is - the name of the GMSA credential spec - to use. - type: string - hostProcess: - description: HostProcess determines if - a container should be run as a 'Host - Process' container. 
This field is alpha-level - and will only be honored by components - that enable the WindowsHostProcessContainers - feature flag. Setting this field without - the feature flag will result in errors - when validating the Pod. - type: boolean - runAsUserName: - description: The UserName in Windows to - run the entrypoint of the container - process. Defaults to the user specified - in image metadata if unspecified. May - also be set in PodSecurityContext. If - set in both SecurityContext and PodSecurityContext, - the value specified in SecurityContext - takes precedence. - type: string - type: object - type: object - startupProbe: - description: Probes are not allowed for ephemeral - containers. - properties: - exec: - description: Exec specifies the action to - take. - properties: - command: - description: Command is the command line - to execute inside the container, the - working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it is - not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to explicitly - call out to that shell. Exit status - of 0 is treated as live/healthy and - non-zero is unhealthy. - items: - type: string - type: array - type: object - failureThreshold: - description: Minimum consecutive failures - for the probe to be considered failed after - having succeeded. Defaults to 3. Minimum - value is 1. - format: int32 - type: integer - grpc: - description: GRPC specifies an action involving - a GRPC port. - properties: - port: - description: Port number of the gRPC service. - Number must be in the range 1 to 65535. - format: int32 - type: integer - service: - description: "Service is the name of the - service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - \n If this is not specified, the default - behavior is defined by gRPC." - type: string - required: - - port - type: object - httpGet: - description: HTTPGet specifies the http request - to perform. - properties: - host: - description: Host name to connect to, - defaults to the pod IP. You probably - want to set "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in - the request. HTTP allows repeated headers. - items: - description: HTTPHeader describes a - custom header to be used in HTTP probes - properties: - name: - description: The header field name. - This will be canonicalized upon - output, so case-variant names - will be understood as the same - header. - type: string - value: - description: The header field value - type: string - required: - - name - - value - type: object - type: array - path: - description: Path to access on the HTTP - server. - type: string - port: - anyOf: - - type: integer - - type: string - description: Name or number of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. - type: string - required: - - port - type: object - initialDelaySeconds: - description: 'Number of seconds after the - container has started before liveness probes - are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform - the probe. Default to 10 seconds. 
Minimum - value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes - for the probe to be considered successful - after having failed. Defaults to 1. Must - be 1 for liveness and startup. Minimum value - is 1. - format: int32 - type: integer - tcpSocket: - description: TCPSocket specifies an action - involving a TCP port. - properties: - host: - description: 'Optional: Host name to connect - to, defaults to the pod IP.' - type: string - port: - anyOf: - - type: integer - - type: string - description: Number or name of the port - to access on the container. Number must - be in the range 1 to 65535. Name must - be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - required: - - port - type: object - terminationGracePeriodSeconds: - description: Optional duration in seconds - the pod needs to terminate gracefully upon - probe failure. The grace period is the duration - in seconds after the processes running in - the pod are sent a termination signal and - the time when the processes are forcibly - halted with a kill signal. Set this value - longer than the expected cleanup time for - your process. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which - the probe times out. Defaults to 1 second. - Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - type: object - stdin: - description: Whether this container should allocate - a buffer for stdin in the container runtime. - If this is not set, reads from stdin in the - container will always result in EOF. Default - is false. - type: boolean - stdinOnce: - description: Whether the container runtime should - close the stdin channel after it has been opened - by a single attach. When stdin is true the stdin - stream will remain open across multiple attach - sessions. - type: boolean - targetContainerName: - description: "If set, the name of the container - from PodSpec that this ephemeral container targets. - The ephemeral container will be run in the namespaces - (IPC, PID, etc) of this container. If not set - then the ephemeral container uses the namespaces - configured in the Pod spec. \n The container - runtime must implement support for this feature." - type: string - terminationMessagePath: - description: 'Optional: Path at which the file - to which the container''s termination message - will be written is mounted into the container''s - filesystem. Message written is intended to be - brief final status, such as an assertion failure - message. Will be truncated by the node if greater - than 4096 bytes. The total message length across - all containers will be limited to 12kb. Defaults - to /dev/termination-log.' - type: string - terminationMessagePolicy: - description: Indicate how the termination message - should be populated. File will use the contents - of terminationMessagePath to populate the container - status message on both success and failure. - FallbackToLogsOnError will use the last chunk - of container log output if the termination message - file is empty and the container exited with - an error. - type: string - tty: - description: Whether this container should allocate - a TTY for itself, also requires 'stdin' to be - true. Default is false. - type: boolean - volumeDevices: - description: volumeDevices is the list of block - devices to be used by the container. 
- items: - description: volumeDevice describes a mapping - of a raw block device within a container. - properties: - devicePath: - description: devicePath is the path inside - of the container that the device will - be mapped to. - type: string - name: - description: name must match the name of - a persistentVolumeClaim in the pod - type: string - required: - - devicePath - - name - type: object - type: array - volumeMounts: - description: Pod volumes to mount into the container's - filesystem. Subpath mounts are not allowed for - ephemeral containers. Cannot be updated. - items: - description: VolumeMount describes a mounting - of a Volume within a container. - properties: - mountPath: - description: Path within the container at - which the volume should be mounted. Must - not contain ':'. - type: string - mountPropagation: - description: mountPropagation determines - how mounts are propagated from the host - to container and the other way around. - When not set, MountPropagationNone is - used. This field is beta in 1.10. - type: string - name: - description: This must match the Name of - a Volume. - type: string - readOnly: - description: Mounted read-only if true, - read-write otherwise (false or unspecified). - Defaults to false. - type: boolean - subPath: - description: Path within the volume from - which the container's volume should be - mounted. Defaults to "" (volume's root). - type: string - subPathExpr: - description: Expanded path within the volume - from which the container's volume should - be mounted. Behaves similarly to SubPath - but environment variable references $(VAR_NAME) - are expanded using the container's environment. - Defaults to "" (volume's root). SubPathExpr - and SubPath are mutually exclusive. - type: string - required: - - mountPath - - name - type: object - type: array - workingDir: - description: Container's working directory. If - not specified, the container runtime's default - will be used, which might be configured in the - container image. Cannot be updated. - type: string - required: - - name - type: object - type: array - hostAliases: - description: HostAliases is an optional list of hosts - and IPs that will be injected into the pod's hosts - file if specified. This is only valid for non-hostNetwork - pods. - items: - description: HostAlias holds the mapping between IP - and hostnames that will be injected as an entry - in the pod's hosts file. - properties: - hostnames: - description: Hostnames for the above IP address. - items: - type: string - type: array - ip: - description: IP address of the host file entry. - type: string - type: object - type: array - hostIPC: - description: 'Use the host''s ipc namespace. Optional: - Default to false.' - type: boolean - hostNetwork: - description: Host networking requested for this pod. - Use the host's network namespace. If this option is - set, the ports that will be used must be specified. - Default to false. - type: boolean - hostPID: - description: 'Use the host''s pid namespace. Optional: - Default to false.' - type: boolean - hostUsers: - description: 'Use the host''s user namespace. Optional: - Default to true. If set to true or not present, the - pod will be run in the host user namespace, useful - for when the pod needs a feature only available to - the host user namespace, such as loading a kernel - module with CAP_SYS_MODULE. When set to false, a new - userns is created for the pod.' 
- type: boolean - hostname: - description: Specifies the hostname of the Pod If not - specified, the pod's hostname will be set to a system-defined - value. - type: string - imagePullSecrets: - description: 'ImagePullSecrets is an optional list of - references to secrets in the same namespace to use - for pulling any of the images used by this PodSpec. - If specified, these secrets will be passed to individual - puller implementations for them to use. More info: - https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod' - items: - description: LocalObjectReference contains enough - information to let you locate the referenced object - inside the same namespace. - properties: - name: - description: 'Name of the referent. More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, kind, - uid?' - type: string - type: object - x-kubernetes-map-type: atomic - type: array - initContainers: - description: List of initialization containers belonging - to the pod. Init containers are executed in order - prior to containers being started. If any init container - fails, the pod is considered to have failed and is - handled according to its restartPolicy. The name for - an init container or normal container must be unique - among all containers. - items: - description: A single application container that you - want to run within a pod. - properties: - args: - description: 'Arguments to the entrypoint. The - container image''s CMD is used if this is not - provided. Variable references $(VAR_NAME) are - expanded using the container''s environment. - If a variable cannot be resolved, the reference - in the input string will be unchanged. Double - $$ are reduced to a single $, which allows for - escaping the $(VAR_NAME) syntax: i.e.' - items: - type: string - type: array - command: - description: 'Entrypoint array. Not executed within - a shell. The container image''s ENTRYPOINT is - used if this is not provided. Variable references - $(VAR_NAME) are expanded using the container''s - environment. If a variable cannot be resolved, - the reference in the input string will be unchanged. - Double $$ are reduced to a single $, which allows - for escaping the $(VAR_NAME) syntax: i.e.' - items: - type: string - type: array - env: - description: List of environment variables to - set in the container. Cannot be updated. - items: - description: EnvVar represents an environment - variable present in a Container. - properties: - name: - description: Name of the environment variable. - Must be a C_IDENTIFIER. - type: string - value: - description: 'Variable references $(VAR_NAME) - are expanded using the previously defined - environment variables in the container - and any service environment variables. - If a variable cannot be resolved, the - reference in the input string will be - unchanged. Double $$ are reduced to a - single $, which allows for escaping the - $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" - will produce the string literal "$(VAR_NAME)".' - type: string - valueFrom: - description: Source for the environment - variable's value. Cannot be used if value - is not empty. - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - properties: - key: - description: The key to select. - type: string - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. 
- apiVersion, kind, uid?' - type: string - optional: - description: Specify whether the - ConfigMap or its key must be defined - type: boolean - required: - - key - type: object - x-kubernetes-map-type: atomic - fieldRef: - description: 'Selects a field of the - pod: supports metadata.name, metadata.namespace, - `metadata.labels['''']`, `metadata.annotations['''']`, - spec.nodeName, spec.serviceAccountName, - status.hostIP, status.podIP, status.podIPs.' - properties: - apiVersion: - description: Version of the schema - the FieldPath is written in terms - of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to - select in the specified API version. - type: string - required: - - fieldPath - type: object - x-kubernetes-map-type: atomic - resourceFieldRef: - description: 'Selects a resource of - the container: only resources limits - and requests (limits.cpu, limits.memory, - limits.ephemeral-storage, requests.cpu, - requests.memory and requests.ephemeral-storage) - are currently supported.' - properties: - containerName: - description: 'Container name: required - for volumes, optional for env - vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies the output - format of the exposed resources, - defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource - to select' - type: string - required: - - resource - type: object - x-kubernetes-map-type: atomic - secretKeyRef: - description: Selects a key of a secret - in the pod's namespace - properties: - key: - description: The key of the secret - to select from. Must be a valid - secret key. - type: string - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' - type: string - optional: - description: Specify whether the - Secret or its key must be defined - type: boolean - required: - - key - type: object - x-kubernetes-map-type: atomic - type: object - required: - - name - type: object - type: array - envFrom: - description: List of sources to populate environment - variables in the container. The keys defined - within a source must be a C_IDENTIFIER. All - invalid keys will be reported as an event when - the container is starting. When a key exists - in multiple sources, the value associated with - the last source will take precedence. Values - defined by an Env with a duplicate key will - take precedence. Cannot be updated. - items: - description: EnvFromSource represents the source - of a set of ConfigMaps - properties: - configMapRef: - description: The ConfigMap to select from - properties: - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the ConfigMap - must be defined - type: boolean - type: object - x-kubernetes-map-type: atomic - prefix: - description: An optional identifier to prepend - to each key in the ConfigMap. Must be - a C_IDENTIFIER. - type: string - secretRef: - description: The Secret to select from - properties: - name: - description: 'Name of the referent. 
- More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the Secret - must be defined - type: boolean - type: object - x-kubernetes-map-type: atomic - type: object - type: array - image: - description: 'Container image name. More info: - https://kubernetes.io/docs/concepts/containers/images - This field is optional to allow higher level - config management to default or override container - images in workload controllers like Deployments - and StatefulSets.' - type: string - imagePullPolicy: - description: 'Image pull policy. One of Always, - Never, IfNotPresent. Defaults to Always if :latest - tag is specified, or IfNotPresent otherwise. - Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' - type: string - lifecycle: - description: Actions that the management system - should take in response to container lifecycle - events. Cannot be updated. - properties: - postStart: - description: 'PostStart is called immediately - after a container is created. If the handler - fails, the container is terminated and restarted - according to its restart policy. Other management - of the container blocks until the hook completes. - More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks' - properties: - exec: - description: Exec specifies the action - to take. - properties: - command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. - items: - type: string - type: array - type: object - httpGet: - description: HTTPGet specifies the http - request to perform. - properties: - host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. - type: string - httpHeaders: - description: Custom headers to set - in the request. HTTP allows repeated - headers. - items: - description: HTTPHeader describes - a custom header to be used in - HTTP probes - properties: - name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. - type: string - value: - description: The header field - value - type: string - required: - - name - - value - type: object - type: array - path: - description: Path to access on the - HTTP server. - type: string - port: - anyOf: - - type: integer - - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. - type: string - required: - - port - type: object - tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. 
- properties: - host: - description: 'Optional: Host name - to connect to, defaults to the pod - IP.' - type: string - port: - anyOf: - - type: integer - - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - required: - - port - type: object - type: object - preStop: - description: PreStop is called immediately - before a container is terminated due to - an API request or management event such - as liveness/startup probe failure, preemption, - resource contention, etc. The handler is - not called if the container crashes or exits. - The Pod's termination grace period countdown - begins before the PreStop hook is executed. - properties: - exec: - description: Exec specifies the action - to take. - properties: - command: - description: Command is the command - line to execute inside the container, - the working directory for the command is - root ('/') in the container's filesystem. - The command is simply exec'd, it - is not run inside a shell, so traditional - shell instructions ('|', etc) won't - work. To use a shell, you need to - explicitly call out to that shell. - Exit status of 0 is treated as live/healthy - and non-zero is unhealthy. - items: - type: string - type: array - type: object - httpGet: - description: HTTPGet specifies the http - request to perform. - properties: - host: - description: Host name to connect - to, defaults to the pod IP. You - probably want to set "Host" in httpHeaders - instead. - type: string - httpHeaders: - description: Custom headers to set - in the request. HTTP allows repeated - headers. - items: - description: HTTPHeader describes - a custom header to be used in - HTTP probes - properties: - name: - description: The header field - name. This will be canonicalized - upon output, so case-variant - names will be understood as - the same header. - type: string - value: - description: The header field - value - type: string - required: - - name - - value - type: object - type: array - path: - description: Path to access on the - HTTP server. - type: string - port: - anyOf: - - type: integer - - type: string - description: Name or number of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - scheme: - description: Scheme to use for connecting - to the host. Defaults to HTTP. - type: string - required: - - port - type: object - tcpSocket: - description: Deprecated. TCPSocket is - NOT supported as a LifecycleHandler - and kept for the backward compatibility. - There are no validation of this field - and lifecycle hooks will fail in runtime - when tcp handler is specified. - properties: - host: - description: 'Optional: Host name - to connect to, defaults to the pod - IP.' - type: string - port: - anyOf: - - type: integer - - type: string - description: Number or name of the - port to access on the container. - Number must be in the range 1 to - 65535. Name must be an IANA_SVC_NAME. - x-kubernetes-int-or-string: true - required: - - port - type: object - type: object - type: object - livenessProbe: - description: 'Periodic probe of container liveness. - Container will be restarted if the probe fails. - Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - properties: - exec: - description: Exec specifies the action to - take. 
  [Elided: the container's livenessProbe and readinessProbe, which share one
  probe shape (exec, grpc, httpGet, and tcpSocket actions, plus
  failureThreshold, initialDelaySeconds, periodSeconds, successThreshold,
  terminationGracePeriodSeconds, timeoutSeconds), along with the container
  name and the ports list (containerPort, hostIP, hostPort, name, protocol).]
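  In practice the two probe blocks collapse to a few commonly set fields. A
  hedged sketch with a hypothetical HTTP health endpoint:

  # Hypothetical Pod showing liveness/readiness probes and a named port.
  apiVersion: v1
  kind: Pod
  metadata:
    name: probe-demo                          # example name only
  spec:
    containers:
      - name: app
        image: registry.example.com/app:1.0   # placeholder image
        ports:
          - name: http                        # must be an IANA_SVC_NAME
            containerPort: 8080
            protocol: TCP                     # default
        livenessProbe:
          httpGet:
            path: /healthz                    # hypothetical endpoint
            port: http                        # refers to the named port above
          initialDelaySeconds: 10
          periodSeconds: 10                   # default
          failureThreshold: 3                 # default
        readinessProbe:
          tcpSocket:
            port: 8080
          timeoutSeconds: 1                   # default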
  [Elided: resizePolicy, resources (claims, limits, and requests using the
  Kubernetes quantity pattern), the container-level securityContext
  (allowPrivilegeEscalation, capabilities, privileged, procMount,
  readOnlyRootFilesystem, runAsGroup/runAsNonRoot/runAsUser, seLinuxOptions,
  seccompProfile, windowsOptions), startupProbe (same shape as the probes
  above), stdin, stdinOnce, terminationMessagePath, terminationMessagePolicy,
  tty, volumeDevices, and volumeMounts (mountPath, mountPropagation, name,
  readOnly, subPath, subPathExpr).]
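  Similarly, the resources, container-level securityContext, and volumeMounts
  schemas correspond to stanzas like the following sketch (all values
  illustrative):

  # Hypothetical Pod combining resource requests/limits, a restrictive
  # securityContext, and a read-only volume mount.
  apiVersion: v1
  kind: Pod
  metadata:
    name: resources-demo                      # example name only
  spec:
    containers:
      - name: app
        image: registry.example.com/app:1.0   # placeholder image
        resources:
          requests:                           # defaults to limits if omitted
            cpu: 500m
            memory: 256Mi
          limits:
            cpu: "1"
            memory: 512Mi
        securityContext:
          runAsNonRoot: true
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
          capabilities:
            drop: ["ALL"]
        volumeMounts:
          - name: data                        # must match a pod volume name
            mountPath: /var/lib/app
            readOnly: true
    volumes:
      - name: data
        emptyDir: {}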
  [Elided: workingDir closes the containers array; the hunk then removes the
  pod-level fields nodeName, nodeSelector, os, overhead, preemptionPolicy,
  priority, priorityClassName, readinessGates, resourceClaims, restartPolicy,
  runtimeClassName, schedulerName, schedulingGates, the pod-level
  securityContext (fsGroup, fsGroupChangePolicy,
  runAsGroup/runAsNonRoot/runAsUser, seLinuxOptions, seccompProfile,
  supplementalGroups, sysctls, windowsOptions), and the deprecated
  serviceAccount alias.]
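  The pod-scope fields removed here govern scheduling and pod-wide security
  defaults. One hedged example (the PriorityClass name is hypothetical and
  must already exist in the cluster):

  # Hypothetical Pod showing pod-level scheduling and security fields.
  apiVersion: v1
  kind: Pod
  metadata:
    name: podspec-demo                        # example name only
  spec:
    priorityClassName: high-priority          # hypothetical PriorityClass
    schedulerName: default-scheduler          # the default when unset
    nodeSelector:
      kubernetes.io/os: linux
    securityContext:
      runAsUser: 1000
      runAsGroup: 1000
      fsGroup: 2000                           # volume ownership GID
      seccompProfile:
        type: RuntimeDefault                  # container runtime's profile
    containers:
      - name: app
        image: registry.example.com/app:1.0   # placeholder image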
  [Elided: serviceAccountName, setHostnameAsFQDN, shareProcessNamespace,
  subdomain, terminationGracePeriodSeconds, tolerations (effect, key, operator,
  tolerationSeconds, value), and most of topologySpreadConstraints
  (labelSelector, matchLabelKeys, maxSkew, minDomains, nodeAffinityPolicy,
  nodeTaintsPolicy).]
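  Tolerations and topology spread constraints are the densest of these schemas;
  in use they reduce to something like this sketch (labels and the taint key
  are hypothetical):

  # Hypothetical Pod tolerating a control-plane taint and spreading evenly
  # across nodes.
  apiVersion: v1
  kind: Pod
  metadata:
    name: spread-demo                         # example name only
    labels:
      app: demo
  spec:
    tolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists                      # matches any value for the key
        effect: NoSchedule
    topologySpreadConstraints:
      - maxSkew: 1                            # max pod-count difference allowed
        topologyKey: kubernetes.io/hostname   # each node is its own domain
        whenUnsatisfiable: DoNotSchedule      # the stricter of the two options
        labelSelector:
          matchLabels:
            app: demo
    containers:
      - name: app
        image: registry.example.com/app:1.0   # placeholder image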
Options are: - Honor: - nodes without taints, along with tainted nodes - for which the incoming pod has a toleration, - are included. - Ignore: node taints are ignored. - All nodes are included. \n If this value is - nil, the behavior is equivalent to the Ignore - policy." - type: string - topologyKey: - description: TopologyKey is the key of node labels. - Nodes that have a label with this key and identical - values are considered to be in the same topology. - We consider each as a "bucket", - and try to put balanced number of pods into - each bucket. We define a domain as a particular - instance of a topology. - type: string - whenUnsatisfiable: - description: WhenUnsatisfiable indicates how to - deal with a pod if it doesn't satisfy the spread - constraint. - DoNotSchedule (default) tells - the scheduler not to schedule it. - ScheduleAnyway - tells the scheduler to schedule the pod in any - location, but giving higher precedence to topologies - that would help reduce the skew. - type: string - required: - - maxSkew - - topologyKey - - whenUnsatisfiable - type: object - type: array - x-kubernetes-list-map-keys: - - topologyKey - - whenUnsatisfiable - x-kubernetes-list-type: map - volumes: - description: 'List of volumes that can be mounted by - containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes' - items: - description: Volume represents a named volume in a - pod that may be accessed by any container in the - pod. - properties: - awsElasticBlockStore: - description: 'awsElasticBlockStore represents - an AWS Disk resource that is attached to a kubelet''s - host machine and then exposed to the pod. More - info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' - properties: - fsType: - description: 'fsType is the filesystem type - of the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore - TODO: how do we prevent errors in the filesystem - from compromising the machine' - type: string - partition: - description: 'partition is the partition in - the volume that you want to mount. If omitted, - the default is to mount by volume name. - Examples: For volume /dev/sda1, you specify - the partition as "1". Similarly, the volume - partition for /dev/sda is "0" (or you can - leave the property empty).' - format: int32 - type: integer - readOnly: - description: 'readOnly value true will force - the readOnly setting in VolumeMounts. More - info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' - type: boolean - volumeID: - description: 'volumeID is unique ID of the - persistent disk resource in AWS (Amazon - EBS volume). More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore' - type: string - required: - - volumeID - type: object - azureDisk: - description: azureDisk represents an Azure Data - Disk mount on the host and bind mount to the - pod. - properties: - cachingMode: - description: 'cachingMode is the Host Caching - mode: None, Read Only, Read Write.' - type: string - diskName: - description: diskName is the Name of the data - disk in the blob storage - type: string - diskURI: - description: diskURI is the URI of data disk - in the blob storage - type: string - fsType: - description: fsType is Filesystem type to - mount. 
Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. - type: string - kind: - description: 'kind expected values are Shared: - multiple blob disks per storage account Dedicated: - single blob disk per storage account Managed: - azure managed data disk (only in managed - availability set). defaults to shared' - type: string - readOnly: - description: readOnly Defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. - type: boolean - required: - - diskName - - diskURI - type: object - azureFile: - description: azureFile represents an Azure File - Service mount on the host and bind mount to - the pod. - properties: - readOnly: - description: readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. - type: boolean - secretName: - description: secretName is the name of secret - that contains Azure Storage Account Name - and Key - type: string - shareName: - description: shareName is the azure share - Name - type: string - required: - - secretName - - shareName - type: object - cephfs: - description: cephFS represents a Ceph FS mount - on the host that shares a pod's lifetime - properties: - monitors: - description: 'monitors is Required: Monitors - is a collection of Ceph monitors More info: - https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' - items: - type: string - type: array - path: - description: 'path is Optional: Used as the - mounted root, rather than the full Ceph - tree, default is /' - type: string - readOnly: - description: 'readOnly is Optional: Defaults - to false (read/write). ReadOnly here will - force the ReadOnly setting in VolumeMounts. - More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' - type: boolean - secretFile: - description: 'secretFile is Optional: SecretFile - is the path to key ring for User, default - is /etc/ceph/user.secret More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' - type: string - secretRef: - description: 'secretRef is Optional: SecretRef - is reference to the authentication secret - for User, default is empty. More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - user: - description: 'user is optional: User is the - rados user name, default is admin More info: - https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it' - type: string - required: - - monitors - type: object - cinder: - description: 'cinder represents a cinder volume - attached and mounted on kubelets host machine. - More info: https://examples.k8s.io/mysql-cinder-pd/README.md' - properties: - fsType: - description: 'fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://examples.k8s.io/mysql-cinder-pd/README.md' - type: string - readOnly: - description: 'readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. 
More info: https://examples.k8s.io/mysql-cinder-pd/README.md' - type: boolean - secretRef: - description: 'secretRef is optional: points - to a secret object containing parameters - used to connect to OpenStack.' - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - volumeID: - description: 'volumeID used to identify the - volume in cinder. More info: https://examples.k8s.io/mysql-cinder-pd/README.md' - type: string - required: - - volumeID - type: object - configMap: - description: configMap represents a configMap - that should populate this volume - properties: - defaultMode: - description: 'defaultMode is optional: mode - bits used to set permissions on created - files by default. Must be an octal value - between 0000 and 0777 or a decimal value - between 0 and 511. YAML accepts both octal - and decimal values, JSON requires decimal - values for mode bits. Defaults to 0644. - Directories within the path are not affected - by this setting.' - format: int32 - type: integer - items: - description: items if unspecified, each key-value - pair in the Data field of the referenced - ConfigMap will be projected into the volume - as a file whose name is the key and content - is the value. If specified, the listed keys - will be projected into the specified paths, - and unlisted keys will not be present. - items: - description: Maps a string key to a path - within a volume. - properties: - key: - description: key is the key to project. - type: string - mode: - description: 'mode is Optional: mode - bits used to set permissions on this - file. Must be an octal value between - 0000 and 0777 or a decimal value between - 0 and 511. YAML accepts both octal - and decimal values, JSON requires - decimal values for mode bits. If not - specified, the volume defaultMode - will be used.' - format: int32 - type: integer - path: - description: path is the relative path - of the file to map the key to. May - not be an absolute path. May not contain - the path element '..'. May not start - with the string '..'. - type: string - required: - - key - - path - type: object - type: array - name: - description: 'Name of the referent. More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: optional specify whether the - ConfigMap or its keys must be defined - type: boolean - type: object - x-kubernetes-map-type: atomic - csi: - description: csi (Container Storage Interface) - represents ephemeral storage that is handled - by certain external CSI drivers (Beta feature). - properties: - driver: - description: driver is the name of the CSI - driver that handles this volume. Consult - with your admin for the correct name as - registered in the cluster. - type: string - fsType: - description: fsType to mount. Ex. "ext4", - "xfs", "ntfs". If not provided, the empty - value is passed to the associated CSI driver - which will determine the default filesystem - to apply. - type: string - nodePublishSecretRef: - description: nodePublishSecretRef is a reference - to the secret object containing sensitive - information to pass to the CSI driver to - complete the CSI NodePublishVolume and NodeUnpublishVolume - calls. 
This field is optional, and may - be empty if no secret is required. If the - secret object contains more than one secret, - all secret references are passed. - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - readOnly: - description: readOnly specifies a read-only - configuration for the volume. Defaults to - false (read/write). - type: boolean - volumeAttributes: - additionalProperties: - type: string - description: volumeAttributes stores driver-specific - properties that are passed to the CSI driver. - Consult your driver's documentation for - supported values. - type: object - required: - - driver - type: object - downwardAPI: - description: downwardAPI represents downward API - about the pod that should populate this volume - properties: - defaultMode: - description: 'Optional: mode bits used to set - permissions on created files by default. - Must be an octal value - between 0000 and 0777 or a decimal value - between 0 and 511. YAML accepts both octal - and decimal values, JSON requires decimal - values for mode bits. Defaults to 0644. - Directories within the path are not affected - by this setting.' - format: int32 - type: integer - items: - description: Items is a list of downward API - volume file - items: - description: DownwardAPIVolumeFile represents - information to create the file containing - the pod field - properties: - fieldRef: - description: 'Required: Selects a field - of the pod: only annotations, labels, - name and namespace are supported.' - properties: - apiVersion: - description: Version of the schema - the FieldPath is written in terms - of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to - select in the specified API version. - type: string - required: - - fieldPath - type: object - x-kubernetes-map-type: atomic - mode: - description: 'Optional: mode bits used - to set permissions on this file, must - be an octal value between 0000 and - 0777 or a decimal value between 0 - and 511. YAML accepts both octal and - decimal values, JSON requires decimal - values for mode bits. If not specified, - the volume defaultMode will be used.' - format: int32 - type: integer - path: - description: 'Required: Path is the - relative path name of the file to - be created. Must not be absolute or - contain the ''..'' path. Must be utf-8 - encoded. The first item of the relative - path must not start with ''..''' - type: string - resourceFieldRef: - description: 'Selects a resource of - the container: only resources limits - and requests (limits.cpu, limits.memory, - requests.cpu and requests.memory) - are currently supported.'
- properties: - containerName: - description: 'Container name: required - for volumes, optional for env - vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies the output - format of the exposed resources, - defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource - to select' - type: string - required: - - resource - type: object - x-kubernetes-map-type: atomic - required: - - path - type: object - type: array - type: object - emptyDir: - description: 'emptyDir represents a temporary - directory that shares a pod''s lifetime. More - info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' - properties: - medium: - description: 'medium represents what type - of storage medium should back this directory. - The default is "" which means to use the - node''s default medium. Must be an empty - string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' - type: string - sizeLimit: - anyOf: - - type: integer - - type: string - description: 'sizeLimit is the total amount - of local storage required for this EmptyDir - volume. The size limit is also applicable - for memory medium. The maximum usage on - memory medium EmptyDir would be the minimum - value between the SizeLimit specified here - and the sum of memory limits of all containers - in a pod. The default is nil which means - that the limit is undefined. More info: - https://kubernetes.' - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - ephemeral: - description: ephemeral represents a volume that - is handled by a cluster storage driver. The - volume's lifecycle is tied to the pod that defines - it - it will be created before the pod starts, - and deleted when the pod is removed. - properties: - volumeClaimTemplate: - description: Will be used to create a stand-alone - PVC to provision the volume. The pod in - which this EphemeralVolumeSource is embedded - will be the owner of the PVC, i.e. the PVC - will be deleted together with the pod. The - name of the PVC will be `<pod name>-<volume name>` - where `<volume name>` is the name - from the `PodSpec.Volumes` array entry. - properties: - metadata: - description: May contain labels and annotations - that will be copied into the PVC when - creating it. No other fields are allowed - and will be rejected during validation. - properties: - annotations: - additionalProperties: - type: string - type: object - finalizers: - items: - type: string - type: array - labels: - additionalProperties: - type: string - type: object - name: - type: string - namespace: - type: string - type: object - spec: - description: The specification for the - PersistentVolumeClaim. The entire content - is copied unchanged into the PVC that - gets created from this template. The - same fields as in a PersistentVolumeClaim - are also valid here. - properties: - accessModes: - description: 'accessModes contains - the desired access modes the volume - should have.
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' - items: - type: string - type: array - dataSource: - description: 'dataSource field can - be used to specify either: * An - existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) - * An existing PVC (PersistentVolumeClaim) - If the provisioner or an external - controller can support the specified - data source, it will create a new - volume based on the contents of - the specified data source.' - properties: - apiGroup: - description: APIGroup is the group - for the resource being referenced. - If APIGroup is not specified, - the specified Kind must be in - the core API group. For any - other third-party types, APIGroup - is required. - type: string - kind: - description: Kind is the type - of resource being referenced - type: string - name: - description: Name is the name - of resource being referenced - type: string - required: - - kind - - name - type: object - x-kubernetes-map-type: atomic - dataSourceRef: - description: dataSourceRef specifies - the object from which to populate - the volume with data, if a non-empty - volume is desired. This may be any - object from a non-empty API group - (non core object) or a PersistentVolumeClaim - object. When this field is specified, - volume binding will only succeed - if the type of the specified object - matches some installed volume populator - or dynamic provisioner. - properties: - apiGroup: - description: APIGroup is the group - for the resource being referenced. - If APIGroup is not specified, - the specified Kind must be in - the core API group. For any - other third-party types, APIGroup - is required. - type: string - kind: - description: Kind is the type - of resource being referenced - type: string - name: - description: Name is the name - of resource being referenced - type: string - namespace: - description: Namespace is the - namespace of resource being - referenced Note that when a - namespace is specified, a gateway.networking.k8s.io/ReferenceGrant - object is required in the referent - namespace to allow that namespace's - owner to accept the reference. - See the ReferenceGrant documentation - for details. (Alpha) This field - requires the CrossNamespaceVolumeDataSource - feature gate to be enabled. - type: string - required: - - kind - - name - type: object - resources: - description: 'resources represents - the minimum resources the volume - should have. If RecoverVolumeExpansionFailure - feature is enabled users are allowed - to specify resource requirements - that are lower than previous value - but must still be higher than capacity - recorded in the status field of - the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources' - properties: - claims: - description: "Claims lists the - names of resources, defined - in spec.resourceClaims, that - are used by this container. - \n This is an alpha field and - requires enabling the DynamicResourceAllocation - feature gate. \n This field - is immutable. It can only be - set for containers." - items: - description: ResourceClaim references - one entry in PodSpec.ResourceClaims. - properties: - name: - description: Name must match - the name of one entry - in pod.spec.resourceClaims - of the Pod where this - field is used. It makes - that resource available - inside a container. 
- type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: 'Limits describes - the maximum amount of compute - resources allowed. More info: - https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: 'Requests describes - the minimum amount of compute - resources required. If Requests - is omitted for a container, - it defaults to Limits if that - is explicitly specified, otherwise - to an implementation-defined - value. Requests cannot exceed - Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/' - type: object - type: object - selector: - description: selector is a label query - over volumes to consider for binding. - properties: - matchExpressions: - description: matchExpressions - is a list of label selector - requirements. The requirements - are ANDed. - items: - description: A label selector - requirement is a selector - that contains values, a key, - and an operator that relates - the key and values. - properties: - key: - description: key is the - label key that the selector - applies to. - type: string - operator: - description: operator represents - a key's relationship to - a set of values. Valid - operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an - array of string values. - If the operator is In - or NotIn, the values array - must be non-empty. If - the operator is Exists - or DoesNotExist, the values - array must be empty. This - array is replaced during - a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a - map of {key,value} pairs. A - single {key,value} in the matchLabels - map is equivalent to an element - of matchExpressions, whose key - field is "key", the operator - is "In", and the values array - contains only "value". The requirements - are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - storageClassName: - description: 'storageClassName is - the name of the StorageClass required - by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' - type: string - volumeMode: - description: volumeMode defines what - type of volume is required by the - claim. Value of Filesystem is implied - when not included in claim spec. - type: string - volumeName: - description: volumeName is the binding - reference to the PersistentVolume - backing this claim. - type: string - type: object - required: - - spec - type: object - type: object - fc: - description: fc represents a Fibre Channel resource - that is attached to a kubelet's host machine - and then exposed to the pod. - properties: - fsType: - description: 'fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". 
Implicitly inferred to be - "ext4" if unspecified. TODO: how do we prevent - errors in the filesystem from compromising - the machine' - type: string - lun: - description: 'lun is Optional: FC target lun - number' - format: int32 - type: integer - readOnly: - description: 'readOnly is Optional: Defaults - to false (read/write). ReadOnly here will - force the ReadOnly setting in VolumeMounts.' - type: boolean - targetWWNs: - description: 'targetWWNs is Optional: FC target - worldwide names (WWNs)' - items: - type: string - type: array - wwids: - description: 'wwids Optional: FC volume world - wide identifiers (wwids) Either wwids or - combination of targetWWNs and lun must be - set, but not both simultaneously.' - items: - type: string - type: array - type: object - flexVolume: - description: flexVolume represents a generic volume - resource that is provisioned/attached using - an exec based plugin. - properties: - driver: - description: driver is the name of the driver - to use for this volume. - type: string - fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". The default filesystem depends - on FlexVolume script. - type: string - options: - additionalProperties: - type: string - description: 'options is Optional: this field - holds extra command options if any.' - type: object - readOnly: - description: 'readOnly is Optional: defaults - to false (read/write). ReadOnly here will - force the ReadOnly setting in VolumeMounts.' - type: boolean - secretRef: - description: 'secretRef is Optional: secretRef - is reference to the secret object containing - sensitive information to pass to the plugin - scripts. This may be empty if no secret - object is specified. If the secret object - contains more than one secret, all secrets - are passed to the plugin scripts.' - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - required: - - driver - type: object - flocker: - description: flocker represents a Flocker volume - attached to a kubelet's host machine. This depends - on the Flocker control service being running - properties: - datasetName: - description: datasetName is Name of the dataset - stored as metadata -> name on the dataset - for Flocker should be considered as deprecated - type: string - datasetUUID: - description: datasetUUID is the UUID of the - dataset. This is unique identifier of a - Flocker dataset - type: string - type: object - gcePersistentDisk: - description: 'gcePersistentDisk represents a GCE - Disk resource that is attached to a kubelet''s - host machine and then exposed to the pod. More - info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' - properties: - fsType: - description: 'fsType is filesystem type of - the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk - TODO: how do we prevent errors in the filesystem - from compromising the machine' - type: string - partition: - description: 'partition is the partition in - the volume that you want to mount. 
If omitted, - the default is to mount by volume name. - Examples: For volume /dev/sda1, you specify - the partition as "1". Similarly, the volume - partition for /dev/sda is "0" (or you can - leave the property empty). More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' - format: int32 - type: integer - pdName: - description: 'pdName is unique name of the - PD resource in GCE. Used to identify the - disk in GCE. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' - type: string - readOnly: - description: 'readOnly here will force the - ReadOnly setting in VolumeMounts. Defaults - to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk' - type: boolean - required: - - pdName - type: object - gitRepo: - description: 'gitRepo represents a git repository - at a particular revision. DEPRECATED: GitRepo - is deprecated. To provision a container with - a git repo, mount an EmptyDir into an InitContainer - that clones the repo using git, then mount the - EmptyDir into the Pod''s container.' - properties: - directory: - description: directory is the target directory - name. Must not contain or start with '..'. If - '.' is supplied, the volume directory will - be the git repository. Otherwise, if specified, - the volume will contain the git repository - in the subdirectory with the given name. - type: string - repository: - description: repository is the URL - type: string - revision: - description: revision is the commit hash for - the specified revision. - type: string - required: - - repository - type: object - glusterfs: - description: 'glusterfs represents a Glusterfs - mount on the host that shares a pod''s lifetime. - More info: https://examples.k8s.io/volumes/glusterfs/README.md' - properties: - endpoints: - description: 'endpoints is the endpoint name - that details Glusterfs topology. More info: - https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' - type: string - path: - description: 'path is the Glusterfs volume - path. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' - type: string - readOnly: - description: 'readOnly here will force the - Glusterfs volume to be mounted with read-only - permissions. Defaults to false. More info: - https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod' - type: boolean - required: - - endpoints - - path - type: object - hostPath: - description: 'hostPath represents a pre-existing - file or directory on the host machine that is - directly exposed to the container. This is generally - used for system agents or other privileged things - that are allowed to see the host machine. Most - containers will NOT need this. More info: https://kubernetes.' - properties: - path: - description: 'path of the directory on the - host. If the path is a symlink, it will - follow the link to the real path. More info: - https://kubernetes.io/docs/concepts/storage/volumes#hostpath' - type: string - type: - description: 'type for HostPath Volume Defaults - to "" More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath' - type: string - required: - - path - type: object - iscsi: - description: 'iscsi represents an ISCSI Disk resource - that is attached to a kubelet''s host machine - and then exposed to the pod. 
More info: https://examples.k8s.io/volumes/iscsi/README.md' - properties: - chapAuthDiscovery: - description: chapAuthDiscovery defines whether - support iSCSI Discovery CHAP authentication - type: boolean - chapAuthSession: - description: chapAuthSession defines whether - support iSCSI Session CHAP authentication - type: boolean - fsType: - description: 'fsType is the filesystem type - of the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified. More info: - https://kubernetes.io/docs/concepts/storage/volumes#iscsi - TODO: how do we prevent errors in the filesystem - from compromising the machine' - type: string - initiatorName: - description: initiatorName is the custom iSCSI - Initiator Name. If initiatorName is specified - with iscsiInterface simultaneously, new - iSCSI interface <target portal>:<volume name> will be created for the connection. - type: string - iqn: - description: iqn is the target iSCSI Qualified - Name. - type: string - iscsiInterface: - description: iscsiInterface is the interface - Name that uses an iSCSI transport. Defaults - to 'default' (tcp). - type: string - lun: - description: lun represents iSCSI Target Lun - number. - format: int32 - type: integer - portals: - description: portals is the iSCSI Target Portal - List. The portal is either an IP or ip_addr:port - if the port is other than default (typically - TCP ports 860 and 3260). - items: - type: string - type: array - readOnly: - description: readOnly here will force the - ReadOnly setting in VolumeMounts. Defaults - to false. - type: boolean - secretRef: - description: secretRef is the CHAP Secret - for iSCSI target and initiator authentication - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - targetPortal: - description: targetPortal is iSCSI Target - Portal. The Portal is either an IP or ip_addr:port - if the port is other than default (typically - TCP ports 860 and 3260). - type: string - required: - - iqn - - lun - - targetPortal - type: object - name: - description: 'name of the volume. Must be a DNS_LABEL - and unique within the pod. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - nfs: - description: 'nfs represents an NFS mount on the - host that shares a pod''s lifetime More info: - https://kubernetes.io/docs/concepts/storage/volumes#nfs' - properties: - path: - description: 'path that is exported by the - NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' - type: string - readOnly: - description: 'readOnly here will force the - NFS export to be mounted with read-only - permissions. Defaults to false. More info: - https://kubernetes.io/docs/concepts/storage/volumes#nfs' - type: boolean - server: - description: 'server is the hostname or IP - address of the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs' - type: string - required: - - path - - server - type: object - persistentVolumeClaim: - description: 'persistentVolumeClaimVolumeSource - represents a reference to a PersistentVolumeClaim - in the same namespace.
More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' - properties: - claimName: - description: 'claimName is the name of a PersistentVolumeClaim - in the same namespace as the pod using this - volume. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims' - type: string - readOnly: - description: readOnly Will force the ReadOnly - setting in VolumeMounts. Default false. - type: boolean - required: - - claimName - type: object - photonPersistentDisk: - description: photonPersistentDisk represents a - PhotonController persistent disk attached and - mounted on kubelets host machine - properties: - fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. - type: string - pdID: - description: pdID is the ID that identifies - Photon Controller persistent disk - type: string - required: - - pdID - type: object - portworxVolume: - description: portworxVolume represents a portworx - volume attached and mounted on kubelets host - machine - properties: - fsType: - description: fSType represents the filesystem - type to mount Must be a filesystem type - supported by the host operating system. - Ex. "ext4", "xfs". Implicitly inferred to - be "ext4" if unspecified. - type: string - readOnly: - description: readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. - type: boolean - volumeID: - description: volumeID uniquely identifies - a Portworx volume - type: string - required: - - volumeID - type: object - projected: - description: projected items for all in one resources - secrets, configmaps, and downward API - properties: - defaultMode: - description: defaultMode are the mode bits - used to set permissions on created files - by default. Must be an octal value between - 0000 and 0777 or a decimal value between - 0 and 511. YAML accepts both octal and decimal - values, JSON requires decimal values for - mode bits. Directories within the path are - not affected by this setting. - format: int32 - type: integer - sources: - description: sources is the list of volume - projections - items: - description: Projection that may be projected - along with other supported volume types - properties: - configMap: - description: configMap information about - the configMap data to project - properties: - items: - description: items if unspecified, - each key-value pair in the Data - field of the referenced ConfigMap - will be projected into the volume - as a file whose name is the key - and content is the value. If specified, - the listed keys will be projected - into the specified paths, and - unlisted keys will not be present. - items: - description: Maps a string key - to a path within a volume. - properties: - key: - description: key is the key - to project. - type: string - mode: - description: 'mode is Optional: - mode bits used to set permissions - on this file. Must be an - octal value between 0000 - and 0777 or a decimal value - between 0 and 511. YAML - accepts both octal and decimal - values, JSON requires decimal - values for mode bits. If - not specified, the volume - defaultMode will be used.' - format: int32 - type: integer - path: - description: path is the relative - path of the file to map - the key to. May not be an - absolute path. May not contain - the path element '..'. 
May - not start with the string - '..'. - type: string - required: - - key - - path - type: object - type: array - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' - type: string - optional: - description: optional specify whether - the ConfigMap or its keys must - be defined - type: boolean - type: object - x-kubernetes-map-type: atomic - downwardAPI: - description: downwardAPI information - about the downwardAPI data to project - properties: - items: - description: Items is a list of - DownwardAPIVolume file - items: - description: DownwardAPIVolumeFile - represents information to create - the file containing the pod - field - properties: - fieldRef: - description: 'Required: Selects - a field of the pod: only - annotations, labels, name - and namespace are supported.' - properties: - apiVersion: - description: Version of - the schema the FieldPath - is written in terms - of, defaults to "v1". - type: string - fieldPath: - description: Path of the - field to select in the - specified API version. - type: string - required: - - fieldPath - type: object - x-kubernetes-map-type: atomic - mode: - description: 'Optional: mode - bits used to set permissions - on this file, must be an - octal value between 0000 - and 0777 or a decimal value - between 0 and 511. YAML - accepts both octal and decimal - values, JSON requires decimal - values for mode bits. If - not specified, the volume - defaultMode will be used.' - format: int32 - type: integer - path: - description: 'Required: Path - is the relative path name - of the file to be created. - Must not be absolute or - contain the ''..'' path. - Must be utf-8 encoded. The - first item of the relative - path must not start with - ''..''' - type: string - resourceFieldRef: - description: 'Selects a resource - of the container: only resources - limits and requests (limits.cpu, - limits.memory, requests.cpu - and requests.memory) are - currently supported.' - properties: - containerName: - description: 'Container - name: required for volumes, - optional for env vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies - the output format of - the exposed resources, - defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: - resource to select' - type: string - required: - - resource - type: object - x-kubernetes-map-type: atomic - required: - - path - type: object - type: array - type: object - secret: - description: secret information about - the secret data to project - properties: - items: - description: items if unspecified, - each key-value pair in the Data - field of the referenced Secret - will be projected into the volume - as a file whose name is the key - and content is the value. If specified, - the listed keys will be projected - into the specified paths, and - unlisted keys will not be present. - items: - description: Maps a string key - to a path within a volume. - properties: - key: - description: key is the key - to project. - type: string - mode: - description: 'mode is Optional: - mode bits used to set permissions - on this file. Must be an - octal value between 0000 - and 0777 or a decimal value - between 0 and 511. 
YAML - accepts both octal and decimal - values, JSON requires decimal - values for mode bits. If - not specified, the volume - defaultMode will be used.' - format: int32 - type: integer - path: - description: path is the relative - path of the file to map - the key to. May not be an - absolute path. May not contain - the path element '..'. May - not start with the string - '..'. - type: string - required: - - key - - path - type: object - type: array - name: - description: 'Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. - apiVersion, kind, uid?' - type: string - optional: - description: optional field specify - whether the Secret or its key - must be defined - type: boolean - type: object - x-kubernetes-map-type: atomic - serviceAccountToken: - description: serviceAccountToken is - information about the serviceAccountToken - data to project - properties: - audience: - description: audience is the intended - audience of the token. A recipient - of a token must identify itself - with an identifier specified in - the audience of the token, and - otherwise should reject the token. - The audience defaults to the identifier - of the apiserver. - type: string - expirationSeconds: - description: expirationSeconds is - the requested duration of validity - of the service account token. - As the token approaches expiration, - the kubelet volume plugin will - proactively rotate the service - account token. The kubelet will - start trying to rotate the token - if the token is older than 80 - percent of its time to live or - if the token is older than 24 - hours. Defaults to 1 hour and must - be at least 10 minutes. - format: int64 - type: integer - path: - description: path is the path relative - to the mount point of the file - to project the token into. - type: string - required: - - path - type: object - type: object - type: array - type: object - quobyte: - description: quobyte represents a Quobyte mount - on the host that shares a pod's lifetime - properties: - group: - description: group to map volume access to - Default is no group - type: string - readOnly: - description: readOnly here will force the - Quobyte volume to be mounted with read-only - permissions. Defaults to false. - type: boolean - registry: - description: registry represents a single - or multiple Quobyte Registry services specified - as a string as host:port pair (multiple - entries are separated with commas) which - acts as the central registry for volumes - type: string - tenant: - description: tenant owning the given Quobyte - volume in the Backend Used with dynamically - provisioned Quobyte volumes, value is set - by the plugin - type: string - user: - description: user to map volume access to - Defaults to serviceaccount user - type: string - volume: - description: volume is a string that references - an already created Quobyte volume by name. - type: string - required: - - registry - - volume - type: object - rbd: - description: 'rbd represents a Rados Block Device - mount on the host that shares a pod''s lifetime. - More info: https://examples.k8s.io/volumes/rbd/README.md' - properties: - fsType: - description: 'fsType is the filesystem type - of the volume that you want to mount. Tip: - Ensure that the filesystem type is supported - by the host operating system. Examples: - "ext4", "xfs", "ntfs". Implicitly inferred - to be "ext4" if unspecified.
More info: - https://kubernetes.io/docs/concepts/storage/volumes#rbd - TODO: how do we prevent errors in the filesystem - from compromising the machine' - type: string - image: - description: 'image is the rados image name. - More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' - type: string - keyring: - description: 'keyring is the path to key ring - for RBDUser. Default is /etc/ceph/keyring. - More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' - type: string - monitors: - description: 'monitors is a collection of - Ceph monitors. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' - items: - type: string - type: array - pool: - description: 'pool is the rados pool name. - Default is rbd. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' - type: string - readOnly: - description: 'readOnly here will force the - ReadOnly setting in VolumeMounts. Defaults - to false. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' - type: boolean - secretRef: - description: 'secretRef is name of the authentication - secret for RBDUser. If provided overrides - keyring. Default is nil. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - user: - description: 'user is the rados user name. - Default is admin. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it' - type: string - required: - - image - - monitors - type: object - scaleIO: - description: scaleIO represents a ScaleIO persistent - volume attached and mounted on Kubernetes nodes. - properties: - fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Default is "xfs". - type: string - gateway: - description: gateway is the host address of - the ScaleIO API Gateway. - type: string - protectionDomain: - description: protectionDomain is the name - of the ScaleIO Protection Domain for the - configured storage. - type: string - readOnly: - description: readOnly Defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. - type: boolean - secretRef: - description: secretRef references to the secret - for ScaleIO user and other sensitive information. - If this is not provided, Login operation - will fail. - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - sslEnabled: - description: sslEnabled Flag enable/disable - SSL communication with Gateway, default - false - type: boolean - storageMode: - description: storageMode indicates whether - the storage for a volume should be ThickProvisioned - or ThinProvisioned. Default is ThinProvisioned. - type: string - storagePool: - description: storagePool is the ScaleIO Storage - Pool associated with the protection domain. - type: string - system: - description: system is the name of the storage - system as configured in ScaleIO. 
- type: string - volumeName: - description: volumeName is the name of a volume - already created in the ScaleIO system that - is associated with this volume source. - type: string - required: - - gateway - - secretRef - - system - type: object - secret: - description: 'secret represents a secret that - should populate this volume. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret' - properties: - defaultMode: - description: 'defaultMode is Optional: mode - bits used to set permissions on created - files by default. Must be an octal value - between 0000 and 0777 or a decimal value - between 0 and 511. YAML accepts both octal - and decimal values, JSON requires decimal - values for mode bits. Defaults to 0644. - Directories within the path are not affected - by this setting.' - format: int32 - type: integer - items: - description: items If unspecified, each key-value - pair in the Data field of the referenced - Secret will be projected into the volume - as a file whose name is the key and content - is the value. If specified, the listed keys - will be projected into the specified paths, - and unlisted keys will not be present. - items: - description: Maps a string key to a path - within a volume. - properties: - key: - description: key is the key to project. - type: string - mode: - description: 'mode is Optional: mode - bits used to set permissions on this - file. Must be an octal value between - 0000 and 0777 or a decimal value between - 0 and 511. YAML accepts both octal - and decimal values, JSON requires - decimal values for mode bits. If not - specified, the volume defaultMode - will be used.' - format: int32 - type: integer - path: - description: path is the relative path - of the file to map the key to. May - not be an absolute path. May not contain - the path element '..'. May not start - with the string '..'. - type: string - required: - - key - - path - type: object - type: array - optional: - description: optional field specify whether - the Secret or its keys must be defined - type: boolean - secretName: - description: 'secretName is the name of the - secret in the pod''s namespace to use. More - info: https://kubernetes.io/docs/concepts/storage/volumes#secret' - type: string - type: object - storageos: - description: storageOS represents a StorageOS - volume attached and mounted on Kubernetes nodes. - properties: - fsType: - description: fsType is the filesystem type - to mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. - type: string - readOnly: - description: readOnly defaults to false (read/write). - ReadOnly here will force the ReadOnly setting - in VolumeMounts. - type: boolean - secretRef: - description: secretRef specifies the secret - to use for obtaining the StorageOS API credentials. If - not specified, default values will be attempted. - properties: - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - type: object - x-kubernetes-map-type: atomic - volumeName: - description: volumeName is the human-readable - name of the StorageOS volume. Volume names - are only unique within a namespace. - type: string - volumeNamespace: - description: volumeNamespace specifies the - scope of the volume within StorageOS. If - no namespace is specified then the Pod's - namespace will be used. 
This allows the - Kubernetes name scoping to be mirrored within - StorageOS for tighter integration. Set VolumeName - to any name to override the default behaviour. - Set to "default" if you are not using namespaces - within StorageOS. - type: string - type: object - vsphereVolume: - description: vsphereVolume represents a vSphere - volume attached and mounted on kubelets host - machine - properties: - fsType: - description: fsType is filesystem type to - mount. Must be a filesystem type supported - by the host operating system. Ex. "ext4", - "xfs", "ntfs". Implicitly inferred to be - "ext4" if unspecified. - type: string - storagePolicyID: - description: storagePolicyID is the storage - Policy Based Management (SPBM) profile ID - associated with the StoragePolicyName. - type: string - storagePolicyName: - description: storagePolicyName is the storage - Policy Based Management (SPBM) profile name. - type: string - volumePath: - description: volumePath is the path that identifies - vSphere volume vmdk - type: string - required: - - volumePath - type: object - required: - - name - type: object - type: array - required: - - containers - type: object - type: object - type: object - description: '`MPIReplicaSpecs` contains maps from `MPIReplicaType` - to `ReplicaSpec` that specify the MPI replicas to run.' - type: object - runPolicy: - description: '`RunPolicy` encapsulates various runtime policies of - the distributed training job, for example how to clean up resources - and how long the job can stay active.' - properties: - activeDeadlineSeconds: - description: Specifies the duration in seconds relative to the - startTime that the job may be active before the system tries - to terminate it; value must be positive integer. - format: int64 - type: integer - backoffLimit: - description: Optional number of retries before marking this job - failed. - format: int32 - type: integer - cleanPodPolicy: - description: CleanPodPolicy defines the policy to kill pods after - the job completes. Default to None. - type: string - schedulingPolicy: - description: SchedulingPolicy defines the policy related to scheduling, - e.g. gang-scheduling - properties: - minAvailable: - format: int32 - type: integer - minResources: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - priorityClass: - type: string - queue: - type: string - scheduleTimeoutSeconds: - format: int32 - type: integer - type: object - suspend: - default: false - description: suspend specifies whether the Job controller should - create Pods or not. If a Job is created with suspend set to - true, no Pods are created by the Job controller. If a Job is - suspended after creation (i.e. the flag goes from false to true), - the Job controller will delete all active Pods and PodGroups - associated with this Job. Users must design their workload to - gracefully handle this. - type: boolean - ttlSecondsAfterFinished: - description: TTLSecondsAfterFinished is the TTL to clean up jobs. - It may take extra ReconcilePeriod seconds for the cleanup, since - reconcile gets called periodically. Default to infinite. - format: int32 - type: integer - type: object - slotsPerWorker: - description: Specifies the number of slots per worker used in hostfile. - Defaults to 1. 
- format: int32 - type: integer - required: - - mpiReplicaSpecs - type: object - status: - description: JobStatus represents the current observed state of the training - Job. - properties: - completionTime: - description: Represents time when the job was completed. It is not - guaranteed to be set in happens-before order across separate operations. - It is represented in RFC3339 form and is in UTC. - format: date-time - type: string - conditions: - description: Conditions is an array of current observed job conditions. - items: - description: JobCondition describes the state of the job at a certain - point. - properties: - lastTransitionTime: - description: Last time the condition transitioned from one status - to another. - format: date-time - type: string - lastUpdateTime: - description: The last time this condition was updated. - format: date-time - type: string - message: - description: A human readable message indicating details about - the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of job condition. - type: string - required: - - status - - type - type: object - type: array - lastReconcileTime: - description: Represents last time when the job was reconciled. It - is not guaranteed to be set in happens-before order across separate - operations. It is represented in RFC3339 form and is in UTC. - format: date-time - type: string - replicaStatuses: - additionalProperties: - description: ReplicaStatus represents the current observed state - of the replica. - properties: - active: - description: The number of actively running pods. - format: int32 - type: integer - failed: - description: The number of pods which reached phase Failed. - format: int32 - type: integer - labelSelector: - description: 'Deprecated: Use Selector instead' - properties: - matchExpressions: - description: matchExpressions is a list of label selector - requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector - that contains values, a key, and an operator that relates - the key and values. - properties: - key: - description: key is the label key that the selector - applies to. - type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. - If the operator is In or NotIn, the values array - must be non-empty. If the operator is Exists or - DoesNotExist, the values array must be empty. This - array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. - A single {key,value} in the matchLabels map is equivalent - to an element of matchExpressions, whose key field is - "key", the operator is "In", and the values array contains - only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - selector: - description: A Selector is a label query over a set of resources. - The result of matchLabels and matchExpressions are ANDed. - An empty Selector matches all objects. A null Selector matches - no objects. 
- type: string - succeeded: - description: The number of pods which reached phase Succeeded. - format: int32 - type: integer - type: object - description: ReplicaStatuses is map of ReplicaType and ReplicaStatus, - specifies the status of each replica. - type: object - startTime: - description: Represents time when the job was acknowledged by the - job controller. It is not guaranteed to be set in happens-before - order across separate operations. It is represented in RFC3339 form - and is in UTC. - format: date-time - type: string - type: object - type: object - served: true - storage: true - subresources: - status: {} diff --git a/tools/roles/kubeflow/tasks/image_pulling.yml b/tools/roles/kubeflow/tasks/image_pulling.yml index 7eba7025b..5e8d8bb10 100644 --- a/tools/roles/kubeflow/tasks/image_pulling.yml +++ b/tools/roles/kubeflow/tasks/image_pulling.yml @@ -12,10 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. --- + +- name: Set Linux max user instances + ansible.builtin.command: "sysctl fs.inotify.max_user_instances={{ max_user_instances }}" + changed_when: false + +- name: Set Linux max watches + ansible.builtin.command: "sysctl fs.inotify.max_user_watches={{ max_user_watches }}" + changed_when: false + - name: Pull images for always case environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" block: - name: Set empty image list ansible.builtin.set_fact: diff --git a/tools/roles/kubeflow/tasks/prereq.yml b/tools/roles/kubeflow/tasks/prereq.yml index 426eb7c12..facd9e0d1 100644 --- a/tools/roles/kubeflow/tasks/prereq.yml +++ b/tools/roles/kubeflow/tasks/prereq.yml @@ -97,7 +97,7 @@ dest: "{{ kubeflow_dir_path }}" mode: "{{ file_permission }}" -- name: Setup kubeflow repo to control plane +- name: Setup kubeflow repo to Omnia Infrastructure Manager ansible.builtin.unarchive: src: "{{ kubeflow_dir_path }}/{{ kubeflow_git_repo_path }}.tar.gz" dest: "{{ kubeflow_dir_path }}" @@ -109,7 +109,7 @@ dest: "{{ kustomize_dir }}" mode: "{{ file_permission }}" -- name: Setup kustomize repo to control plane +- name: Setup kustomize repo to Omnia Infrastructure Manager ansible.builtin.unarchive: src: "{{ kustomize_dir }}/{{ kustomize_git_repo_path }}.tar.gz" dest: "{{ kustomize_dir }}" @@ -120,14 +120,6 @@ cmd: chmod a+x "{{ kustomize_binary_path }}" changed_when: false -- name: Set Linux max user instance - ansible.builtin.command: "sysctl fs.inotify.max_user_instances={{ max_user_instances }}" - changed_when: false - -- name: Set Linux max watches - ansible.builtin.command: "sysctl fs.inotify.max_user_watches={{ max_user_watches }}" - changed_when: false - - name: Replace value of jupyter-web secure cookies ansible.builtin.lineinfile: path: "{{ jw_app }}" @@ -157,16 +149,6 @@ when: crd_info.stdout | length > 0 changed_when: false -- name: Copy rendered YAML file to compute nodes - ansible.builtin.copy: - src: "{{ role_path }}/files/crd_mpijobs_kubeflow.yml" - dest: "{{ kubeflow_dir_path }}/kubeflow" - mode: "{{ file_permission }}" - -- name: Apply CRD YAML - ansible.builtin.command: kubectl apply -f "{{ kubeflow_dir_path }}/kubeflow/crd_mpijobs_kubeflow.yml" - changed_when: false - - name: Change istio ingressgateway service from ClusterIP to LoadBalancer ansible.builtin.replace: path: "{{ istio_ingressgateway_service_yaml_file_path }}"
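A note on the relocated inotify tasks above: running sysctl as a command changes the live kernel only, so the raised limits do not survive a reboot; the playbook compensates by reapplying them on every run. If persistence were wanted instead, a sketch using the ansible.posix.sysctl module might look like the following (the sysctl_file path is an illustrative choice, not something this change introduces):

- name: Persist inotify limits across reboots   # illustrative alternative, not part of this diff
  ansible.posix.sysctl:
    name: "{{ item.name }}"
    value: "{{ item.value }}"
    sysctl_file: /etc/sysctl.d/99-kubeflow-inotify.conf   # assumed location
    state: present
    reload: true
  loop:
    - { name: fs.inotify.max_user_instances, value: "{{ max_user_instances }}" }
    - { name: fs.inotify.max_user_watches, value: "{{ max_user_watches }}" }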
diff --git a/tools/roles/kubeflow/tasks/validate_inventory.yml b/tools/roles/kubeflow/tasks/validate_inventory.yml new file mode 100644 index 000000000..dac03229f --- /dev/null +++ b/tools/roles/kubeflow/tasks/validate_inventory.yml @@ -0,0 +1,40 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Inventory not provided + ansible.builtin.fail: + msg: "{{ kubeflow_empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) + +- name: Invalid inventory format for Kubeflow + ansible.builtin.fail: + msg: "{{ kubeflow_invalid_kube_inventory_fail_msg }}" + when: + - "('kube_control_plane' not in groups or 'kube_node' not in groups)" + +- name: Kube control plane group to contain exactly 1 kube control plane + ansible.builtin.assert: + that: + - "groups['kube_control_plane'] | length | int == 1" + success_msg: "{{ kubeflow_kube_one_node_validation_success_msg }}" + fail_msg: "{{ kubeflow_kube_one_node_validation_fail_msg }}" + +- name: Kube node group to contain at least 1 kube node + ansible.builtin.assert: + that: "groups['kube_node'] | length | int >= 1" + success_msg: "{{ kubeflow_kube_node_validation_success_msg }}" + fail_msg: "{{ kubeflow_kube_node_validation_fail_msg }}"
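For reference, an inventory that passes these checks needs exactly one host in kube_control_plane and one or more hosts in kube_node. A minimal YAML-format sketch, with illustrative host names:

all:
  children:
    kube_control_plane:
      hosts:
        node001:          # exactly one control plane host
    kube_node:
      hosts:
        node002:
        node003:          # one or more worker hosts

Passing this file via the -i option, as the failure messages below suggest, satisfies both assertions.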
diff --git a/tools/roles/kubeflow/vars/main.yml b/tools/roles/kubeflow/vars/main.yml index 24381f1ba..4f2ce0178 100644 --- a/tools/roles/kubeflow/vars/main.yml +++ b/tools/roles/kubeflow/vars/main.yml @@ -13,9 +13,7 @@ # limitations under the License. --- file_permission: 755 -download_dest: /tmp/kustomize_v5.0.3_linux_amd64.tar.gz kustomize_binary_path: /opt/omnia/kustomize/kustomize -branch: v1.8-branch deployment_success_message: Deployment is successful deployment_failure_message: Deployment is not ready warning_msg: Do not continue, if kubeflow deployment is running for first time @@ -39,9 +37,22 @@ k8s_not_deployed: "Kubernetes is prerequisite for deploying kubeflow. Please dep jw_app: /opt/omnia/kubeflow/kubeflow/apps/jupyter/jupyter-web-app/upstream/base/params.env tb_app: /opt/omnia/kubeflow/kubeflow/apps/tensorboard/tensorboards-web-app/upstream/base/params.env vw_app: /opt/omnia/kubeflow/kubeflow/apps/volumes-web-app/upstream/base/params.env -istio_ingressgateway_service_yaml_file_path: /opt/omnia/kubeflow/kubeflow/common/istio-1-17/istio-install/base/patches/service.yaml +istio_ingressgateway_service_yaml_file_path: /opt/omnia/kubeflow/kubeflow/common/istio-1-22/istio-install/base/patches/service.yaml kserve_exist: "Kserve is already deployed, please remove kserve to deploy kubeflow." # usage main.yml local_repo_access_config_file: "/opt/omnia/offline/local_repo_access.yml" istio_module_dir: /etc/modules-load.d istio_module_file: /etc/modules-load.d/99-istio-modules.conf + +# Usage: validate_inventory.yml +kubeflow_empty_inventory_fail_msg: > + "Failed. Inventory not provided. + Please re-run the playbook with an inventory that includes the groups 'kube_control_plane' and 'kube_node' by using the -i inventory option" +kubeflow_invalid_kube_inventory_fail_msg: "Failed. Kubeflow software is present in software_config.json. +Invalid inventory format, specify kube_control_plane, kube_node" +kubeflow_kube_one_node_validation_fail_msg: "Failed. kubeflow software is present in software_config.json. +There should be exactly one entry for kube_control_plane in the inventory" +kubeflow_kube_one_node_validation_success_msg: "One kube_control_plane exists in the inventory" +kubeflow_kube_node_validation_fail_msg: "Failed. kubeflow software is present in software_config.json. +At least one kube_node should be present in the inventory." +kubeflow_kube_node_validation_success_msg: "At least one kube_node exists in the inventory" diff --git a/upgrade/roles/uninstall_open_ldap/tasks/main.yml b/tools/roles/mpijob/tasks/fetch_software_config.yml similarity index 58% rename from upgrade/roles/uninstall_open_ldap/tasks/main.yml rename to tools/roles/mpijob/tasks/fetch_software_config.yml index 7f8d778e0..f78cc2bdf 100644 --- a/upgrade/roles/uninstall_open_ldap/tasks/main.yml +++ b/tools/roles/mpijob/tasks/fetch_software_config.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,23 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. --- + - name: Load software_config.json ansible.builtin.include_vars: file: "{{ software_config_json_file }}" name: software_config -- name: Check openldap support +- name: Load k8s.json ansible.builtin.set_fact: - openldap_support: "{{ software_config.softwares | selectattr('name', 'equalto', 'openldap') | list | length > 0 }}" - -- name: Uninstall open_ldap packages - ansible.builtin.package: - name: "{{ uninstall_open_ldap_packages }}" - state: absent - when: openldap_support + k8s_packages_json: "{{ lookup('file', k8s_packages_file) | from_json }}" -- name: Delete conf file if exists - ansible.builtin.file: - path: "{{ uninstall_open_ldap_conf_dest }}" - state: absent - when: openldap_support +- name: Extract and set facts for mpi-operator manifest URL + ansible.builtin.set_fact: + mpi_operator: "{{ k8s_packages_json['k8s']['cluster'] | selectattr('type', 'equalto', 'manifest') | selectattr('package', 'search', 'mpi') | map(attribute='package') | join }}" # noqa: yaml[line-length] diff --git a/upgrade/roles/backup_omniadb/tasks/backup_old_data.yml b/tools/roles/mpijob/tasks/inv_check.yml similarity index 60% rename from upgrade/roles/backup_omniadb/tasks/backup_old_data.yml rename to tools/roles/mpijob/tasks/inv_check.yml index 85a8fe46b..293830132 100644 --- a/upgrade/roles/backup_omniadb/tasks/backup_old_data.yml +++ b/tools/roles/mpijob/tasks/inv_check.yml @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License.
--- -- name: Ping old omnia database - community.postgresql.postgresql_ping: - db: omniadb - login_password: "{{ postgresdb_password }}" - register: db_ping -- name: Back up old omnia database - community.postgresql.postgresql_db: - db: omniadb - login_password: "{{ postgresdb_password }}" - state: dump - target: "{{ backup_location }}/backup.sql" - when: db_ping.is_available +- name: Inventory not provided + ansible.builtin.fail: + msg: "{{ mpijob_empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) + +- name: Validate kube_control_plane group + ansible.builtin.assert: + that: + - groups['kube_control_plane'] is defined + - "groups['kube_control_plane'] | length | int == 1" + fail_msg: "{{ fail_node_kube_control_plane }}" diff --git a/tools/roles/mpijob/tasks/mpi_job_v1.yml b/tools/roles/mpijob/tasks/mpi_job_v1.yml new file mode 100644 index 000000000..4b24d7f18 --- /dev/null +++ b/tools/roles/mpijob/tasks/mpi_job_v1.yml @@ -0,0 +1,54 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Revert to MPIv1 Operator + tags: mpiv1 + block: + - name: Get kubeflow CRD information + ansible.builtin.shell: kubectl get crd | grep mpijobs.kubeflow.org + register: crd_info + changed_when: false + ignore_errors: true + + - name: Delete existing CRD + ansible.builtin.command: kubectl delete crd mpijobs.kubeflow.org + when: crd_info.stdout | length > 0 + changed_when: false + + - name: Install Kubeflow Training Operator + ansible.builtin.shell: | + set -o pipefail && \ + cd /opt/omnia/kubeflow/kubeflow && \ + /opt/omnia/kustomize/kustomize build apps/training-operator/upstream/overlays/kubeflow | kubectl apply -f - + register: install_training_operator_result + retries: "{{ retry_count }}" + delay: "{{ time_delay }}" + until: install_training_operator_result.rc == 0 + changed_when: false + + - name: Verify MPIJob API version + ansible.builtin.command: kubectl explain mpijob --api-version=kubeflow.org/v1 + register: kubectl_explain_output + changed_when: false + + - name: Check MPIJob API version + ansible.builtin.debug: + msg: "{{ v1_success }}" + when: kubectl_explain_output.rc == 0 + + - name: Fail if MPIJob API version v1 is not present + ansible.builtin.fail: + msg: "{{ v1_failure }}" + when: kubectl_explain_output.rc != 0 diff --git a/tools/roles/mpijob/tasks/mpi_job_v2beta1.yml b/tools/roles/mpijob/tasks/mpi_job_v2beta1.yml new file mode 100644 index 000000000..e5f7a40b7 --- /dev/null +++ b/tools/roles/mpijob/tasks/mpi_job_v2beta1.yml @@ -0,0 +1,73 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Setup MPIv2beta1 Operator + tags: + - mpiv2beta1 + block: + - name: Check if Kubeflow training operator is present + ansible.builtin.command: kubectl get deployment -n kubeflow training-operator + register: check_training_operator + changed_when: false + + - name: Remove Kubeflow training operator + ansible.builtin.shell: | + set -o pipefail && \ + cd /opt/omnia/kubeflow/kubeflow && \ + /opt/omnia/kustomize/kustomize build apps/training-operator/upstream/overlays/kubeflow | kubectl delete -f - + when: check_training_operator.rc == 0 + register: remove_training_operator_result + retries: "{{ retry_count }}" + delay: "{{ time_delay }}" + until: remove_training_operator_result.rc == 0 + changed_when: false + + - name: Include local repo access variable file + ansible.builtin.include_vars: + file: "{{ local_repo_access_config_file }}" + + - name: Include local_repo_config.yml vars + ansible.builtin.include_vars: + file: "{{ local_repo_config_file }}" + + - name: Download rendered YAML file to compute nodes + ansible.builtin.get_url: + url: "{{ offline_manifest_path }}/{{ hostvars['localhost']['mpi_operator'] }}.yaml" + dest: "{{ kubeflow_dir_path }}/kubeflow/mpi-operator.yaml" + mode: "{{ file_permission }}" + + - name: Install MPI Operator + ansible.builtin.command: kubectl apply -f "{{ kubeflow_dir_path }}/kubeflow/mpi-operator.yaml" --server-side --force-conflicts + failed_when: false + changed_when: false + + - name: Warning - Please wait, this task will take a few seconds + ansible.builtin.pause: + seconds: "{{ pause_time }}" + + - name: Verify MPIJob API version + ansible.builtin.command: kubectl explain mpijob --api-version=kubeflow.org/v2beta1 + register: kubectl_explain_output + changed_when: false + + - name: Check if MPIJob API version v2beta1 is present + ansible.builtin.debug: + msg: "{{ v2_success }}" + when: kubectl_explain_output.rc == 0 + + - name: Fail if MPIJob API version v2beta1 is not present + ansible.builtin.fail: + msg: "{{ v2_failure }}" + when: kubectl_explain_output.rc != 0 diff --git a/tools/roles/mpijob/tasks/verify_kubeflow.yml b/tools/roles/mpijob/tasks/verify_kubeflow.yml new file mode 100644 index 000000000..f71074bd9 --- /dev/null +++ b/tools/roles/mpijob/tasks/verify_kubeflow.yml @@ -0,0 +1,31 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
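+# Fails fast unless kubeflow is deployed and one of this role's tags was
+# passed; illustrative invocation (the wrapping playbook name is an
+# assumption, not defined in this file):
+#   ansible-playbook <mpijob_playbook>.yml -i inventory --tags mpiv1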
+--- + +- name: Check kubeflow Installation and Tag + block: + - name: Check Kubeflow Deployment Status + ansible.builtin.command: kubectl get deployments -n kubeflow + register: deployment_status + changed_when: false + + - name: Fail if no kubeflow deployments are found + ansible.builtin.fail: + msg: "{{ deployment_failure_message }}" + when: deployment_status.stdout == "" + + - name: Check if any tags are provided + ansible.builtin.fail: + msg: "{{ tag_error }}" + when: ansible_run_tags is undefined or (ansible_run_tags | select('in', ['mpiv1', 'mpiv2beta1']) | list | length == 0) diff --git a/tools/roles/mpijob/vars/main.yml b/tools/roles/mpijob/vars/main.yml new file mode 100644 index 000000000..79f317cb0 --- /dev/null +++ b/tools/roles/mpijob/vars/main.yml @@ -0,0 +1,45 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Usage: mpi_job_v2beta1.yml +local_repo_access_config_file: "/opt/omnia/offline/local_repo_access.yml" +local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml" +kubeflow_dir_path: "/opt/omnia/kubeflow" +file_permission: 755 + +# Usage: mpi_job_v1.yml and mpi_job_v2beta1.yml +retry_count: 3 +time_delay: 10 +pause_time: 5 +v1_success: "MPIJob API version v1 is present." +v1_failure: "MPIJob API version v1 is not present." + +# Usage: mpi_job_v2beta1.yml +v2_success: "MPIJob API version v2beta1 is present." +v2_failure: "MPIJob API version v2beta1 is not present." + +# Usage: verify_kubeflow.yml +tag_error: "Error: No tags provided! Please specify mpiv1 or mpiv2beta1 tag." +deployment_failure_message: "Error: Ensure kubeflow is deployed first." + +# Usage: inv_check.yml +fail_node_kube_control_plane: "Failed. There should be exactly one entry for kube_control_plane in the inventory" +mpijob_empty_inventory_fail_msg: > + "Failed. Inventory not provided. 
+ Please re-run the playbook with an inventory that includes the group 'kube_control_plane' by using the -i inventory option" + +# Usage: fetch_software_config.yml +software_config_json_file: "{{ role_path }}/../../../input/software_config.json" +k8s_packages_file: "{{ role_path }}/../../../input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/k8s.json" diff --git a/tools/roles/pytorch/tasks/check_prerequisites.yml b/tools/roles/pytorch/tasks/check_prerequisites.yml index fc1cf2e53..66706d316 100644 --- a/tools/roles/pytorch/tasks/check_prerequisites.yml +++ b/tools/roles/pytorch/tasks/check_prerequisites.yml @@ -41,40 +41,41 @@ - name: Set packages variables when: pytorch_packages.ansible_facts block: - - name: Assign image pull commands + - name: Assign pytorch_cpu image pull commands ansible.builtin.set_fact: - pytorch_cpu_image_package: "{{ pytorch_packages.ansible_facts.pytorch_cpu.cluster[0].package }}" - pytorch_cpu_image_version: "{{ pytorch_packages.ansible_facts.pytorch_cpu.cluster[0].tag }}" - pytorch_amd_image_package: "{{ pytorch_packages.ansible_facts.pytorch_amd.cluster[0].package }}" - pytorch_amd_image_version: "{{ pytorch_packages.ansible_facts.pytorch_amd.cluster[0].tag }}" + pytorch_cpu_image_package: "{{ pytorch_packages.ansible_facts.pytorch_cpu.cluster[item_index1].package }}" + pytorch_cpu_image_version: "{{ pytorch_packages.ansible_facts.pytorch_cpu.cluster[item_index1].tag }}" + when: pytorch_packages.ansible_facts.pytorch_cpu.cluster[item_index1].tag is defined + loop: "{{ pytorch_packages.ansible_facts.pytorch_cpu.cluster }}" + loop_control: + index_var: item_index1 - - name: Assign NVIDIA image pull commands + - name: Assign pytorch_amd image pull commands ansible.builtin.set_fact: - pytorch_nvidia_image_package: "{{ pytorch_packages.ansible_facts.pytorch_nvidia.cluster[item_index].package }}" - pytorch_nvidia_image_version: "{{ pytorch_packages.ansible_facts.pytorch_nvidia.cluster[item_index].tag }}" - when: pytorch_packages.ansible_facts.pytorch_nvidia.cluster[item_index].tag is defined - loop: "{{ pytorch_packages.ansible_facts.pytorch_nvidia.cluster }}" + pytorch_amd_image_package: "{{ pytorch_packages.ansible_facts.pytorch_amd.cluster[item_index2].package }}" + pytorch_amd_image_version: "{{ pytorch_packages.ansible_facts.pytorch_amd.cluster[item_index2].tag }}" + when: pytorch_packages.ansible_facts.pytorch_amd.cluster[item_index2].tag is defined + loop: "{{ pytorch_packages.ansible_facts.pytorch_amd.cluster }}" loop_control: - index_var: item_index + index_var: item_index2 - - name: Combine the pythorch image and version + - name: Assign pytorch_gaudi image pull commands ansible.builtin.set_fact: - pytorch_cpu_image: "{{ pytorch_cpu_image_package }}:{{ pytorch_cpu_image_version }}" - pytorch_amd_image: "{{ pytorch_amd_image_package }}:{{ pytorch_amd_image_version }}" - pytorch_nvidia_image: "{{ pytorch_nvidia_image_package }}:{{ pytorch_nvidia_image_version }}" - - - name: Validate image pull commands - ansible.builtin.assert: - that: - - pytorch_cpu_image is defined - - pytorch_amd_image is defined - - pytorch_nvidia_image is defined + pytorch_gaudi_image_package: "{{ pytorch_packages.ansible_facts.pytorch_gaudi.cluster[item_index3].package }}" + pytorch_gaudi_image_version: "{{ pytorch_packages.ansible_facts.pytorch_gaudi.cluster[item_index3].tag }}" + when: pytorch_packages.ansible_facts.pytorch_gaudi.cluster[item_index3].tag is defined + loop: "{{ pytorch_packages.ansible_facts.pytorch_gaudi.cluster }}" + 
loop_control: + index_var: item_index3 - - name: Assign image pull commands + - name: Assign pytorch_nvidia image pull commands ansible.builtin.set_fact: - cpu_image_run_cmd: "nerdctl run -it --rm -v /opt/omnia/:/workspace/ {{ pytorch_cpu_image }} python ./pytorch_example.py" - nvidia_image_run_cmd: "nerdctl run --gpus all -it --rm -v /opt/omnia/:/workspace/ {{ pytorch_nvidia_image }} python ./pytorch_example.py" - amd_run_cmd_end: " --group-add video --ipc=host --shm-size 8G -v /opt/omnia/:/var/lib/jenkins {{ pytorch_amd_image }} python ./pytorch_example.py" + pytorch_nvidia_image_package: "{{ pytorch_packages.ansible_facts.pytorch_nvidia.cluster[item_index4].package }}" + pytorch_nvidia_image_version: "{{ pytorch_packages.ansible_facts.pytorch_nvidia.cluster[item_index4].tag }}" + when: pytorch_packages.ansible_facts.pytorch_nvidia.cluster[item_index4].tag is defined + loop: "{{ pytorch_packages.ansible_facts.pytorch_nvidia.cluster }}" + loop_control: + index_var: item_index4 - name: Initialize processing_unit ansible.builtin.set_fact: @@ -105,6 +106,17 @@ processing_unit: "nvidia" when: nvidia_gpu.rc == 0 + - name: Check Gaudi HPU + ansible.builtin.command: hl-smi + register: gaudi_gpu + changed_when: true + failed_when: false + + - name: Modify processing_unit_gaudi + ansible.builtin.set_fact: + processing_unit: "gaudi" + when: gaudi_gpu.rc == 0 + - name: Check and create omnia folder block: - name: Check if omnia folder exists @@ -138,6 +150,22 @@ - name: Check prerequisites (AMD) when: processing_unit == "amd" block: + - name: Validate pytorch_amd_image_package and pytorch_amd_image_version + ansible.builtin.assert: + that: + - pytorch_amd_image_package is defined + - pytorch_amd_image_version is defined + fail_msg: "{{ pytorch_amd_definition_fail_msg }}" + + - name: Set pytorch_amd_image + ansible.builtin.set_fact: + pytorch_amd_image: "{{ pytorch_amd_image_package }}:{{ pytorch_amd_image_version }}" + + - name: Set amd_run_cmd_start and amd_run_cmd_end + ansible.builtin.set_fact: + amd_run_cmd_start: "nerdctl run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd " + amd_run_cmd_end: " --group-add video --ipc=host --shm-size 8G -v /opt/omnia/:/var/lib/jenkins {{ pytorch_amd_image }} python ./pytorch_example.py" + - name: List files in /dev/dri ansible.builtin.command: ls /dev/dri register: dri_files diff --git a/tools/roles/pytorch/tasks/check_software_config_file.yml b/tools/roles/pytorch/tasks/check_software_config_file.yml new file mode 100644 index 000000000..29cb4d446 --- /dev/null +++ b/tools/roles/pytorch/tasks/check_software_config_file.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +--- + +- name: Load software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_json_file }}" + name: software_config + +- name: Check if pytorch entry is present in software_config.json + ansible.builtin.set_fact: + pytorch_present: "{{ software_config['softwares'] | selectattr('name', 'equalto', 'pytorch') | list | count > 0 }}" + +- name: Fail if pytorch entry is not present in software_config.json + ansible.builtin.fail: + msg: "{{ pytorch_not_in_software_config }}" + when: not pytorch_present diff --git a/tools/roles/pytorch/tasks/inv_check.yml b/tools/roles/pytorch/tasks/inv_check.yml new file mode 100644 index 000000000..469b8944d --- /dev/null +++ b/tools/roles/pytorch/tasks/inv_check.yml @@ -0,0 +1,39 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +- name: Inventory not provided + ansible.builtin.fail: + msg: "{{ pytorch_empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) + +- name: Validate kube_control_plane and kube_node group in inventory + ansible.builtin.assert: + that: + - groups['kube_control_plane'] is defined + - groups['kube_node'] is defined + fail_msg: "{{ fail_inv_format }}" + +- name: Validate kube_control_plane group + ansible.builtin.assert: + that: "groups['kube_control_plane'] | length | int == 1" + fail_msg: "{{ fail_node_kube_control_plane }}" + +- name: Validate kube_node group + ansible.builtin.assert: + that: "groups['kube_node'] | length | int >= 1" + fail_msg: "{{ fail_no_node_kube_node }}" diff --git a/tools/roles/pytorch/tasks/pytorch_install.yml b/tools/roles/pytorch/tasks/pytorch_install.yml index 1d6115a27..e0f5c18d0 100644 --- a/tools/roles/pytorch/tasks/pytorch_install.yml +++ b/tools/roles/pytorch/tasks/pytorch_install.yml @@ -18,8 +18,20 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "cpu" block: + - name: Validate pytorch_cpu_image_package and pytorch_cpu_image_version + ansible.builtin.assert: + that: + - pytorch_cpu_image_package is defined + - pytorch_cpu_image_version is defined + fail_msg: "{{ pytorch_cpu_definition_fail_msg }}" + + - name: Set pytorch_cpu_image + ansible.builtin.set_fact: + pytorch_cpu_image: "{{ pytorch_cpu_image_package }}:{{ pytorch_cpu_image_version }}" + - name: Pull CPU container image ansible.builtin.command: "nerdctl pull {{ pytorch_cpu_image }}" changed_when: false @@ -40,6 +52,7 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "amd" block: - name: Pull AMD GPU container image @@ -62,8 +75,20 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "nvidia" block: + - name: Validate pytorch_nvidia_image_package and pytorch_nvidia_image_version + 
ansible.builtin.assert: + that: + - pytorch_nvidia_image_package is defined + - pytorch_nvidia_image_version is defined + fail_msg: "{{ pytorch_nvidia_definition_fail_msg }}" + + - name: Set pytorch_nvidia_image + ansible.builtin.set_fact: + pytorch_nvidia_image: "{{ pytorch_nvidia_image_package }}:{{ pytorch_nvidia_image_version }}" + - name: Pull NVIDIA GPU container image ansible.builtin.command: "nerdctl pull {{ pytorch_nvidia_image }}" changed_when: false @@ -80,6 +105,40 @@ installation_status: "{{ installation_failed_status }}: {{ error_pull_container }}" when: nvidia_deployment_output.rc != 0 +- name: Install pytorch (GAUDI) + environment: + http_proxy: "{{ http_proxy }}" + https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" + when: processing_unit == "gaudi" + block: + - name: Validate pytorch_gaudi_image_package and pytorch_gaudi_image_version + ansible.builtin.assert: + that: + - pytorch_gaudi_image_package is defined + - pytorch_gaudi_image_version is defined + fail_msg: "{{ pytorch_gaudi_definition_fail_msg }}" + + - name: Set pytorch_gaudi_image + ansible.builtin.set_fact: + pytorch_gaudi_image: "{{ pytorch_gaudi_image_package }}:{{ pytorch_gaudi_image_version }}" + + - name: Pull GAUDI HPU container image + ansible.builtin.command: "nerdctl pull {{ pytorch_gaudi_image }}" + changed_when: false + failed_when: false + register: gaudi_deployment_output + + - name: Failed to pull GAUDI HPU container + ansible.builtin.debug: + msg: "Warning: {{ error_pull_container }}" + when: gaudi_deployment_output.rc != 0 + + - name: Updating pytorch installation_status (GAUDI) + ansible.builtin.set_fact: + installation_status: "{{ installation_failed_status }}: {{ error_pull_container }}" + when: gaudi_deployment_output.rc != 0 + - name: Modify pytorch_installation_status ansible.builtin.set_fact: pytorch_installation_status: true diff --git a/tools/roles/pytorch/tasks/pytorch_verify.yml b/tools/roles/pytorch/tasks/pytorch_verify.yml index 75a1a6f12..291b4d639 100644 --- a/tools/roles/pytorch/tasks/pytorch_verify.yml +++ b/tools/roles/pytorch/tasks/pytorch_verify.yml @@ -26,6 +26,10 @@ - name: Example pytorch inference (CPU) when: processing_unit == "cpu" block: + - name: Set cpu_image_run_cmd + ansible.builtin.set_fact: + cpu_image_run_cmd: "nerdctl run -it --rm -v /opt/omnia/:/workspace/ {{ pytorch_cpu_image }} python ./pytorch_example.py" + - name: Run cpu container with example file ansible.builtin.command: "{{ cpu_image_run_cmd }}" when: copy_example_file is succeeded @@ -66,6 +70,10 @@ - name: Example pytorch inference (NVIDIA) when: processing_unit == "nvidia" block: + - name: Set nvidia_image_run_cmd + ansible.builtin.set_fact: + nvidia_image_run_cmd: "nerdctl run --gpus all -it --rm -v /opt/omnia/:/workspace/ {{ pytorch_nvidia_image }} python ./pytorch_example.py" + - name: Run nvidia container with example file ansible.builtin.command: "{{ nvidia_image_run_cmd }}" when: copy_example_file is succeeded @@ -82,3 +90,28 @@ ansible.builtin.set_fact: installation_status: "{{ installation_failed_status }}: {{ error_example_failed }}" when: nvidia_container_example_run.rc != 0 + +- name: Example pytorch inference (GAUDI) + when: processing_unit == "gaudi" + block: + - name: Set gaudi_image_run_cmd + ansible.builtin.set_fact: + gaudi_image_run_cmd: "nerdctl run -it --privileged -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice + --net=host --ipc=host -v /opt/omnia/:/workspace/ {{ pytorch_gaudi_image }} python 
/workspace/pytorch_example.py" + + - name: Run gaudi container with example file + ansible.builtin.command: "{{ gaudi_image_run_cmd }}" + when: copy_example_file is succeeded + changed_when: false + failed_when: false + register: gaudi_container_example_run + + - name: Example container inference failed (GAUDI) + ansible.builtin.debug: + msg: "Warning: {{ error_example_failed }}" + when: gaudi_container_example_run.rc != 0 + + - name: Updating pytorch verification status (GAUDI) + ansible.builtin.set_fact: + installation_status: "{{ installation_failed_status }}: {{ error_example_failed }}" + when: gaudi_container_example_run.rc != 0 diff --git a/tools/roles/pytorch/vars/main.yml b/tools/roles/pytorch/vars/main.yml index 84466356b..32d27df36 100644 --- a/tools/roles/pytorch/vars/main.yml +++ b/tools/roles/pytorch/vars/main.yml @@ -18,6 +18,21 @@ installation_success_status: "Pytorch succesfully installed" installation_failed_status: "Failed to install pytorch" +# Validate inventory +fail_no_node_kube_node: "Failed. Pytorch software is present in software_config.json. No node is part of kube_node group in inventory" +fail_inv_format: "Failed. Pytorch software is present in software_config.json. +Invalid inventory format, specify kube_control_plane and kube_node." +fail_node_kube_control_plane: "Failed. Pytorch software is present in software_config.json. +There should be exactly one entry for kube_control_plane in the inventory" +pytorch_empty_inventory_fail_msg: | + "Failed. Inventory not provided. + Please re-run the playbook with an inventory that includes the groups 'kube_control_plane' and 'kube_node' by using the -i inventory option" + +# For checking entry in software_config.json +pytorch_not_in_software_config: | + "pytorch is not present in software_config.json. + Please mention pytorch in software_config.json and execute pytorch.yml again" + # For env proxy setup local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml" @@ -34,11 +49,9 @@ pytorch_json_file: "{{ role_path }}/../../../input/config/{{ cluster_os_type }}/ # Errors: check_prerequisites.yml error_check_gpu_failed: "GPU Driver absent. Skipping pytorch installation." error_check_container_engine_failed: "pytorch requires containerd as a prerequisite. Please execute scheduler.yml/omnia.yml and install k8s and execute -pytorch again" + pytorch again" error_dri_file_failed: "Unable to read dri files in /dev/dri folder" -# CMD: check_prerequisites.yml -amd_run_cmd_start: "nerdctl run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd " # check_prerequisites.yml - Create omnia folder if it doesn't exist omnia_foler_path: "/opt/omnia" @@ -47,6 +60,14 @@ omnia_foler_mode: "0755" # Errors: pytorch_install.yml error_pull_container: "Unable to pull container image" +pytorch_cpu_definition_fail_msg: | + "Failed. pytorch_cpu details is not defined in pytorch.json. Verify the pytorch.json and re-run the playbook." +pytorch_amd_definition_fail_msg: | + "Failed. AMD GPU detected and pytorch_amd details is not defined in pytorch.json. Verify the pytorch.json and re-run the playbook." +pytorch_nvidia_definition_fail_msg: | + "Failed. NVIDIA GPU detected and pytorch_nvidia details is not defined in pytorch.json. Verify the pytorch.json and re-run the playbook." +pytorch_gaudi_definition_fail_msg: | + "Failed. Intel Gaudi GPU detected and pytorch_gaudi details is not defined in pytorch.json. Verify the pytorch.json and re-run the playbook." 
# Usage: pytorch_verify.yml example_src_file_path: "pytorch_example.py" diff --git a/tools/roles/tensorflow/tasks/check_software_config_file.yml b/tools/roles/tensorflow/tasks/check_software_config_file.yml new file mode 100644 index 000000000..891ba80f2 --- /dev/null +++ b/tools/roles/tensorflow/tasks/check_software_config_file.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +--- + +- name: Load software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_json_file }}" + name: software_config + +- name: Check if tensorflow entry is present in software_config.json + ansible.builtin.set_fact: + tensorflow_present: "{{ software_config['softwares'] | selectattr('name', 'equalto', 'tensorflow') | list | count > 0 }}" + +- name: Fail if tensorflow entry is not present in software_config.json + ansible.builtin.fail: + msg: "{{ tensorflow_not_in_software_config }}" + when: not tensorflow_present diff --git a/tools/roles/tensorflow/tasks/inv_check.yml b/tools/roles/tensorflow/tasks/inv_check.yml new file mode 100644 index 000000000..6bdc36b6d --- /dev/null +++ b/tools/roles/tensorflow/tasks/inv_check.yml @@ -0,0 +1,39 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
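+# Included from the "Inventory Check" play in tools/tensorflow.yml before any
+# node work runs; illustrative invocation (inventory path is an example):
+#   ansible-playbook tools/tensorflow.yml -i /path/to/inventory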
+ +--- + +- name: Inventory not provided + ansible.builtin.fail: + msg: "{{ tensorflow_empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) + +- name: Validate kube_control_plane and kube_node group in inventory + ansible.builtin.assert: + that: + - groups['kube_control_plane'] is defined + - groups['kube_node'] is defined + fail_msg: "{{ fail_inv_format }}" + +- name: Validate kube_control_plane group + ansible.builtin.assert: + that: "groups['kube_control_plane'] | length | int == 1" + fail_msg: "{{ fail_node_kube_control_plane }}" + +- name: Validate kube_node group + ansible.builtin.assert: + that: "groups['kube_node'] | length | int >= 1" + fail_msg: "{{ fail_no_node_kube_node }}" diff --git a/tools/roles/tensorflow/tasks/tensorflow_install.yml b/tools/roles/tensorflow/tasks/tensorflow_install.yml index de2dbc6ea..ffa6b436b 100644 --- a/tools/roles/tensorflow/tasks/tensorflow_install.yml +++ b/tools/roles/tensorflow/tasks/tensorflow_install.yml @@ -18,6 +18,7 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "cpu" block: - name: Pull CPU container image @@ -40,6 +41,7 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "amd" block: - name: Pull AMD GPU container image @@ -62,6 +64,7 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "nvidia" block: - name: Pull NVIDIA GPU container image diff --git a/tools/roles/tensorflow/vars/main.yml b/tools/roles/tensorflow/vars/main.yml index d24041478..f99660c35 100644 --- a/tools/roles/tensorflow/vars/main.yml +++ b/tools/roles/tensorflow/vars/main.yml @@ -18,6 +18,21 @@ installation_success_status: "Tensorflow succesfully installed" installation_failed_status: "Failed to install tensorflow" +# Validate inventory +fail_no_node_kube_node: "Failed. Tensorflow software is present in software_config.json. No node is part of kube_node group in inventory" +fail_inv_format: "Failed. Tensorflow software is present in software_config.json. +Invalid inventory format, specify kube_control_plane and kube_node." +fail_node_kube_control_plane: "Failed. Tensorflow software is present in software_config.json. +There should be exactly one entry for kube_control_plane in the inventory" +tensorflow_empty_inventory_fail_msg: | + "Failed. Inventory not provided. + Please re-run the playbook with an inventory that includes the groups 'kube_control_plane' and 'kube_node' by using the -i inventory option" + +# For checking entry in software_config.json +tensorflow_not_in_software_config: | + "tensorflow is not present in software_config.json. + Please mention tensorflow in software_config.json and execute tensorflow.yml again" + # For env proxy setup local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml" @@ -33,8 +48,8 @@ nvidia_container_toolkit: "nvidia-container-toolkit" # Errors: check_prerequisites.yml error_check_gpu_failed: "GPU Driver absent. Skipping tensorflow installation." -error_check_container_engine_failed: "tensorflow requires containerd as a prerequisite. Please execute scheduler.yml/omnia.yml and install k8s and -execute tensorflow again" +error_check_container_engine_failed: "tensorflow requires containerd as a prerequisite. 
Please execute scheduler.yml/omnia.yml and install k8s and execute +tensorflow again" error_dri_file_failed: "Unable to read dri files in /dev/dri folder" # CMD: check_prerequisites.yml diff --git a/tools/roles/vllm/tasks/check_prerequisites.yml b/tools/roles/vllm/tasks/check_prerequisites.yml index 080a43705..892d5dbed 100644 --- a/tools/roles/vllm/tasks/check_prerequisites.yml +++ b/tools/roles/vllm/tasks/check_prerequisites.yml @@ -48,6 +48,8 @@ vllm_amd_image_version: "{{ vllm_packages.ansible_facts.vllm_amd.cluster[0].tag }}" vllm_pytorch_cuda_version: "{{ vllm_packages.ansible_facts.vllm_nvidia.cluster[1].package }}" vllm_nvidia_package: "{{ vllm_packages.ansible_facts.vllm_nvidia.cluster[2].package }}" + vllm_numpy_package: "{{ vllm_packages.ansible_facts.vllm_nvidia.cluster[3].package }}" + vllm_python_version: "{{ vllm_packages.ansible_facts['vllm_nvidia']['cluster'] | selectattr('type', 'in', ['deb', 'rpm']) | selectattr('package', 'search', 'python') | map(attribute='package') | first }}" # noqa: yaml[line-length] - name: Validate package details ansible.builtin.assert: @@ -154,6 +156,10 @@ when: dri_files.rc != 0 - name: Check prerequisites(NVIDIA) + environment: + http_proxy: "{{ http_proxy }}" + https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "nvidia" block: - name: Check Python version @@ -162,7 +168,7 @@ changed_when: false failed_when: false - - name: Install Python 3.9 + - name: Install {{ vllm_python_version }} ansible.builtin.package: name: "{{ vllm_python_version }}" state: present @@ -170,15 +176,19 @@ changed_when: true failed_when: false - - name: Install pip and python3.9 distutils + - name: Install pip and distutils for {{ vllm_python_version }} ansible.builtin.package: name: "{{ vllm_python_package }}" state: present - changed_when: true - failed_when: false + when: ansible_distribution | lower == ubuntu_os + + - name: Ensure pip is installed + ansible.builtin.command: + cmd: "{{ vllm_python_version }} -m ensurepip" + changed_when: false - name: Install pytorch - ansible.builtin.command: "{{ vllm_python_version }} -m pip install torch==2.1.2 --upgrade --index-url https://download.pytorch.org/whl/cu121" + ansible.builtin.command: "{{ vllm_python_version }} -m pip install {{ vllm_pytorch_cuda_version }}" changed_when: true failed_when: false register: pytorch_package_status diff --git a/tools/roles/vllm/tasks/check_software_config_file.yml b/tools/roles/vllm/tasks/check_software_config_file.yml new file mode 100644 index 000000000..aca172ba4 --- /dev/null +++ b/tools/roles/vllm/tasks/check_software_config_file.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +--- + +- name: Load software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_json_file }}" + name: software_config + +- name: Check if vllm entry is present in software_config.json + ansible.builtin.set_fact: + vllm_present: "{{ software_config['softwares'] | selectattr('name', 'equalto', 'vllm') | list | count > 0 }}" + +- name: Fail if vllm entry is not present in software_config.json + ansible.builtin.fail: + msg: "{{ vllm_not_in_software_config }}" + when: not vllm_present diff --git a/tools/roles/vllm/tasks/inv_check.yml b/tools/roles/vllm/tasks/inv_check.yml new file mode 100644 index 000000000..9ed0f0b7b --- /dev/null +++ b/tools/roles/vllm/tasks/inv_check.yml @@ -0,0 +1,39 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +- name: Inventory not provided + ansible.builtin.fail: + msg: "{{ vllm_empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) + +- name: Validate kube_control_plane and kube_node group in inventory + ansible.builtin.assert: + that: + - groups['kube_control_plane'] is defined + - groups['kube_node'] is defined + fail_msg: "{{ fail_inv_format }}" + +- name: Validate kube_control_plane group + ansible.builtin.assert: + that: "groups['kube_control_plane'] | length | int == 1" + fail_msg: "{{ fail_node_kube_control_plane }}" + +- name: Validate kube_node group + ansible.builtin.assert: + that: "groups['kube_node'] | length | int >= 1" + fail_msg: "{{ fail_no_node_kube_node }}" diff --git a/tools/roles/vllm/tasks/main.yml b/tools/roles/vllm/tasks/main.yml index a6c9dcae0..00a1f3e9c 100644 --- a/tools/roles/vllm/tasks/main.yml +++ b/tools/roles/vllm/tasks/main.yml @@ -13,6 +13,20 @@ # limitations under the License. 
--- +# Supported OS check +- name: Load software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_json_file }}" + name: software_config + +- name: Display warning if cluster OS is not a supported OS for vllm + ansible.builtin.pause: + seconds: "{{ wait_time }}" + prompt: "{{ unsupported_os_msg }}" + when: + - (software_config.cluster_os_type == 'ubuntu' and software_config.cluster_os_version != '22.04') or + (software_config.cluster_os_type == 'rhel' and software_config.cluster_os_version != '8.8') + # Initial variables - name: Initialize vllm_prerequisite_status ansible.builtin.set_fact: diff --git a/tools/roles/vllm/tasks/vllm_install.yml b/tools/roles/vllm/tasks/vllm_install.yml index 939184d5a..09d45bf9d 100644 --- a/tools/roles/vllm/tasks/vllm_install.yml +++ b/tools/roles/vllm/tasks/vllm_install.yml @@ -18,6 +18,7 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "amd" block: - name: Pull vLLM container image @@ -40,10 +41,11 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "nvidia" block: - name: Install vLLM package - ansible.builtin.command: "{{ vllm_python_version }} -m pip install {{ vllm_nvidia_package }}" + ansible.builtin.command: "{{ vllm_python_version }} -m pip install {{ vllm_nvidia_package }} {{ vllm_numpy_package }}" changed_when: false failed_when: false register: nvidia_deployment_output diff --git a/tools/roles/vllm/tasks/vllm_verify.yml b/tools/roles/vllm/tasks/vllm_verify.yml index d1c97a77b..977c8dabd 100644 --- a/tools/roles/vllm/tasks/vllm_verify.yml +++ b/tools/roles/vllm/tasks/vllm_verify.yml @@ -47,6 +47,7 @@ environment: http_proxy: "{{ http_proxy }}" https_proxy: "{{ https_proxy }}" + no_proxy: "{{ oim_hostname }},{{ admin_nic_ip }}" when: processing_unit == "nvidia" block: - name: Run vllm_nvidia_example diff --git a/tools/roles/vllm/vars/main.yml b/tools/roles/vllm/vars/main.yml index fcfca7e0c..d1097dc2a 100644 --- a/tools/roles/vllm/vars/main.yml +++ b/tools/roles/vllm/vars/main.yml @@ -19,7 +19,22 @@ vllm_warning: "The vLLM container requires at least 60GB of storage. Additionall so please ensure that the necessary storage is available." installation_success_status: "vLLM succesfully installed" installation_failed_status: "Failed to install vLLM" -vllm_python_version: "python3.9" +unsupported_os_msg: "The vLLM feature is only supported on Ubuntu 22.04 and RHEL 8.8. Installation may fail on other OS versions." + +# Validate inventory +fail_no_node_kube_node: "Failed. vllm software is present in software_config.json. No node is part of kube_node group in inventory" +fail_inv_format: "Failed. vllm software is present in software_config.json. +Invalid inventory format, specify kube_control_plane and kube_node." +fail_node_kube_control_plane: "Failed. vllm software is present in software_config.json. +There should be exactly one entry for kube_control_plane in the inventory" +vllm_empty_inventory_fail_msg: | + "Failed. Inventory not provided. + Please re-run the playbook with an inventory that includes the groups 'kube_control_plane' and 'kube_node' by using the -i inventory option" + +# For checking entry in software_config.json +vllm_not_in_software_config: | + "vllm is not present in software_config.json. 
+ Please mention vllm in software_config.json and execute vllm.yml again" # For env proxy setup local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml" @@ -28,8 +43,8 @@ local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml" software_config_json_file: "{{ role_path }}/../../../input/software_config.json" software_config_parameters_fail_msg: "Failed. Please ensure cluster_os_type, cluster_os_verion, repo_config, softwares are defined in software_config.json" vllm_python_package: - - python3-pip - - python3.9-distutils + - "{{ vllm_python_version.split('.')[0] }}-pip" + - "{{ vllm_python_version }}-distutils" # vllm.json file path vllm_json_file: "{{ role_path }}/../../../input/config/{{ cluster_os_type }}/{{ cluster_os_version }}/vllm.json" @@ -44,6 +59,7 @@ error_check_pytorch_failed: "Failed to install dependency - pytorch" omnia_foler_path: "/opt/omnia" omnia_foler_stat: "directory" omnia_foler_mode: "0755" +ubuntu_os: "ubuntu" # Usage: vllm_install.yml vllm_run_cmd_start: "nerdctl run -it --network=host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device /dev/kfd " @@ -51,8 +67,8 @@ vllm_run_cmd_mid: " -v /opt/omnia/:/app/model " vllm_run_cmd_end: " /bin/bash -c 'export http_proxy={{ http_proxy }} && export https_proxy={{ https_proxy }} && python /app/model/vllm_example.py'" # Errors: vllm_install.yml -error_pull_container: "Failed to install vLLM container on AMD GPU node. Please ensure you have enough disk space (recommended: 100 GB) or check -your network connection to the local repository" +error_pull_container: "Failed to install vLLM container on AMD GPU node. Please ensure you have enough disk space (recommended: 100 GB) or check your +network connection to the local repository" error_vllm_package: "Failed to install vLLM on NVIDIA node. 
Please ensure you have enough disk space (recommended: 100 GB) or check your network connection to the local repository" diff --git a/tools/tensorflow.yml b/tools/tensorflow.yml index 4a377f174..053f41437 100644 --- a/tools/tensorflow.yml +++ b/tools/tensorflow.yml @@ -17,12 +17,29 @@ ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) +- name: Inventory Check + hosts: localhost + tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + + - name: Check entry in software_config json + ansible.builtin.include_role: + name: tensorflow + tasks_from: check_software_config_file.yml + + - name: Check inventory format + ansible.builtin.include_role: + name: tensorflow + tasks_from: inv_check.yml + - name: Update Repositories/Registries on nodes ansible.builtin.import_playbook: ../utils/update_user_repo.yml when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool ) - name: Installing tensorflow - hosts: kube_node, kube_control_node + hosts: kube_node, kube_control_plane gather_facts: false roles: - tensorflow diff --git a/tools/vllm.yml b/tools/vllm.yml index 00484445d..6f619d41d 100644 --- a/tools/vllm.yml +++ b/tools/vllm.yml @@ -17,6 +17,23 @@ ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) +- name: Inventory Check + hosts: localhost + tasks: + - name: Set flag to indicate check_venv.yml has been executed + ansible.builtin.set_fact: + check_venv_executed: true + + - name: Check entry in software_config json + ansible.builtin.include_role: + name: vllm + tasks_from: check_software_config_file.yml + + - name: Check inventory format + ansible.builtin.include_role: + name: vllm + tasks_from: inv_check.yml + - name: Update Repositories/Registries on nodes ansible.builtin.import_playbook: ../utils/update_user_repo.yml when: not ( hostvars['127.0.0.1']['update_user_repo_executed'] | default(false) | bool ) diff --git a/upgrade/ansible.cfg b/upgrade/ansible.cfg index d65ead48c..fb6e4204f 100644 --- a/upgrade/ansible.cfg +++ b/upgrade/ansible.cfg @@ -1,10 +1,12 @@ [defaults] log_path = /var/log/omnia/upgrade.log -roles_path = ./roles:../prepare_cp/roles:../discovery/roles +roles_path = ./roles:../prepare_oim/roles:../discovery/roles:../utils/roles host_key_checking = false forks = 5 timeout = 180 -display_skipped_hosts = false +executable = /bin/bash +collections_path = $VIRTUAL_ENV +inventory= /opt/omnia/omnia_inventory/ [persistent_connection] command_timeout = 180 diff --git a/upgrade/prepare_config.yml b/upgrade/prepare_config.yml deleted file mode 100644 index 5197311bf..000000000 --- a/upgrade/prepare_config.yml +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Validate parameters - hosts: localhost - connection: local - roles: - - validate_upgrade_config - -- name: Set upgrade status - hosts: localhost - connection: local - tasks: - - name: Validate omnia version - ansible.builtin.include_role: - name: "{{ playbook_dir }}/roles/validate_omnia_version" # noqa:role-name[path] - - - name: Invoke omnia 1.6 pre-req - ansible.builtin.command: sh "{{ playbook_dir }}/../prereq.sh" - changed_when: true - when: upgrade_status - -- name: Import parameters from Omnia - hosts: localhost - connection: local - tasks: - - name: Import parameters and backup Omniadb - when: upgrade_status - block: - - name: Import input parameters - ansible.builtin.include_role: - name: import_input_parameters - - - name: Encrypt inputs - ansible.builtin.include_role: - name: encrypt_inputs - - - name: Backup omniadb - ansible.builtin.include_role: - name: backup_omniadb - - - name: Upgrade inventory - ansible.builtin.include_role: - name: upgrade_inventory - - - name: Backup telemetry - ansible.builtin.include_role: - name: backup_telemetry - -- name: Invoke specific tasks from update_metadata role - hosts: localhost - connection: local - tasks: - - name: Include only update tasks from upgrade - ansible.builtin.include_role: - name: update_metadata - tasks_from: update.yml - when: not upgrade_status - - - name: Include only update_metadata tasks - ansible.builtin.include_role: - name: update_metadata - tasks_from: update_metadata.yml - -- name: Display User Message - hosts: localhost - connection: local - tasks: - - name: Include user message - ansible.builtin.include_role: - name: "{{ playbook_dir }}/roles/user_messages" # noqa:role-name[path] - - - name: Print user upgrade message - ansible.builtin.debug: - msg: "{{ user_msg_prepare_config.split('\n') }}" - when: upgrade_status - - - name: Print user message - ansible.builtin.debug: - msg: "{{ user_msg_prepare_config2.split('\n') }}" - when: not upgrade_status diff --git a/upgrade/prepare_upgrade.yml b/upgrade/prepare_upgrade.yml deleted file mode 100644 index 638894a63..000000000 --- a/upgrade/prepare_upgrade.yml +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Set upgrade status - hosts: localhost - connection: local - tasks: - - name: Validate omnia version - ansible.builtin.include_role: - name: "{{ playbook_dir }}/roles/validate_omnia_version" # noqa:role-name[path] - -# This playbook should be invoked with -i option pointing to omnia 1.5 inventory file -- name: Validate parameters - hosts: localhost - connection: local - tasks: - - name: Validate parameters - when: upgrade_status - block: - - name: Encrypt inputs - ansible.builtin.include_role: - name: encrypt_inputs - - - name: Validate input configs - ansible.builtin.include_role: - name: validate_input_configs - -- name: Cleanup Control plane - hosts: localhost - connection: local - tasks: - - name: Cleanup Control plane - when: upgrade_status - block: - - name: Docker registry uninstall - ansible.builtin.include_role: - name: docker_registry_uninstall - - - name: Telemetry uninstall - ansible.builtin.include_role: - name: telemetry_uninstall - -# Uninstall k8s on omnia 1.5 cluster -- name: Uninstall kubernetes on k8s worker - hosts: compute - tasks: - - name: Uninstall kubernetes - when: upgrade_status - block: - - name: Uninstall k8s cluster - ansible.builtin.include_role: - name: uninstall_k8s_cluster - - - name: Preinstall cluster cleanup - ansible.builtin.include_role: - name: preinstall_cluster_cleanup - -- name: Uninstall kubernetes on k8s manager - hosts: manager - tasks: - - name: Uninstall kubernetes - when: upgrade_status - block: - - name: Uninstall k8s cluster - ansible.builtin.include_role: - name: uninstall_k8s_cluster - - - name: Preinstall cluster cleanup - ansible.builtin.include_role: - name: preinstall_cluster_cleanup - -- name: Uninstall openldap - hosts: manager, compute, login - tasks: - - name: Uninstall openldap - when: upgrade_status - block: - - name: Uninstall open_ldap - ansible.builtin.include_role: - name: uninstall_open_ldap - -- name: Run local_repo.yml on new input directory - ansible.builtin.import_playbook: "{{ playbook_dir}}/../local_repo/local_repo.yml" - when: upgrade_status - -- name: Invoke omnia 1.6 prepare_cp roles # noqa:role-name[path] - hosts: localhost - connection: local - tasks: - - name: Invoke omnia 1.6 prepare_cp - when: upgrade_status - block: - - name: Invoke configure proxy - ansible.builtin.include_role: - name: "{{ playbook_dir }}/../prepare_cp/roles/configure_proxy" # noqa:role-name[path] - - - name: Invoke telemetry cp - ansible.builtin.include_role: - name: "{{ playbook_dir }}/../prepare_cp/roles/omnia_telemetry_cp" # noqa:role-name[path] - - - name: Invoke omnia appliance - ansible.builtin.include_role: - name: "{{ playbook_dir }}/../prepare_cp/roles/omnia_appliance_cp" # noqa:role-name[path] - - - name: Upgrade omniadb - ansible.builtin.include_role: - name: upgrade_omniadb - - - name: Upgrade xcat - ansible.builtin.include_role: - name: upgrade_xcat - - - name: Invoke prepare_cp - ansible.builtin.include_role: - name: prepare_cp_for_upgrade - - - name: Import input parameters - ansible.builtin.include_role: - name: metadata_update - -- name: Display User Message - hosts: localhost - connection: local - tasks: - - name: Include user message - ansible.builtin.include_role: - name: "{{ playbook_dir }}/roles/user_messages" # noqa:role-name[path] - - - name: Print user upgrade message - ansible.builtin.debug: - msg: "{{ user_msg_prepare_upgrade.split('\n') }}" - when: upgrade_status - - - name: Print user message - ansible.builtin.debug: - msg: "{{ user_msg_prepare_upgrade2.split('\n') }}" - when: not upgrade_status 
diff --git a/upgrade/roles/backup_telemetry/tasks/main.yml b/upgrade/restore_oim.yml similarity index 78% rename from upgrade/roles/backup_telemetry/tasks/main.yml rename to upgrade/restore_oim.yml index fac5574f0..9c57f1808 100644 --- a/upgrade/roles/backup_telemetry/tasks/main.yml +++ b/upgrade/restore_oim.yml @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Include telemetry config vars - ansible.builtin.include_tasks: include_telemetry_config.yml -- name: Dump telmetry db - ansible.builtin.include_tasks: telemetry_dump.yml +- name: Restore k8s, omnia_telemetry and omnia_inventory files on Omnia Infrastructure Manager + hosts: localhost + gather_facts: true + roles: + - restore_oim diff --git a/upgrade/roles/backup_k8s/files/backup.sh b/upgrade/roles/backup_k8s/files/backup.sh new file mode 100644 index 000000000..65a339cc8 --- /dev/null +++ b/upgrade/roles/backup_k8s/files/backup.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Backup all resources +kubectl get all --all-namespaces -o yaml > all-resources.yaml + +# Backup ConfigMaps and Secrets +kubectl get configmaps --all-namespaces -o yaml > configmaps.yaml +kubectl get secrets --all-namespaces -o yaml > secrets.yaml + +# Backup Deployments +kubectl get deployments --all-namespaces -o yaml > deployments.yaml + +# Backup PVCs +kubectl get pvc --all-namespaces -o yaml > pvcs.yaml + +# Backup PVs +kubectl get pv -A -o yaml > pv.yaml + +# Backup CRDs +kubectl get crd -o yaml > crds.yaml + +# Backup Cluster Roles and Role Bindings +kubectl get clusterroles -o yaml > clusterroles.yaml +kubectl get clusterrolebindings -o yaml > clusterrolebindings.yaml + +# Backup Namespaces +kubectl get namespaces -o yaml > namespaces.yaml + +# Backup Service Accounts +kubectl get serviceaccounts --all-namespaces -o yaml > serviceaccounts.yaml + +# Backup Network Policies +kubectl get networkpolicies --all-namespaces -o yaml > networkpolicies.yaml + +# Backup Resource Quotas and Limit Ranges +kubectl get resourcequotas --all-namespaces -o yaml > resourcequotas.yaml +kubectl get limitranges --all-namespaces -o yaml > limitranges.yaml + +# Statefulsets +kubectl get statefulsets -A -o yaml > statefulsets.yaml + +# Daemonsets +kubectl get daemonsets --all-namespaces -o yaml > all-daemonsets.yaml +echo "Backup completed successfully." diff --git a/upgrade/roles/backup_k8s/tasks/etcd.yml b/upgrade/roles/backup_k8s/tasks/etcd.yml new file mode 100644 index 000000000..1ace15b7f --- /dev/null +++ b/upgrade/roles/backup_k8s/tasks/etcd.yml @@ -0,0 +1,136 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
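+# This task file only saves an etcd snapshot; restoring is out of scope for
+# this change. For reference, an illustrative (hypothetical here) restore:
+#   etcdctl snapshot restore <snapshot.db> --data-dir <new-data-dir>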
+---
+
+- name: Install kubernetes pip module
+  ansible.builtin.pip:
+    name: "{{ k8s_pip_package }}"
+    state: present
+    executable: pip3
+
+- name: Get info for all Kubernetes services
+  kubernetes.core.k8s_info:
+    kind: Service
+  register: svc_info
+
+# Pin each matching LoadBalancer service to the ingress IP it currently holds
+- name: Allocate static IP to all loadbalancer services
+  kubernetes.core.k8s:
+    state: present
+    definition:
+      apiVersion: v1
+      kind: Service
+      metadata:
+        name: "{{ item.metadata.name }}"
+        namespace: "{{ item.metadata.namespace }}"
+      spec:
+        type: LoadBalancer
+        loadBalancerIP: "{{ item.status.loadBalancer.ingress[0].ip }}"
+  loop: "{{ svc_info.resources }}"
+  when: item.spec.type == 'LoadBalancer' and item.metadata.name in ['grafana', 'timescaledb']
+  no_log: true
+
+- name: Check existence of "{{ etcd_env_file }}"
+  ansible.builtin.stat:
+    path: "{{ etcd_env_file }}"
+  register: etcd_env_file_result
+
+- name: Fail if etcd env file does not exist
+  ansible.builtin.fail:
+    msg: "{{ etcd_file_fail_msg }}"
+  when: not etcd_env_file_result.stat.exists
+
+- name: Read the "{{ etcd_env_file }}"
+  ansible.builtin.command: cat "{{ etcd_env_file }}"
+  register: etcd_env
+  changed_when: false
+  no_log: true
+
+- name: Set environment variables as facts
+  ansible.builtin.set_fact:
+    env_vars: "{{ env_vars | default({}) | combine({item.split('=')[0]: item.split('=')[1]}) }}"
+  loop: "{{ etcd_env.stdout_lines }}"
+  when: item and '=' in item and not item.startswith('#')
+  no_log: true
+
+- name: Run etcdctl snapshot save command
+  ansible.builtin.command: >
+    etcdctl snapshot save {{ snapshot_db_name }}
+    --endpoints={{ env_vars.ETCD_ADVERTISE_CLIENT_URLS }}
+    --cacert={{ env_vars.ETCD_TRUSTED_CA_FILE }}
+    --cert={{ env_vars.ETCD_CERT_FILE }}
+    --key={{ env_vars.ETCD_KEY_FILE }}
+  register: snapshot_result
+  failed_when: false
+  changed_when: false
+
+- name: Fail if etcd snapshot save was not successful
+  ansible.builtin.fail:
+    msg: "{{ etcd_snapshot_save_fail_msg }}"
+  when: snapshot_result.rc != 0
+
+- name: Run etcdctl snapshot status to check snapshot
+  ansible.builtin.command: etcdctl snapshot status {{ snapshot_db_name }}
+  register: snapshot_status
+  failed_when: false
+  changed_when: false
+
+- name: Fail if etcd snapshot status check was not successful
+  ansible.builtin.fail:
+    msg: "{{ etcd_snapshot_status_fail_msg }}"
+  when: snapshot_status.rc != 0
+
+- name: Copy mount location data to k8s_backup_location folder
+  ansible.builtin.copy:
+    src: "{{ mount_location.rstrip('/') }}"
+    dest: "{{ k8s_backup_location }}"
+    mode: "{{ directory_mode }}"
+
+- name: Create manifests directory
+  ansible.builtin.file:
+    path: "{{ manifests_dir }}"
+    state: directory
+    mode: "{{ directory_mode }}"
+
+- name: Copy backup.sh file
+  ansible.builtin.copy:
+    src: "{{ backup_file_source }}"
+    dest: "{{ shell_script_file }}"
+    mode: "{{ script_file_mode }}"
+
+- name: Take backup of manifest files
+  ansible.builtin.command: "{{ shell_script_file }}"
+  args:
+    chdir: "{{ manifests_dir }}"
+  register: shell_output
+  changed_when: false
+
+- name: Remove existing k8s tarball if it exists
+  ansible.builtin.file:
+    path: "{{ k8s_backup_location_tarball }}"
+    state: absent
+
+- name: Create a tarball of k8s backup location
+  community.general.archive:
+    path: "{{ k8s_backup_location }}"
+    dest: "{{ k8s_backup_location_tarball }}"
+    mode: "{{ file_permission }}"
+    format: gz
+
+- name: Show the created tarball name
+  ansible.builtin.debug:
+    msg: "Created tarball: {{ k8s_backup_location_tarball }} at {{ backup_location }}"
+
+- name: Display backup message
+  ansible.builtin.debug:
+    var: shell_output.stdout
diff --git a/upgrade/roles/backup_k8s/tasks/main.yml b/upgrade/roles/backup_k8s/tasks/main.yml
new file mode 100644
index 000000000..fd0c2bb33
--- /dev/null
+++ b/upgrade/roles/backup_k8s/tasks/main.yml
@@ -0,0 +1,34 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Backup for k8s
+  when: k8s_upgrade_precheck_flag and k8s_backup_status
+  block:
+    - name: Create k8s_backup_location directory
+      ansible.builtin.file:
+        path: "{{ k8s_backup_location }}"
+        state: directory
+        mode: "{{ directory_mode }}"
+
+    - name: Backup MySQL db
+      ansible.builtin.include_tasks: mysqldb.yml
+      when: mysqldb_backup_flag
+
+    - name: Backup timescale db
+      ansible.builtin.include_tasks: timescaledb.yml
+      when: timescaledb_backup_flag
+
+    - name: Backup etcd
+      ansible.builtin.include_tasks: etcd.yml
diff --git a/upgrade/roles/backup_k8s/tasks/mysqldb.yml b/upgrade/roles/backup_k8s/tasks/mysqldb.yml
new file mode 100644
index 000000000..e182cbf5f
--- /dev/null
+++ b/upgrade/roles/backup_k8s/tasks/mysqldb.yml
@@ -0,0 +1,61 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get MySQL Pod Name
+  ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" -l app="mysqldb" -o jsonpath='{.items[0].metadata.name}'
+  register: mysql_pod_name
+  changed_when: false
+  failed_when: false
+
+- name: Check if MySQL Pod Exists
+  ansible.builtin.command: kubectl get pod "{{ mysql_pod_name.stdout }}" -n "{{ telemetry_namespace }}" --no-headers
+  register: pod_check
+  changed_when: false
+  ignore_errors: true
+  no_log: true
+
+- name: Fail if MySQL Pod Does Not Exist
+  ansible.builtin.fail:
+    msg: "Pod '{{ mysql_pod_name.stdout }}' does not exist in namespace '{{ telemetry_namespace }}'"
+  when: pod_check.rc != 0
+
+- name: Get MySQL pod status
+  ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" "{{ mysql_pod_name.stdout }}" -o jsonpath='{.status.phase}'
+  register: pod_status
+  when: pod_check.rc == 0
+  failed_when: false
+  changed_when: false
+
+- name: Fail if MySQL Pod is Not Running
+  ansible.builtin.fail:
+    msg: "Pod '{{ mysql_pod_name.stdout }}' is not running. Current status: {{ pod_status.stdout }}"
+  when: pod_status.stdout != 'Running'
+
+- name: Backup for MySQL Database
+  block:
+    - name: Backup MySQL Database
+      ansible.builtin.command:
+        kubectl exec -n {{ telemetry_namespace }} {{ mysql_pod_name.stdout }} -- sh -c 'mysqldump -u {{ mysqldb_user }} -p"{{ mysqldb_password }}" {{ mysqldb_name }} > /tmp/{{ mysql_telemetry_db_backup_file }}' # noqa: yaml[line-length]
+      register: backup_result
+      changed_when: false
+      no_log: true
+
+    - name: Fail if mysqldb backup was not successful
+      ansible.builtin.fail:
+        msg: "MySQL db backup failed: {{ backup_result.stderr }}"
+      when: backup_result.rc != 0
+
+    - name: Copy backup file from pod to "{{ k8s_backup_location }}"
+      ansible.builtin.command: kubectl cp "{{ telemetry_namespace }}"/{{ mysql_pod_name.stdout }}:"/tmp/{{ mysql_telemetry_db_backup_file }}" "{{ k8s_backup_location }}/{{ mysqldb_local_backup_file }}" # noqa: yaml[line-length]
+      changed_when: false
diff --git a/upgrade/roles/backup_k8s/tasks/timescaledb.yml b/upgrade/roles/backup_k8s/tasks/timescaledb.yml
new file mode 100644
index 000000000..8db54b2f1
--- /dev/null
+++ b/upgrade/roles/backup_k8s/tasks/timescaledb.yml
@@ -0,0 +1,84 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get timescaledb pod name
+  ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" -l app="{{ timescaledb_k8s_name }}" -o jsonpath="{.items[0].metadata.name}"
+  register: timescaledb_pod_name
+  changed_when: false
+  failed_when: false
+
+- name: Check if Pod Exists
+  ansible.builtin.command: kubectl get pod "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" --no-headers
+  register: timescaledb_pod_check
+  changed_when: false
+  ignore_errors: true
+  no_log: true
+
+- name: Fail if Pod Does Not Exist
+  ansible.builtin.fail:
+    msg: "Pod '{{ timescaledb_pod_name.stdout }}' does not exist in namespace '{{ telemetry_namespace }}'"
+  when: timescaledb_pod_check.rc != 0
+
+- name: Get pod status
+  ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" "{{ timescaledb_pod_name.stdout }}" -o jsonpath='{.status.phase}'
+  register: timescaledb_pod_status
+  when: timescaledb_pod_check.rc == 0
+  failed_when: false
+  changed_when: false
+
+- name: Print pod status
+  ansible.builtin.debug:
+    msg: "The timescaledb pod status is {{ timescaledb_pod_status.stdout }}. Telemetry backup will be taken only when the pod is in running state."
+
+- name: Fail if Pod is Not Running
+  ansible.builtin.fail:
+    msg: "Pod '{{ timescaledb_pod_name.stdout }}' is not running. Current status: {{ timescaledb_pod_status.stdout }}"
+  when: timescaledb_pod_status.stdout != 'Running'
+
+- name: Get external IP of timescaledb service
+  ansible.builtin.command: kubectl get svc "{{ timescaledb_k8s_name }}" -n "{{ telemetry_namespace }}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+  register: timescaledb_service_external_ip
+  failed_when: false
+  changed_when: false
+
+- name: Dump database
+  ansible.builtin.command: kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" -- pg_dump -d "postgres://{{ timescaledb_user }}:{{ timescaledb_password }}@{{ timescaledb_service_external_ip.stdout }}:5432/{{ database_name }}" --format=plain --quote-all-identifiers --no-tablespaces --no-owner --no-privileges --file="{{ timescale_telemetry_backup_file }}" # noqa: yaml[line-length]
+  changed_when: false
+  register: dump_telemetry_result
+  when:
+    - "'running' in timescaledb_pod_status.stdout | lower"
+
+- name: Copy backup file from pod to "{{ k8s_backup_location }}"
+  ansible.builtin.command: kubectl cp "{{ telemetry_namespace }}"/{{ timescaledb_pod_name.stdout }}:"{{ timescale_telemetry_backup_file }}" "{{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }}" # noqa: yaml[line-length]
+  changed_when: false
+  when:
+    - dump_telemetry_result is defined
+    - dump_telemetry_result.rc == 0
+
+- name: Create tar of timescaledb pod data
+  ansible.builtin.shell: >
+    set -o pipefail && \
+    kubectl exec "{{ timescaledb_pod_name.stdout }}"
+    -n "{{ telemetry_namespace }}" -- tar -czf "{{ idrac_telemetry_path }}" -C "{{ postgresql_pod_data }}" .
+  register: timescaledb_tar
+  changed_when: timescaledb_tar.rc == 0
+  when: timescaledb_pod_check.rc == 0
+
+- name: Copy timescaledb_data tar file to k8s_backup_location
+  ansible.builtin.copy:
+    src: "{{ idrac_mnt_tar }}"
+    dest: "{{ k8s_backup_location }}"
+    mode: "{{ file_mode }}"
+  when: timescaledb_tar.rc == 0
diff --git a/upgrade/roles/backup_k8s/vars/main.yml b/upgrade/roles/backup_k8s/vars/main.yml
new file mode 100644
index 000000000..d45ffc2d6
--- /dev/null
+++ b/upgrade/roles/backup_k8s/vars/main.yml
@@ -0,0 +1,48 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
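+# A restore from the snapshot produced by etcd.yml would look roughly like the
+# sketch below; the snapshot path follows snapshot_db_name defined in this file,
+# while the target data directory is an assumption to verify against your etcd setup:
+#   etcdctl snapshot restore <backup_location>/k8s/snapshot.db --data-dir /var/lib/etcd-restored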
+---
+
+# Usage: main.yml
+k8s_backup_location: "{{ backup_location }}/k8s"
+file_permission: '0644'
+directory_mode: "755"
+
+# Usage: timescaledb.yml
+database_name: "telemetry_metrics"
+telemetry_namespace: "telemetry-and-visualizations"
+timescaledb_k8s_name: timescaledb
+timescale_telemetry_backup_file: "telemetry_tsdb_dump.sql"
+idrac_telemetry_path: "/go/src/github.com/telemetry-reference-tools/omnia_timescaledb.tar.gz"
+postgresql_pod_data: "/var/lib/postgresql/data"
+idrac_mnt_tar: "{{ mount_location }}/iDRAC-Telemetry-Reference-Tools/omnia_timescaledb.tar.gz"
+file_mode: "0644"
+
+# Usage: mysqldb.yml
+mysqldb_k8s_name: mysqldb
+mysqldb_local_backup_file: "mysqldb_dump.sql"
+mysqldb_name: "idrac_telemetrysource_services_db"
+mysql_telemetry_db_backup_file: "idrac_telemetrysource_services_db_backup.sql"
+
+# Usage: etcd.yml
+k8s_pip_package: "kubernetes==30.1.0"
+etcd_env_file: "/etc/etcd.env"
+etcd_file_fail_msg: "Failed. {{ etcd_env_file }} doesn't exist on the system, so backup of etcd cannot be performed."
+snapshot_db_name: "{{ k8s_backup_location }}/snapshot.db"
+etcd_snapshot_save_fail_msg: "Backup of etcd snapshot failed: {{ snapshot_result.stderr }}"
+etcd_snapshot_status_fail_msg: "Backup of etcd snapshot status failed: {{ snapshot_status.stderr }}"
+manifests_dir: "{{ k8s_backup_location }}/manifests"
+shell_script_file: "{{ k8s_backup_location }}/backup.sh"
+# Executable mode for backup.sh
+script_file_mode: "744"
+backup_file_source: "backup.sh"
+k8s_backup_location_tarball: "{{ k8s_backup_location }}.tar.gz"
diff --git a/upgrade/roles/backup_telemetry/tasks/telemetry_dump.yml b/upgrade/roles/backup_telemetry/tasks/telemetry_dump.yml
deleted file mode 100644
index 41559854c..000000000
--- a/upgrade/roles/backup_telemetry/tasks/telemetry_dump.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-- name: Get external IP of timescaledb service
-  ansible.builtin.command: kubectl get svc "{{ timescaledb_k8s_name }}" -n "{{ telemetry_namespace }}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
-  register: timescaledb_service_external_ip
-  failed_when: false
-  changed_when: false
-
-- name: Get timescaledb pod name
-  ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" -l app="{{ timescaledb_k8s_name }}" -o jsonpath="{.items[0].metadata.name}"
-  register: timescaledb_pod_name
-  changed_when: false
-  failed_when: false
-
-- name: Get pod status
-  ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" "{{ timescaledb_pod_name.stdout }}" -o jsonpath='{.status.phase}'
-  register: pod_status
-  failed_when: false
-  changed_when: false
-
-- name: Print pod status
-  ansible.builtin.debug:
-    msg: "The timescaledb pod status is {{ pod_status.stdout }}.Telemetry backup will be taken only when pod is in running state."
- -- name: Dump database - ansible.builtin.command: kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" -- pg_dump -d "postgres://{{ timescaledb_user }}:{{ timescaledb_password }}@{{ timescaledb_service_external_ip.stdout }}:5432/{{ database_name }}" --format=plain --quote-all-identifiers --no-tablespaces --no-owner --no-privileges --file="{{ dump_file }}" # noqa: yaml[line-length] - become: true - changed_when: false - register: dump_telemetry_result - when: - - "'running' in pod_status.stdout | lower" - -- name: Read file path parameters from upgrade_config.yml - ansible.builtin.include_vars: - file: upgrade_config.yml - changed_when: false - -- name: Copy telemetry backup file to backup_location - ansible.builtin.copy: - src: "{{ telemetry_backup_file_path }}" - dest: "{{ backup_location }}" - mode: "{{ file_permission }}" - when: - - dump_telemetry_result.rc is defined - - dump_telemetry_result.rc==0 diff --git a/upgrade/roles/backup_telemetry/vars/main.yml b/upgrade/roles/backup_telemetry/vars/main.yml deleted file mode 100644 index a2c1c810f..000000000 --- a/upgrade/roles/backup_telemetry/vars/main.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -# Usage:telemetry_dump.yml -database_name: "telemetry_metrics" -dump_file: "telemetry_tsdb_dump.sql" -telemetry_namespace: "telemetry-and-visualizations" -timescaledb_k8s_name: timescaledb -telemetry_backup_file_path: "/opt/omnia/telemetry/iDRAC-Telemetry-Reference-Tools/{{ dump_file }}" -file_permission: '0644' - -# Usage:include_telemetry_config.yml -telemetry_config_file: "{{ role_path }}/../../../input/telemetry_config.yml" -telemetry_vault_filename: "{{ role_path }}/../../../input/.telemetry_vault_key" -telemetry_config_syntax_fail_msg: "Failed.Syntax errors present in telemetry_config.yml.Fix errors and re-run playbook again." -vault_file_perm: '0644' diff --git a/upgrade/roles/cp_to_oim/files/omniadb_connection.py b/upgrade/roles/cp_to_oim/files/omniadb_connection.py new file mode 100644 index 000000000..09ddb44d1 --- /dev/null +++ b/upgrade/roles/cp_to_oim/files/omniadb_connection.py @@ -0,0 +1,52 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
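+# A minimal usage sketch (assumes the Fernet key and encrypted password files
+# referenced below exist on the Omnia Infrastructure Manager):
+#
+#   import omniadb_connection
+#   conn = omniadb_connection.create_connection()
+#   cursor = conn.cursor()
+#   cursor.execute("SELECT node FROM cluster.nodeinfo LIMIT 1")
+#   print(cursor.fetchone())
+#   cursor.close()
+#   conn.close()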
+
+import psycopg2 as pg
+from cryptography.fernet import Fernet
+
+key_file_path = '/opt/omnia/.postgres/.postgres_pass.key'
+pass_file_path = '/opt/omnia/.postgres/.encrypted_pwd'
+
+# Read the Fernet key and use it to decrypt the stored Postgres password
+with open(key_file_path, 'rb') as passfile:
+    key = passfile.read()
+fernet = Fernet(key)
+
+with open(pass_file_path, 'rb') as datafile:
+    encrypted_file_data = datafile.read()
+decrypted_pwd = fernet.decrypt(encrypted_file_data).decode()
+
+def create_connection():
+    # Create a connection to the omniadb database
+    conn = pg.connect(
+        database="omniadb",
+        user="postgres",
+        password=decrypted_pwd,
+        host="localhost",
+        port="5432",
+    )
+    conn.autocommit = True
+    return conn
+
+def create_connection_xcatdb():
+    # Create a connection to the xcatdb database
+    conn = pg.connect(
+        database="xcatdb",
+        user="postgres",
+        password=decrypted_pwd,
+        host="localhost",
+        port="5432",
+    )
+    conn.autocommit = True
+    return conn
+
diff --git a/upgrade/roles/cp_to_oim/files/update_oim_db.py b/upgrade/roles/cp_to_oim/files/update_oim_db.py
new file mode 100644
index 000000000..086c7c8bd
--- /dev/null
+++ b/upgrade/roles/cp_to_oim/files/update_oim_db.py
@@ -0,0 +1,53 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import omniadb_connection
+
+# Create a database connection and cursor
+conn = omniadb_connection.create_connection()
+cursor = conn.cursor()
+
+# Define the current and new node_name values
+current_node_name = "control_plane"
+new_node_name = "oim"
+
+try:
+    # Step 1: Check if the record exists
+    select_query = "SELECT * FROM cluster.nodeinfo WHERE node = %s"
+    cursor.execute(select_query, (current_node_name,))
+    record = cursor.fetchone()
+
+    if record:
+        print("Record found:", record)
+
+        # Step 2: Update the record's node_name
+        update_query = "UPDATE cluster.nodeinfo SET node = %s WHERE node = %s"
+        cursor.execute(update_query, (new_node_name, current_node_name))
+
+        # Save changes to the database (a no-op when autocommit is enabled)
+        conn.commit()
+        print("Record updated successfully.")
+    else:
+        print("No record found with node_name =", current_node_name)
+
+except Exception as e:
+    # Handle any errors that occur
+    print("An error occurred:", e)
+    conn.rollback()  # Roll back changes in case of an error
+
+finally:
+    # Close the cursor and connection
+    cursor.close()
+    conn.close()
diff --git a/upgrade/roles/cp_to_oim/tasks/main.yml b/upgrade/roles/cp_to_oim/tasks/main.yml
new file mode 100644
index 000000000..6549d8ab3
--- /dev/null
+++ b/upgrade/roles/cp_to_oim/tasks/main.yml
@@ -0,0 +1,42 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Update node_name in Omnia db from control_plane to oim
+  ansible.builtin.command: |
+    {{ python_version }} {{ oim_db_path }}
+  changed_when: true
+
+- name: Clear the excludenodes value in the xcat site table
+  ansible.builtin.command: "{{ xcat_bin_path }}/chdef -t site excludenodes=''"
+  changed_when: true
+  register: clear_excludenodes_value
+
+- name: Update the hosts table in the xcat db to rename the control_plane entry to oim if present
+  ansible.builtin.command: "{{ xcat_bin_path }}/chdef -t node -o control_plane -n oim"
+  changed_when: true
+  register: update_node_entry
+
+- name: Update the hosts table in the xcat db to change the groups column for the oim entry
+  ansible.builtin.command: "{{ xcat_bin_path }}/chdef oim groups=oim"
+  changed_when: true
+  register: update_group_entry
+
+- name: Set oim as the excludenodes value in the xcat site table
+  ansible.builtin.command: "{{ xcat_bin_path }}/chdef -t site excludenodes={{ oim }}"
+  changed_when: true
+  register: set_excludenodes_value
diff --git a/utils/kernel_param_update/roles/kcmdline_update/tasks/main.yml b/upgrade/roles/cp_to_oim/vars/main.yml
similarity index 75%
rename from utils/kernel_param_update/roles/kcmdline_update/tasks/main.yml
rename to upgrade/roles/cp_to_oim/vars/main.yml
index 6ea3f5d5c..f8ccf970a 100644
--- a/utils/kernel_param_update/roles/kcmdline_update/tasks/main.yml
+++ b/upgrade/roles/cp_to_oim/vars/main.yml
@@ -13,8 +13,8 @@
 # limitations under the License.
 ---
-- name: Adding kernel parameters for OS
-  ansible.builtin.include_tasks: "kcmdline_update_{{ ansible_distribution | lower }}.yml"
-
-- name: Reboot nodes
-  ansible.builtin.include_tasks: reboot_nodes.yml
+# Usage: main.yml
+python_version: "{{ ansible_python_interpreter }}"
+oim_db_path: "{{ role_path }}/files/update_oim_db.py"
+xcat_bin_path: "/opt/xcat/bin"
+oim: "oim"
diff --git a/upgrade/roles/docker_registry_uninstall/tasks/docker_registry_uninstall.yml b/upgrade/roles/docker_registry_uninstall/tasks/docker_registry_uninstall.yml
deleted file mode 100644
index 4ec48aeca..000000000
--- a/upgrade/roles/docker_registry_uninstall/tasks/docker_registry_uninstall.yml
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
---- - -- name: Stop docker-registry service - ansible.builtin.service: - name: docker-registry.service - state: stopped - enabled: false - failed_when: false - -- name: Remove docker-registry file - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ docker_registry_file }}" - failed_when: false - -- name: Stop docker service - ansible.builtin.service: - name: docker.service - state: stopped - enabled: false - failed_when: false - -- name: Reload systemd - ansible.builtin.systemd: - daemon_reload: true - failed_when: false - -- name: Remove docker packages - ansible.builtin.command: dnf remove {{ docker_packages }} -y - changed_when: true - failed_when: false - -- name: Remove docker files - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ docker_del_files }}" - failed_when: false - -- name: Get list of interfaces under Docker zone - ansible.builtin.command: "firewall-cmd --zone=docker --list-interfaces" - register: docker_interfaces - changed_when: false - failed_when: false - -- name: Remove IP addresses assigned to Docker interfaces - ansible.builtin.command: - cmd: "set -o pipefail && ip -4 addr show {{ item }} | awk '/inet / {print $2}'" - with_items: "{{ docker_interfaces.stdout_lines }}" - register: docker_ip_addresses - changed_when: false - failed_when: false - -- name: Remove IP addresses if assigned - ansible.builtin.command: - cmd: "sudo ip addr del {{ item.stdout }} dev {{ item.item }}" - with_items: "{{ docker_ip_addresses.results }}" - when: - - item.stdout is defined - - item.stdout != "" - changed_when: false - failed_when: false - -- name: Delete docker interfaces - ansible.builtin.command: sudo ip link delete "{{ item }}" - loop: "{{ docker_interfaces.stdout_lines }}" - changed_when: false - failed_when: false diff --git a/upgrade/roles/docker_registry_uninstall/vars/main.yml b/upgrade/roles/docker_registry_uninstall/vars/main.yml deleted file mode 100644 index a549f5660..000000000 --- a/upgrade/roles/docker_registry_uninstall/vars/main.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Usage: clean_docker_registry.yml -docker_registry_file: /etc/systemd/system/docker-registry.service -docker_packages: "docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin docker-ce-rootless-extras" -docker_del_files: - - /var/lib/docker - - /var/lib/containerd - - /opt/omnia/containerd - - /etc/yum.repos.d/docker-ce.repo - - /docker-registry diff --git a/upgrade/roles/encrypt_inputs/tasks/encrypt_input_file.yml b/upgrade/roles/encrypt_inputs/tasks/encrypt_input_file.yml index 6ff89952c..0fbc01b25 100644 --- a/upgrade/roles/encrypt_inputs/tasks/encrypt_input_file.yml +++ b/upgrade/roles/encrypt_inputs/tasks/encrypt_input_file.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check input file is encrypted ansible.builtin.command: cat {{ input_folder }}/{{ item.file }} changed_when: false @@ -40,7 +35,7 @@ - name: Encrypt {{ item.file }} ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ input_folder }}/{{ item.file }} + ansible-vault encrypt {{ input_folder }}/{{ item.file }} --vault-password-file {{ input_folder }}/{{ item.vault_password_file }} changed_when: false when: "'$ANSIBLE_VAULT;' not in config_content.stdout" diff --git a/upgrade/roles/import_input_parameters/tasks/encrypt_1_5_inputs.yml b/upgrade/roles/import_input_parameters/tasks/encrypt_input_file.yml similarity index 66% rename from upgrade/roles/import_input_parameters/tasks/encrypt_1_5_inputs.yml rename to upgrade/roles/import_input_parameters/tasks/encrypt_input_file.yml index 5a0532e1d..fd4f99e27 100644 --- a/upgrade/roles/import_input_parameters/tasks/encrypt_1_5_inputs.yml +++ b/upgrade/roles/import_input_parameters/tasks/encrypt_input_file.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,8 @@ # limitations under the License. --- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check input file is encrypted - ansible.builtin.command: cat {{ old_input_location }}/{{ item.file }} + ansible.builtin.command: cat {{ new_input_folder_backup_location }}/{{ conf_list.file }} changed_when: false register: config_content no_log: true @@ -31,16 +26,16 @@ - name: Save vault key ansible.builtin.lineinfile: - path: "{{ old_input_location }}/{{ item.vault_password_file }}" + path: "{{ new_input_folder_backup_location }}/{{ conf_list.vault_password_file }}" line: "{{ vault_key }}" - mode: "{{ vault_file_perm }}" + mode: "{{ file_perm }}" owner: root create: true when: "'$ANSIBLE_VAULT;' not in config_content.stdout" -- name: Encrypt {{ item.file }} +- name: Encrypt {{ conf_list.file }} ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ old_input_location }}/{{ item.file }} - --vault-password-file {{ old_input_location }}/{{ item.vault_password_file }} + ansible-vault encrypt {{ new_input_folder_backup_location }}/{{ conf_list.file }} + --vault-password-file {{ new_input_folder_backup_location }}/{{ conf_list.vault_password_file }} changed_when: false when: "'$ANSIBLE_VAULT;' not in config_content.stdout" diff --git a/upgrade/roles/import_input_parameters/tasks/generate_new_input_files.yml b/upgrade/roles/import_input_parameters/tasks/generate_new_input_files.yml index 20baee178..82b4248fd 100644 --- a/upgrade/roles/import_input_parameters/tasks/generate_new_input_files.yml +++ b/upgrade/roles/import_input_parameters/tasks/generate_new_input_files.yml @@ -12,47 +12,167 @@ # See the License for the specific language governing permissions and # limitations under the License. 
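+# The tasks below decrypt any vaulted input files, read their values, render the
+# new version's templates, and then re-encrypt everything. The vault round-trip
+# each file goes through is equivalent to the following (illustrative placeholder paths):
+#   ansible-vault decrypt <backup>/input/<config_file> --vault-password-file <backup>/input/<vault_key_file>
+#   ansible-vault encrypt <backup>/input/<config_file> --vault-password-file <backup>/input/<vault_key_file>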
--- +- name: Set facts for local_repo_config include_vars + ansible.builtin.set_fact: + beegfs_version: "" + amdgpu_version: "" + rocm_version: "" + cluster_os_version: "" + os_release: "" + openldap_release: "" -- name: Remove k8s line from software_config.json - ansible.builtin.lineinfile: - path: "{{ role_path }}/../../../input/software_config.json" - regexp: '"name": "k8s"' - state: absent - when: scheduler_type == "slurm" +- name: Set fact import_input_parameters_new_config_file_array + ansible.builtin.set_fact: + import_input_parameters_new_config_file_array: [] -- name: Remove slurm line from software_config.json - ansible.builtin.lineinfile: - path: "{{ role_path }}/../../../input/software_config.json" - regexp: '"name": "slurm"' - state: absent - when: scheduler_type == "k8s" +- name: Ensure the destination directory exists + ansible.builtin.file: + path: "{{ new_input_folder_backup_location }}" + state: directory + mode: '0755' # Adjust permissions as needed -- name: Remove openldap line from software_config.json - ansible.builtin.lineinfile: - path: "{{ role_path }}/../../../input/software_config.json" - regexp: '"name": "openldap"' - state: absent - when: not ldap_required - -- name: Add freeipa line to software_config.json if freeipa_required is true - ansible.builtin.lineinfile: - path: "{{ role_path }}/../../../input/software_config.json" - line: ' {"name": "freeipa"},' - insertafter: '\[' - firstmatch: true - when: freeipa_required - -- name: Add telemetry line to software_config.json, when omnia_telemetry_support is enabled - ansible.builtin.lineinfile: - path: "{{ role_path }}/../../../input/software_config.json" - line: ' {"name": "telemetry"},' - insertafter: '\[' - firstmatch: true - when: omnia_telemetry_support +- name: Run cat on config files + ansible.builtin.command: cat "{{ read_input_folder_backup_location }}/input/{{ cat_item.file }}" + changed_when: false + loop: "{{ import_input_parameters_config_file_array }}" + loop_control: + loop_var: cat_item + register: cat_output + no_log: true + +- name: Read stdout into an array + ansible.builtin.set_fact: + stdout_array: "{{ cat_output.results | map(attribute='stdout') | list }}" + +- name: Construct new config files array + ansible.builtin.set_fact: + import_input_parameters_new_config_file_array: "{{ import_input_parameters_new_config_file_array + [{'file_entry': file_item.0, 'stdout': file_item.1}] }}" + loop: "{{ import_input_parameters_config_file_array | zip(stdout_array) | list }}" + loop_control: + loop_var: file_item + no_log: true + +- name: Store input files to be read + ansible.builtin.set_fact: + input_filenames: "{{ import_input_parameters_new_config_file_array | map(attribute='file_entry.file') | list }}" + +- name: Read config files + block: + - name: Decrypt config files + ansible.builtin.command: >- + ansible-vault decrypt {{ read_input_folder_backup_location }}/input/{{ decr_item.file_entry.file }} + --vault-password-file {{ read_input_folder_backup_location }}/input/{{ decr_item.file_entry.vault_password_file }} + loop: "{{ import_input_parameters_new_config_file_array }}" + loop_control: + loop_var: decr_item + when: "'$ANSIBLE_VAULT;' in decr_item.stdout" + no_log: true + changed_when: false + + - name: Read existing parameters from input files + ansible.builtin.include_vars: + file: "{{ read_input_folder_backup_location }}/input/{{ item }}" + name: "{{ item | basename | splitext | first }}_ns" + with_items: "{{ input_filenames }}" + register: result + changed_when: false + failed_when: result 
is failed + + rescue: + - name: Check if any input files read failed + ansible.builtin.set_fact: + read_parameters_failed: true + failed_input_files: "{{ failed_input_files | default([]) + [item.item] }}" + when: item.failed + with_items: "{{ result.results }}" + loop_control: + label: "{{ item.item }}" + + always: + - name: Check if vault password file exists + ansible.builtin.stat: + path: "{{ read_input_folder_backup_location }}/input/{{ vitem.vault_password_file }}" + register: vault_password_file_stat + loop: "{{ import_input_parameters_config_encrypt_array }}" + loop_control: + loop_var: vitem + changed_when: false + + - name: Extract file names where stat.exists is true + ansible.builtin.set_fact: + vault_file_exists_list: "{{ vault_file_exists_list | default([]) + [{'file': item.vitem.file, 'vault_password_file': item.vitem.vault_password_file}] }}" # noqa: yaml[line-length] + when: item.stat.exists + loop: "{{ vault_password_file_stat.results }}" + + - name: Encrypt old input files + ansible.builtin.command: >- + ansible-vault encrypt {{ read_input_folder_backup_location }}/input/{{ eitem.file }} + --vault-password-file {{ read_input_folder_backup_location }}/input/{{ eitem.vault_password_file }} + loop: "{{ vault_file_exists_list }}" + loop_control: + loop_var: eitem + changed_when: false + +- name: Check if read parameters failed + ansible.builtin.fail: + msg: "{{ read_parameters_failed_msg }}" + when: read_parameters_failed is defined and read_parameters_failed + +- name: Set fact version + ansible.builtin.set_fact: + template_path: "{{ role_path }}/templates/{{ folder_name }}" + +- name: Folder name new_input_folder_backup_location + ansible.builtin.debug: + msg: "new_input_folder_backup_location used is {{ new_input_folder_backup_location }}" + +- name: Folder name template + ansible.builtin.debug: + msg: "Template used is {{ template_path }}" - name: Generate config files from templates ansible.builtin.template: - src: "{{ item.src }}" - dest: "{{ role_path }}/../../../input/{{ item.dest }}" - mode: "{{ input_file_perm }}" + src: "{{ template_path }}/{{ my_item.src }}" + dest: "{{ new_input_folder_backup_location }}/{{ my_item.dest }}" + mode: "{{ file_perm }}" loop: "{{ import_input_parameters_config_template_array }}" + loop_control: + loop_var: my_item + +- name: Encrypt config files + ansible.builtin.include_tasks: encrypt_input_file.yml + loop: "{{ input_config_files_array }}" + loop_control: + loop_var: conf_list + no_log: true + +- name: Ensure read_input_folder_backup_location directory is absent + ansible.builtin.file: + path: "{{ read_input_folder_backup_location }}" + state: absent + failed_when: false + +- name: Ensure the destination directory exists + ansible.builtin.file: + path: "{{ read_input_folder_backup_location }}/input" + state: directory + mode: '0755' # Adjust permissions as needed + +- name: Move contents from new_input_folder_backup_location to read_input_folder_backup_location + ansible.posix.synchronize: + mode: push + src: "{{ new_input_folder_backup_location }}/." 
+ dest: "{{ read_input_folder_backup_location }}/input/" + rsync_opts: + - "-a" + - "-v" + register: mv_result + ignore_errors: true + changed_when: false + +- name: Remove the temporary input directory if it still exists and is empty + ansible.builtin.file: + path: "{{ new_input_folder_backup_location }}" + state: absent + when: mv_result is succeeded # Only attempt to remove if the move was successful + failed_when: false diff --git a/upgrade/roles/import_input_parameters/tasks/import_software_config.yml b/upgrade/roles/import_input_parameters/tasks/import_software_config.yml new file mode 100644 index 000000000..edee94d7d --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/import_software_config.yml @@ -0,0 +1,26 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if software_config.json exists + ansible.builtin.stat: + path: "{{ installed_omnia_path }}/input/software_config.json" + register: software_config_exists + +- name: Copy software_config.json file + ansible.builtin.copy: + src: "{{ installed_omnia_path }}/input/software_config.json" + dest: "{{ role_path }}/../../../input/software_config.json" + mode: "{{ file_perm }}" + when: software_config_exists.stat.exists diff --git a/upgrade/roles/import_input_parameters/tasks/main.yml b/upgrade/roles/import_input_parameters/tasks/main.yml index ad6e702c9..7026b9a8b 100644 --- a/upgrade/roles/import_input_parameters/tasks/main.yml +++ b/upgrade/roles/import_input_parameters/tasks/main.yml @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -- name: Read parameters in from 1.5 - ansible.builtin.include_tasks: read_parameters.yml -- name: Set network spec variables and telemetry config variables - ansible.builtin.include_tasks: set_network_spec_variables.yml +- name: Read parameters in from previous version of omnia + ansible.builtin.include_tasks: parameter_based_input_file_generation.yml -- name: Generate new input files - ansible.builtin.include_tasks: generate_new_input_files.yml +- name: Moving new input files to a specified source input directory + ansible.builtin.include_tasks: sync_files_to_omnia_version.yml + +- name: Copying old software_config file to a specified source input directory + ansible.builtin.include_tasks: import_software_config.yml + when: older_os diff --git a/upgrade/roles/import_input_parameters/tasks/parameter_based_input_file_generation.yml b/upgrade/roles/import_input_parameters/tasks/parameter_based_input_file_generation.yml new file mode 100644 index 000000000..4ab5f0e65 --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/parameter_based_input_file_generation.yml @@ -0,0 +1,79 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Get the installed omnia version + ansible.builtin.include_vars: + file: "{{ installed_omnia_path }}/.metadata/omnia_version" + +- name: Set current_omnia_version + ansible.builtin.set_fact: + current_omnia_version: "{{ omnia_version }}" + +- name: Get the upgrade omnia version + ansible.builtin.include_vars: + file: "../../../../.metadata/omnia_version" + +- name: Set upgrade_omnia_version + ansible.builtin.set_fact: + upgrade_omnia_version: "{{ omnia_version }}" + +- name: Print the omnia version fetched + ansible.builtin.debug: + msg: "current omnia version {{ current_omnia_version }} upgrade to {{ upgrade_omnia_version }}" + +- name: Find directories in a specified path + ansible.builtin.find: + paths: "{{ role_path }}/templates/" + file_type: directory + register: found_dirs + +- name: Extract directory names + ansible.builtin.set_fact: + folder_names: "{{ found_dirs.files | map(attribute='path') | map('basename') | list | sort }}" + +- name: Print the list of input version templates + ansible.builtin.debug: + msg: "{{ folder_names }}" + +- name: Set fact read_input_folder_backup_location + ansible.builtin.set_fact: + read_input_folder_backup_location: "/opt/omnia/upgrade_input/" + +- name: Ensure the destination directory exists + ansible.builtin.file: + path: "{{ read_input_folder_backup_location }}" + state: directory + mode: '0755' # Adjust permissions as needed + +- name: Copy directory from installed_omnia_path/input to read_input_folder_backup_location + ansible.posix.synchronize: + mode: push + src: "{{ installed_omnia_path }}/input" + dest: "{{ read_input_folder_backup_location }}" + rsync_opts: + - "-a" + - "-v" + - "--delete" + register: copy_result + changed_when: false + +- name: Read parameters in from previous version + ansible.builtin.include_tasks: generate_new_input_files.yml + loop: "{{ folder_names }}" + vars: + folder_name: "{{ item }}" + new_input_folder_backup_location: "/opt/omnia/{{ item }}" + when: > + current_omnia_version is version(item , '<') diff --git a/upgrade/roles/import_input_parameters/tasks/read_parameters.yml b/upgrade/roles/import_input_parameters/tasks/read_parameters.yml deleted file mode 100644 index a088cee6c..000000000 --- a/upgrade/roles/import_input_parameters/tasks/read_parameters.yml +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Run cat on config files - ansible.builtin.command: cat "{{ old_input_location }}/{{ item.file }}" - changed_when: false - loop: "{{ import_input_parameters_config_file_array }}" - register: cat_output - no_log: true - -- name: Read stdout into an array - ansible.builtin.set_fact: - stdout_array: "{{ cat_output.results | map(attribute='stdout') | list }}" - -- name: Construct new config files array - ansible.builtin.set_fact: - import_input_parameters_new_config_file_array: "{{ import_input_parameters_new_config_file_array + [{'file_entry': item.0, 'stdout': item.1}] }}" - loop: "{{ import_input_parameters_config_file_array | zip(stdout_array) | list }}" - no_log: true - -- name: Store input files to be read - ansible.builtin.set_fact: - input_filenames: "{{ import_input_parameters_new_config_file_array | map(attribute='file_entry.file') | list }}" - -- name: Read config files - block: - - name: Decrypt config files - ansible.builtin.command: >- - ansible-vault decrypt {{ old_input_location }}/{{ item.file_entry.file }} - --vault-password-file {{ old_input_location }}/{{ item.file_entry.vault_password_file }} - loop: "{{ import_input_parameters_new_config_file_array }}" - when: "'$ANSIBLE_VAULT;' in item.stdout" - no_log: true - changed_when: false - - - name: Read existing parameters from input files - ansible.builtin.include_vars: - file: "{{ old_input_location }}/{{ item }}" - with_items: "{{ input_filenames }}" - changed_when: false - - rescue: - - name: Failed to read Omnia 1.5 input config files - ansible.builtin.fail: - msg: "{{ read_parameters_failed_msg }}" - - always: - - name: Encrypt Omnia 1.5 input config files - ansible.builtin.include_tasks: encrypt_1_5_inputs.yml - loop: "{{ import_input_parameters_config_encrypt_array }}" - no_log: true diff --git a/upgrade/roles/import_input_parameters/tasks/set_network_spec_variables.yml b/upgrade/roles/import_input_parameters/tasks/set_network_spec_variables.yml deleted file mode 100644 index ee41757c3..000000000 --- a/upgrade/roles/import_input_parameters/tasks/set_network_spec_variables.yml +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -- name: Extract subnet variables from old input files - ansible.builtin.set_fact: - ip_start_bits: "{{ ip_start_range.split('.') if '.' in ip_start_range else [''] }}" - ip_end_bits: "{{ ip_end_range.split('.') if '.' in ip_end_range else [''] }}" - admin_subnet_bits: "{{ admin_nic_subnet.split('.') if '.' in admin_nic_subnet else [''] }}" - bmc_subnet_bits: "{{ bmc_nic_subnet.split('.') if '.' in bmc_nic_subnet else [''] }}" - ib_subnet_bits: "{{ ib_nic_subnet.split('.') if '.' in ib_nic_subnet else [''] }}" - pod_external_ip_start_bits: "{{ pod_external_ip_start_range.split('.') if '.' in pod_external_ip_start_range else [''] }}" - pod_external_ip_end_bits: "{{ pod_external_ip_end_range.split('.') if '.' 
in pod_external_ip_end_range else [''] }}" - node_start_ip_fourth_octet: "50" - no_log: true - -- name: Initialize validation variables - ansible.builtin.set_fact: - is_valid_ip_range: false - is_valid_admin_subnet: false - is_valid_bmc_subnet: false - is_valid_ib_subnet: false - is_valid_pod_external_ip_range: false - no_log: true - -- name: Validate IP range - ansible.builtin.set_fact: - is_valid_ip_range: true - last_bit_minus_one: "{{ ip_end_bits[3] | int - 1 }}" - admin_network_dynamic_range_third_octet: "{{ ip_start_bits[2] | int + 1 }}" - adm_dynamic_end_range_fourth_octet: "{{ ip_end_bits[3] | int - 55 }}" - dynamic_range_third_octet: "{{ ip_start_bits[2] | int + 2 }}" - when: - - ip_start_bits - - ip_start_bits|length == 4 - - ip_end_bits - - ip_end_bits|length == 4 - - ip_end_bits[3] | int > 0 - no_log: true - -- name: Validate admin subnet - ansible.builtin.set_fact: - is_valid_admin_subnet: true - when: admin_subnet_bits and admin_subnet_bits|length == 4 - no_log: true - -- name: Validate BMC subnet - ansible.builtin.set_fact: - is_valid_bmc_subnet: true - when: bmc_subnet_bits and bmc_subnet_bits|length == 4 - no_log: true - -- name: Validate InfiniBand subnet - ansible.builtin.set_fact: - is_valid_ib_subnet: true - when: ib_subnet_bits and ib_subnet_bits|length == 4 - no_log: true - -- name: Validate pod external IP range - ansible.builtin.set_fact: - is_valid_pod_external_ip_range: true - when: pod_external_ip_start_bits and pod_external_ip_start_bits|length == 4 and pod_external_ip_end_bits and pod_external_ip_end_bits|length == 4 - no_log: true - -- name: Initialize admin_network_static_range - ansible.builtin.set_fact: - import_input_parameters_admin_network_static_range: > - {{ - admin_subnet_bits[0] ~ '.' ~ - admin_subnet_bits[1] ~ '.' ~ - ip_start_bits[2] ~ '.' ~ - ip_start_bits[3] ~ '-' ~ - admin_subnet_bits[0] ~ '.' ~ - admin_subnet_bits[1] ~ '.' ~ - ip_end_bits[2] ~ '.' ~ - last_bit_minus_one - }} - when: is_valid_admin_subnet and is_valid_ip_range - no_log: true - -- name: Initialize admin_network_dynamic_range - ansible.builtin.set_fact: - import_input_parameters_admin_network_dynamic_range: > - {{ - admin_subnet_bits[0] ~ '.' ~ - admin_subnet_bits[1] ~ '.' ~ - admin_network_dynamic_range_third_octet ~ '.' ~ - ip_start_bits[3] ~ '-' ~ - admin_subnet_bits[0] ~ '.' ~ - admin_subnet_bits[1] ~ '.' ~ - admin_network_dynamic_range_third_octet ~ '.' ~ - adm_dynamic_end_range_fourth_octet - }} - when: is_valid_admin_subnet and is_valid_ip_range - no_log: true - -- name: Initialize admin_network_admin_uncorrelated_node_start_ip - ansible.builtin.set_fact: - import_input_parameters_uncorrelated_node_start_ip: > - {{ - admin_subnet_bits[0] ~ '.' ~ - admin_subnet_bits[1] ~ '.' ~ - ip_start_bits[2] ~ '.' ~ - node_start_ip_fourth_octet - }} - when: is_valid_admin_subnet and is_valid_ip_range - no_log: true - -- name: Initialize netmask bits - ansible.builtin.set_fact: - import_input_parameters_bmc_network_netmask_bits: "16" - when: is_valid_bmc_subnet - -- name: Initialize bmc_network_static_range - ansible.builtin.set_fact: - import_input_parameters_bmc_network_static_range: > - {{ - bmc_subnet_bits[0] ~ '.' ~ - bmc_subnet_bits[1] ~ '.' ~ - ip_start_bits[2] ~ '.' ~ - ip_start_bits[3] ~ '-' ~ - bmc_subnet_bits[0] ~ '.' ~ - bmc_subnet_bits[1] ~ '.' ~ - ip_end_bits[2] ~ '.' 
~ - last_bit_minus_one - }} - when: is_valid_bmc_subnet and is_valid_ip_range - no_log: true - -- name: Initialize bmc_network_dynamic_range - ansible.builtin.set_fact: - import_input_parameters_bmc_network_dynamic_range: > - {{ - bmc_subnet_bits[0] ~ '.' ~ - bmc_subnet_bits[1] ~ '.' ~ - dynamic_range_third_octet ~ '.' ~ - ip_start_bits[3] ~ '-' ~ - bmc_subnet_bits[0] ~ '.' ~ - bmc_subnet_bits[1] ~ '.' ~ - dynamic_range_third_octet ~ '.' ~ - last_bit_minus_one - }} - when: is_valid_bmc_subnet and is_valid_ip_range - no_log: true - -# - name: Initialize admin_network_network_gateway -# ansible.builtin.set_fact: -# import_input_parameters_admin_network_network_gateway: > -# {{ -# admin_subnet_bits[0] ~ '.' ~ -# admin_subnet_bits[1] ~ '.' ~ -# ip_start_bits[2] ~ '.' ~ -# last_bit_minus_one -# }} -# when: is_valid_admin_subnet and is_valid_ip_range -# no_log: true - -# - name: Initialize bmc_network_network_gateway -# ansible.builtin.set_fact: -# import_input_parameters_bmc_network_network_gateway: > -# {{ -# bmc_subnet_bits[0] ~ '.' ~ -# bmc_subnet_bits[1] ~ '.' ~ -# ip_start_bits[2] ~ '.' ~ -# last_bit_minus_one -# }} -# when: is_valid_bmc_subnet and is_valid_ip_range -# no_log: true - -- name: Initialize bmc_network_discover_range - ansible.builtin.set_fact: - import_input_parameters_bmc_network_discover_range: "{{ bmc_static_start_range ~ '-' ~ bmc_static_end_range }}" - when: bmc_static_start_range and bmc_static_end_range - no_log: true - -- name: Initialize ib_network1_static_range - ansible.builtin.set_fact: - import_input_parameters_ib_network1_static_range: > - {{ - ib_subnet_bits[0] ~ '.' ~ - ib_subnet_bits[1] ~ '.' ~ - ip_start_bits[2] ~ '.' ~ - ip_start_bits[3] ~ '-' ~ - ib_subnet_bits[0] ~ '.' ~ - ib_subnet_bits[1] ~ '.' ~ - ip_end_bits[2] ~ '.' ~ - ip_end_bits[3] - }} - when: ib_subnet_bits and is_valid_ib_subnet - no_log: true - -- name: Initialize pod_external_ip_range - ansible.builtin.set_fact: - import_input_parameters_pod_external_ip_range: > - {{ - admin_subnet_bits[0] ~ '.' ~ - admin_subnet_bits[1] ~ '.' ~ - pod_external_ip_start_bits[2] ~ '.' ~ - pod_external_ip_start_bits[3] ~ '-' ~ - admin_subnet_bits[0] ~ '.' ~ - admin_subnet_bits[1] ~ '.' ~ - pod_external_ip_end_bits[2] ~ '.' 
~ - pod_external_ip_end_bits[3] - }} - when: is_valid_admin_subnet and is_valid_pod_external_ip_range - no_log: true - -- name: Trim strings - ansible.builtin.set_fact: - import_input_parameters_admin_network_static_range: "{{ import_input_parameters_admin_network_static_range | trim }}" - import_input_parameters_bmc_network_static_range: "{{ import_input_parameters_bmc_network_static_range | trim }}" - import_input_parameters_admin_network_network_gateway: "{{ import_input_parameters_admin_network_network_gateway | trim }}" - import_input_parameters_bmc_network_network_gateway: "{{ import_input_parameters_bmc_network_network_gateway | trim }}" - import_input_parameters_bmc_network_discover_range: "{{ import_input_parameters_bmc_network_discover_range | trim }}" - import_input_parameters_ib_network1_static_range: "{{ import_input_parameters_ib_network1_static_range | trim }}" - import_input_parameters_pod_external_ip_range: "{{ import_input_parameters_pod_external_ip_range | trim }}" - import_input_parameters_admin_network_dynamic_range: "{{ import_input_parameters_admin_network_dynamic_range | trim }}" - import_input_parameters_uncorrelated_node_start_ip: "{{ import_input_parameters_uncorrelated_node_start_ip | trim }}" - import_input_parameters_bmc_network_dynamic_range: "{{ import_input_parameters_bmc_network_dynamic_range | trim }}" diff --git a/upgrade/roles/import_input_parameters/tasks/sync_files_to_omnia_version.yml b/upgrade/roles/import_input_parameters/tasks/sync_files_to_omnia_version.yml new file mode 100644 index 000000000..e000c515c --- /dev/null +++ b/upgrade/roles/import_input_parameters/tasks/sync_files_to_omnia_version.yml @@ -0,0 +1,32 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Move the new input files to the source input directory for {{ upgrade_omnia_version }} + ansible.posix.synchronize: + mode: push + src: "{{ read_input_folder_backup_location }}/input/." + dest: "{{ role_path }}/../../../input/" + rsync_opts: + - "-a" + - "-v" + register: mv_result + ignore_errors: true + changed_when: false + +- name: Remove the source directory if it still exists and is empty + ansible.builtin.file: + path: "{{ read_input_folder_backup_location }}" + state: absent + when: mv_result is succeeded # Only attempt to remove if the move was successful diff --git a/upgrade/roles/import_input_parameters/templates/1.7/k8s_access_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/k8s_access_config.j2 new file mode 100644 index 000000000..780f2d5d8 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/1.7/k8s_access_config.j2 @@ -0,0 +1,24 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +#*********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +#*********************************************************************** + +# This variable accepts the usernames for which k8s access needs to be setup +# Eg1. user_name: "user1" +# Eg2. user_name: "user1,user2,user3" +user_name: "{{ k8s_access_config_ns.user_name }}" # Value auto populated by Omnia upgrade script diff --git a/upgrade/roles/import_input_parameters/templates/1.7/local_repo_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/local_repo_config.j2 new file mode 100644 index 000000000..6e7f44112 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/1.7/local_repo_config.j2 @@ -0,0 +1,158 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +#*********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. +#*********************************************************************** + +# Mandatory +# All the offline repository data for omnia stack will be stored at this path. +# Please make sure assigned partition has enough space. +# Ensure 755 permission is given to repo_store_path if user intends to use nfs share mount for repo_store_path +# Default: /omnia_repo +repo_store_path: "{{ local_repo_config_ns.repo_store_path }}" # Value auto populated by Omnia upgrade script + +# Optional +# This variable accepts the repository urls of the user which contains the packages required for the cluster. +# always: In this case, Omnia creates a local repo on the Omnia Infrastructure Manager hosting all the packages required for the cluster. +# User should make sure required disk space is available. +# partial: In this case, Omnia creates a local repo on the Omnia Infrastructure Manager hosting packages not part of user's repository. +# never: In this case, Omnia does not create a local repo. All the packages are directly downloaded on the cluster. +# This variable accepts repo url and gpgkey +# url: defines the baseurl for the repository +# gpgkey: defines gpgkey for the repository +# If gpgkey is empty then gpgcheck will be disabled for that repository +# This variable should not have jinja variables in it. +# In Ubuntu OS, it is mandatory to provide gpgkey for the user repositories. 
+# and user leaves the gpgkey field empty (gpgkey: "" ), omnia configures the repository as a trusted source and the user is solely
+# responsible for maintaining the security.
+# Example:
+# user_repo_url:
+#   - {url: "http://user_repo.com/x86_64/os/",gpgkey: "http://user_repo.com/x86_64/os/RPM-GPG-KEY"}
+user_repo_url: # Value auto populated by Omnia upgrade script
+{% if local_repo_config_ns.user_repo_url %}
+{% for item in local_repo_config_ns.user_repo_url %}
+{% if item.url is not none and item.gpgkey is not none %}
+  - { url: "{{ item.url }}", gpgkey: "{{ item.gpgkey }}" }
+{% endif %}
+{% endfor %}
+{% endif %}
+
+# Optional
+# This variable accepts the registry url along with port of the user which contains the images required for cluster.
+# always: In this case, Omnia creates a local registry on the Omnia Infrastructure Manager hosting all the images required for the cluster.
+# User should make sure required disk space is available.
+# partial: In this case, Omnia creates a local registry on the Omnia Infrastructure Manager hosting images not part of user's registry.
+# never: In this case, Omnia does not create a local registry. All the images are directly downloaded on the cluster.
+# This variable accepts host and cert_path
+# host: defines the url and port for registry
+# cert_path: defines the absolute path of the certificates for each registry.
+# If cert_path is empty, insecure registry will be configured.
+# Example:
+# user_registry:
+#   - { host: 10.11.0.100:5001, cert_path: "/home/ca.crt" }
+#   - { host: registryhostname.registry.test, cert_path: "" }
+user_registry: # Value auto populated by Omnia upgrade script
+{% if local_repo_config_ns.user_registry %}
+{% for item in local_repo_config_ns.user_registry %}
+{% if item.host is not none and item.cert_path is not none %}
+  - { host: {{ item.host }} , cert_path: "{{ item.cert_path }}" }
+{% endif %}
+{% endfor %}
+{% endif %}
+
+# Mandatory when cluster_os_type is ubuntu in softwares_config.json
+# This variable will be ignored when cluster_os_type is rhel or rocky
+# This variable defines the repos to be configured on all the compute nodes
+# When repo_config is always, partial or never, the given ubuntu_os_url is configured via proxy on the compute nodes
+# Online ubuntu_os_url for Ubuntu 22.04 is http://in.archive.ubuntu.com/ubuntu
+# Online ubuntu_os_url for Ubuntu 20.04 is http://archive.ubuntu.com/ubuntu
+# Example:
+# When cluster_os_type is Ubuntu 22.04
+# ubuntu_os_url: "http://in.archive.ubuntu.com/ubuntu"
+ubuntu_os_url: {{ local_repo_config_ns.ubuntu_os_url }} # Value auto populated by Omnia upgrade script
+
+# Mandatory when cluster_os_type is rhel in softwares_config.json
+# This variable will be ignored when cluster_os_type is ubuntu or rocky
+# User has to provide the code ready builder url that should not have a RedHat subscription authentication in order to download the packages
+# When repo_config is always, the given rhel_os_url will be configured in the Omnia Infrastructure Manager and packages required for cluster will be downloaded
+# When repo_config is partial or never, the packages required for the cluster that come from rhel_os_url will not be downloaded.
+# and the rhel_os_url is configured via proxy on the compute nodes
+# Example:
+# rhel_os_url:
+#   - {url: "http://crb.com/CRB/x86_64/os/", gpgkey: "http://crb.com/CRB/x86_64/os/RPM-GPG-KEY"}
+rhel_os_url: # Value auto populated by Omnia upgrade script
+{% if local_repo_config_ns.rhel_os_url %}
+{% for item in local_repo_config_ns.rhel_os_url %}
+{% if item.url is not none and item.gpgkey is not none %}
+  - { url: "{{ item.url }}", gpgkey: "{{ item.gpgkey }}" }
+{% endif %}
+{% endfor %}
+{% endif %}
+
+### ADVANCE CONFIGURATIONS FOR LOCAL REPO ###
+# Mandatory
+# This variable defines all the repo urls from where rpms will be downloaded for omnia features when cluster_os_type is rhel.
+# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously.
+# 'url' defines the baseurl for the repository
+# 'gpgkey' defines gpgkey for the repository
+# If 'gpgkey' is kept empty then gpgcheck=0 for that repository
+omnia_repo_url_rhel: # Value auto populated by Omnia upgrade script
+  - { url: "https://download.docker.com/linux/centos/$releasever/$basearch/stable", gpgkey: "https://download.docker.com/linux/centos/gpg" }
+  - { url: "https://repo.radeon.com/rocm/rhel8/{{ local_repo_config_ns.rocm_version | default('{{ rocm_version }}') }}/main", gpgkey: "https://repo.radeon.com/rocm/rocm.gpg.key" }
+  - { url: "https://download.fedoraproject.org/pub/epel/8/Everything/$basearch", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" }
+  - { url: "https://repo.radeon.com/amdgpu/{{ local_repo_config_ns.amdgpu_version | default('{{ amdgpu_version }}')}}/rhel/{{ local_repo_config_ns.cluster_os_version | default('{{ cluster_os_version }}')}}/main/x86_64", gpgkey: "https://repo.radeon.com/rocm/rocm.gpg.key" }
+  - { url: "https://www.beegfs.io/release/beegfs_{{ local_repo_config_ns.beegfs_version| default('{{ beegfs_version }}')}}/dists/rhel8", gpgkey: "https://www.beegfs.io/release/beegfs_{{ local_repo_config_ns.beegfs_version | default('{{ beegfs_version }}')}}/gpg/GPG-KEY-beegfs" }
+  - { url: "https://yum.repos.intel.com/oneapi", gpgkey: "https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" }
+  - { url: "https://ltb-project.org/rpm/openldap25/$releasever/$basearch", gpgkey: ""}
+  - { url: "https://nvidia.github.io/libnvidia-container/stable/rpm/$basearch", gpgkey: "https://nvidia.github.io/libnvidia-container/gpgkey"}
+  - { url: "https://a2o.github.io/snoopy-packages/repo/centos/8/stable/", gpgkey: ""}
+
+# Mandatory
+# This variable defines all the repo urls from where rpms will be downloaded for omnia features when cluster_os_type is rocky.
+# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously.
+# 'url' defines the baseurl for the repository +# 'gpgkey' defines gpgkey for the repository +# If 'gpgkey' is kept empty then gpgcheck=0 for that repository +omnia_repo_url_rocky: # Value auto populated by Omnia upgrade script + - { url: "https://download.docker.com/linux/centos/$releasever/$basearch/stable", gpgkey: "https://download.docker.com/linux/centos/gpg" } + - { url: "https://repo.radeon.com/rocm/rhel8/{{ local_repo_config_ns.rocm_version | default('{{ rocm_version }}') }}/main", gpgkey: "https://repo.radeon.com/rocm/rocm.gpg.key" } + - { url: "https://download.fedoraproject.org/pub/epel/8/Everything/$basearch", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" } + - { url: "https://repo.radeon.com/amdgpu/{{ local_repo_config_ns.amdgpu_version | default('{{ amdgpu_version }}')}}/rhel/{{ local_repo_config_ns.cluster_os_version | default('{{ cluster_os_version }}')}}/main/x86_64", gpgkey: "https://repo.radeon.com/rocm/rocm.gpg.key" } + - { url: "https://www.beegfs.io/release/beegfs_{{ local_repo_config_ns.beegfs_version | default('{{ beegfs_version }}')}}/dists/rhel8", gpgkey: "https://www.beegfs.io/release/beegfs_{{ local_repo_config_ns.beegfs_version | default('{{ beegfs_version }}')}}/gpg/GPG-KEY-beegfs" } + - { url: "https://yum.repos.intel.com/oneapi", gpgkey: "https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" } + - { url: "https://ltb-project.org/rpm/openldap25/$releasever/$basearch", gpgkey: ""} + - { url: "http://dl.rockylinux.org/$contentdir/$releasever/PowerTools/$basearch/os/", gpgkey: ""} + - { url: "https://nvidia.github.io/libnvidia-container/stable/rpm/$basearch", gpgkey: "https://nvidia.github.io/libnvidia-container/gpgkey"} + - { url: "https://a2o.github.io/snoopy-packages/repo/centos/8/stable/", gpgkey: ""} + +# Mandatory +# This variable defines all the repo urls from where deb packages will be downloaded for omnia features when cluster_os_type is ubuntu. +# Making incorrect changes to this variable can cause omnia failure. Please edit cautiously. 
+# 'url' defines the baseurl for the repository
+# 'gpgkey' defines gpgkey for the repository
+# If 'gpgkey' is kept empty then gpgcheck=0 for that repository
+# 'publickey' defines publickey for the repository, if 'gpgkey' is not available for that repository
+omnia_repo_url_ubuntu: # Value auto populated by Omnia upgrade script
+  - { url: "https://download.docker.com/linux/ubuntu {{ local_repo_config_ns.os_release | default('{{ os_release }}')}} stable", gpgkey: "https://download.docker.com/linux/ubuntu/gpg" }
+  - { url: "https://repo.radeon.com/rocm/apt/{{ local_repo_config_ns.rocm_version | default('{{ rocm_version }}') }} {{ local_repo_config_ns.os_release | default('{{ os_release }}')}} main", gpgkey: "https://repo.radeon.com/rocm/rocm.gpg.key" }
+  - { url: "https://www.beegfs.io/release/beegfs_{{ local_repo_config_ns.beegfs_version | default('{{ beegfs_version }}')}} {{ local_repo_config_ns.os_release | default('{{ os_release }}') }} non-free", gpgkey: "https://www.beegfs.io/release/beegfs_{{ local_repo_config_ns.beegfs_version | default('{{ beegfs_version }}')}}/gpg/GPG-KEY-beegfs" }
+  - { url: "https://repo.radeon.com/amdgpu/{{ local_repo_config_ns.amdgpu_version | default('{{ amdgpu_version }}')}}/ubuntu {{ local_repo_config_ns.os_release | default('{{ os_release }}')}} main", gpgkey: "https://repo.radeon.com/rocm/rocm.gpg.key" }
+  - { url: "https://ltb-project.org/debian/openldap25/{{ local_repo_config_ns.openldap_release | default('{{ openldap_release }}')}} {{ local_repo_config_ns.openldap_release | default('{{ openldap_release }}')}} main", publickey: "https://ltb-project.org/documentation/_static/RPM-GPG-KEY-LTB-project" }
+  - { url: "https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /", gpgkey: "https://nvidia.github.io/libnvidia-container/gpgkey" }
+  - { url: "http://ppa.launchpad.net/deadsnakes/ppa/ubuntu {{ local_repo_config_ns.os_release | default('{{ os_release }}')}} main", gpgkey: "" }
+  - { url: "https://a2o.github.io/snoopy-packages/repo/ubuntu {{ local_repo_config_ns.os_release | default('{{ os_release }}')}} stable", publickey: "https://a2o.github.io/snoopy-packages/snoopy-packages-key.pub" }
+  - { url: "https://vault.habana.ai/artifactory/debian {{ local_repo_config_ns.os_release | default('{{ os_release }}')}} main", publickey: "https://vault.habana.ai/artifactory/api/gpg/key/public" }
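For illustration, if the 1.6 backup carried registry entries matching the template's own example, the user_registry block of local_repo_config.j2 above would render along these lines (the hosts are the file's example values; entries whose host or cert_path parsed as None are skipped by the loop's guard):

```yaml
user_registry: # Value auto populated by Omnia upgrade script
  - { host: 10.11.0.100:5001 , cert_path: "/home/ca.crt" }
  - { host: registryhostname.registry.test , cert_path: "" }
```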
diff --git a/upgrade/roles/import_input_parameters/templates/1.7/login_node_security_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/login_node_security_config.j2
new file mode 100644
index 000000000..f4dd3fd0e
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/1.7/login_node_security_config.j2
@@ -0,0 +1,79 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+#***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+#***********************************************************************
+
+# Maximum number of consecutive failures before lockout
+# The default value of this variable can't be changed
+# Default value: 3
+max_failures: 3 # Value auto populated by Omnia upgrade script
+
+# Period (in seconds) after which the number of failed login attempts is reset
+# Default value: 60
+# Min: 30
+# Max: 60
+failure_reset_interval: {{ login_node_security_config_ns.failure_reset_interval | default('60') }} # Value auto populated by Omnia upgrade script
+
+# Period (in seconds) for which users are locked out
+# Default value: 10
+# Min: 5
+# Max: 10
+lockout_duration: {{ login_node_security_config_ns.lockout_duration | default('10') }} # Value auto populated by Omnia upgrade script
+
+# User sessions that have been idle for a specific period can be ended automatically
+# This variable sets session timeout to 3 minutes (180 seconds) by default
+# Min: 90
+# Max: 180
+session_timeout: {{ login_node_security_config_ns.session_timeout | default('180') }} # Value auto populated by Omnia upgrade script
+
+# Email address used for sending alerts in case of authentication failure
+# If this variable is left blank, authentication failure alerts will be disabled.
+# Multiple email addresses can be provided as comma-separated values
+# Example: alert_email_address: "user1@domain.com,user2@domain.com"
+alert_email_address: "{{ login_node_security_config_ns.alert_email_address }}" # Value auto populated by Omnia upgrade script
+
+# This variable will be applicable only when alert_email_address is provided
+# SMTP server details in the cluster used to send email alerts
+# Only a single SMTP server configuration is supported
+# Example:
+# smtp_server:
+#   - { host: "smtp-server.domain.com", port: "25", sender_address: "alert@domain.com" }
+smtp_server:
+  - { host: "", port: "", sender_address: "" }
+
+# This variable mentions the users to whom the access will be provided
+# format of user shall be username@ip or username
+# Ex1- root@1.2.3.4 Ex2- root Ex3- root@1.2.3.4 root (if multiple users, provide space-separated values) by default empty
+user: "{{ login_node_security_config_ns.user }}" # Value auto populated by Omnia upgrade script
+
+# This variable provides the type of access
+# Accepted values: "allow" or "deny"
+# Default value: "allow"
+# Make sure AllowUsers or DenyUsers entries on sshd configuration file are not commented
+allow_deny: "{{ login_node_security_config_ns.allow_deny | default('allow') }}" # Value auto populated by Omnia upgrade script
+
+# This variable is used to disable services.
+# Accepted values: "true" or "false".
+# Default value: false
+# Root access is needed.
+restrict_program_support: {{ login_node_security_config_ns.restrict_program_support | default('false') | lower | regex_replace('\"', '') }} # Value auto populated by Omnia upgrade script
+
+# The below-mentioned services can be disabled, by adding values in comma separated values format for restrict_softwares variable
+# Services: telnet,lpd,bluetooth,rlogin,rexec
+# Ex: restrict_softwares: "telnet,lpd,bluetooth" ( This disables 3 services, to disable more services, add services with comma separation.
) +restrict_softwares: "{{ login_node_security_config_ns.restrict_softwares }}" # Value auto populated by Omnia upgrade script diff --git a/upgrade/roles/import_input_parameters/templates/network_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/network_config.j2 similarity index 79% rename from upgrade/roles/import_input_parameters/templates/network_config.j2 rename to upgrade/roles/import_input_parameters/templates/1.7/network_config.j2 index a6bbebd2b..87f7f8a17 100644 --- a/upgrade/roles/import_input_parameters/templates/network_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/1.7/network_config.j2 @@ -21,15 +21,15 @@ # Absolute path to local copy of .tgz file containing mlnx_ofed package. # The package can be downloaded from https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ # Optional variable. -mlnx_ofed_offline_path: "{{ mlnx_ofed_offline_path }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +mlnx_ofed_offline_path: "{{ network_config_ns.mlnx_ofed_offline_path }}" # Value auto populated by Omnia upgrade script # If mlnx_ofed_offline_path is not given, declaring this variable is mandatory. # The mlnx_ofed package is downloaded as per version mentioned in this variable. # Default value: 5.4-2.4.1.3 -mlnx_ofed_version: "{{ mlnx_ofed_version }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +mlnx_ofed_version: {{ network_config_ns.mlnx_ofed_version }} # Value auto populated by Omnia upgrade script # Set this variable to true if kernel version currently available on compute nodes is # not compatible with mlnx_ofed version in use. # Mandatory variable # Default value: true -mlnx_ofed_add_kernel_support: {{ mlnx_ofed_add_kernel_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +mlnx_ofed_add_kernel_support: {{ network_config_ns.mlnx_ofed_add_kernel_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script diff --git a/upgrade/roles/import_input_parameters/templates/1.7/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/1.7/network_spec.j2 new file mode 100644 index 000000000..ef7e7d996 --- /dev/null +++ b/upgrade/roles/import_input_parameters/templates/1.7/network_spec.j2 @@ -0,0 +1,71 @@ +--- +Networks: +{% set network_types = [] %} +{% for network in network_spec_ns.Networks %} +{% for network_type, details in network.items() %} +{% if network_type not in network_types %} +{% set _ = network_types.append(network_type) %} +{% endif %} +{% if network_type == 'admin_network' %} + - {{ network_type }}: + nic_name: "{{ details.nic_name }}" + netmask_bits: "{{ details.netmask_bits }}" + static_range: "{{ details.static_range }}" + dynamic_range: "{{ details.dynamic_range }}" + correlation_to_admin: {{ details.correlation_to_admin | lower }} + admin_uncorrelated_node_start_ip: "{{ details.admin_uncorrelated_node_start_ip }}" + network_gateway: "{{ details.network_gateway }}" + DNS: "{{ details.DNS }}" + MTU: "{{ details.MTU }}" + +{% elif network_type == 'bmc_network' %} + - {{ network_type }}: + nic_name: "{{ details.nic_name }}" + netmask_bits: "{{ details.netmask_bits }}" + static_range: "{{ details.static_range }}" + dynamic_range: "{{ details.dynamic_range }}" + reassignment_to_static: {{ details.reassignment_to_static | lower }} + discover_ranges: "{{ details.discover_ranges }}" + network_gateway: "{{ details.network_gateway }}" + MTU: "{{ details.MTU }}" + +{% elif network_type != 'admin_network' and network_type != 
'bmc_network' %}
+  - {{ network_type }}:
+{% if details.CIDR is defined %}
+      netmask_bits: "{{ details.netmask_bits }}"
+      CIDR: "{{ details.CIDR }}"
+      network_gateway: "{{ details.network_gateway }}"
+      MTU: "{{ details.MTU }}"
+{% elif details.static_range is defined %}
+      netmask_bits: "{{ details.netmask_bits }}"
+      static_range: "{{ details.static_range }}"
+      network_gateway: "{{ details.network_gateway }}"
+      MTU: "{{ details.MTU }}"
+{% endif %}
+{% if details.VLAN is defined %}
+      VLAN: "{{ details.VLAN }}"
+{% endif %}
+{% endif %}
+{% endfor %}
+{% endfor %}
+
+{% if network_types|length == 2 and 'admin_network' in network_types and 'bmc_network' in network_types %}
+#**********************************************************************
+# Following are the templates for providing additional network details
+# If vlan creation is required ensure vlan name is provided in the format NIC.vlan_id(eth1.101) in server_spec.yml
+#**********************************************************************
+
+# - nic_network1:
+#     netmask_bits: "20"
+#     CIDR: "10.10.16.0"
+#     network_gateway: ""
+#     MTU: "1500"
+#     VLAN: ""
+#
+# - nic_network2:
+#     netmask_bits: "20"
+#     static_range: "10.10.1.1-10.10.15.254"
+#     network_gateway: ""
+#     MTU: "1500"
+#     VLAN: "1"
+{% endif %}
diff --git a/upgrade/roles/import_input_parameters/templates/1.7/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/omnia_config.j2
new file mode 100644
index 000000000..fa020ad14
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/1.7/omnia_config.j2
@@ -0,0 +1,101 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# ***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+# ***********************************************************************
+
+# Path to directory hosting ansible config file (ansible.cfg file)
+# Default value is "/etc/ansible"
+# This directory is on the host running ansible, if ansible is installed using dnf
+# If ansible is installed using pip, this path should be set
+ansible_config_file_path: "{{ omnia_config_ns.ansible_config_file_path }}" # Value auto populated by Omnia upgrade script
+
+# -----------------------------SLURM------------------------------------------------
+
+# Password used for Slurm database.
+# The Length of the password should be at least 8.
+# The password must not contain -,\, ',"
+mariadb_password: "{{ omnia_config_ns.mariadb_password }}" # Value auto populated by Omnia upgrade script
+
+# This variable accepts whether slurm installation is supported in configless mode or slurm in nfs
+# Default value is "configless"
+# If the value is "nfs_share", then share_path has to be mentioned
+# Slurm should be installed in share_path when slurm_installation_type is set to nfs_share
+slurm_installation_type: "{{ omnia_config_ns.slurm_installation_type }}" # Value auto populated by Omnia upgrade script
+
+# Variable indicates whether slurm control node services (slurmctld) should be restarted or not
+# If restart_slurm_services is set to true, slurmctld services will be restarted on every execution of omnia.yml.
+# It accepts true and false values
+# Default value is true
+restart_slurm_services: {{ omnia_config_ns.restart_slurm_services | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script
+
+#----------------------------K8S------------------------------------------------------
+
+# Kubernetes SDN network.
+# It can either be "calico" or "flannel".
+# Default value assigned is "calico".
+# While setting up Kubernetes plugin for RoCE NIC, ensure that this value is set to "flannel"
+k8s_cni: "{{ omnia_config_ns.k8s_cni }}" # Value auto populated by Omnia upgrade script
+
+# These addresses will be used by Loadbalancer for assigning External IPs to K8s services
+# Make sure the IP range is not assigned to any node in the cluster.
+# Acceptable formats: "10.11.0.100-10.11.0.150" , "10.11.0.0/16"
+# Mandatory Field
+pod_external_ip_range: "{{ omnia_config_ns.pod_external_ip_range }}" # Value auto populated by Omnia upgrade script
+
+
+###------ADVANCE CONFIGURATIONS FOR KUBERNETES------
+# Kubernetes internal network for services.
+# This network must be unused in your network infrastructure.
+# Default value is "10.233.0.0/18"
+k8s_service_addresses: "{{ omnia_config_ns.k8s_service_addresses }}" # Value auto populated by Omnia upgrade script
+
+# Kubernetes pod network CIDR for internal network. When used, it will assign IP
+# addresses from this range to individual pods.
+# This network must be unused in your network infrastructure.
+# Default value is "10.233.64.0/18"
+k8s_pod_network_cidr: "{{ omnia_config_ns.k8s_pod_network_cidr }}" # Value auto populated by Omnia upgrade script
+
+# Kubernetes Topology manager policies.
+# It can either be "none" or "best-effort" or "restricted" or "single-numa-node".
+# Default value assigned is "none".
+topology_manager_policy: "none"
+
+# Kubernetes Topology manager scope.
+# It can either be "container" or "pod".
+# Default value assigned is "container".
+topology_manager_scope: "container"
+
+#----------------------------VERIFY INTEL GAUDI INSTALLATION------------------------------------------------------
+
+# It's recommended to run the extensive hl_qual and hccl tests when installing a new Gaudi node. This takes around 20 minutes.
+# To run them during provisioning, set this variable to true.
+run_intel_gaudi_tests: false # Value auto populated by Omnia upgrade script
+
+#----------------------------CSI Driver------------------------------------------------------
+# Following csi powerscale driver input variables are mandatory only if csi_driver_powerscale entry is present in software_config.json
+
+# Absolute file path for the secret.yaml file.
+# Users need to download the secret.yaml file and fill in the required data. Provide the path of the secret file here.
+
+csi_powerscale_driver_secret_file_path: ""
+
+# File path for the values.yaml file which will contain the Powerscale driver configuration parameters.
+# Users need to download the values.yaml file and fill in the required (configurable) values. Provide the path of the values.yaml file here.
+csi_powerscale_driver_values_file_path: ""
+
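Throughout these templates, boolean-ish values read back from the old input files are coerced into bare YAML booleans with the `| lower | regex_replace` chain used above for restart_slurm_services. A minimal sketch of that filter chain in isolation (the playbook and variable names here are hypothetical):

```yaml
# bool_render_sketch.yml - demonstrates the filter chain used by these templates
- hosts: localhost
  connection: local
  gather_facts: false
  vars:
    raw_value: '"True"'   # a quoted boolean, as it can arrive from the parsed 1.6 backup
  tasks:
    - name: Show the bare boolean token the template would emit
      ansible.builtin.debug:
        msg: "{{ raw_value | lower | regex_replace('[\"\\']', '') }}"   # renders: true
```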
diff --git a/upgrade/roles/import_input_parameters/templates/1.7/passwordless_ssh_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/passwordless_ssh_config.j2
new file mode 100644
index 000000000..642e715b4
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/1.7/passwordless_ssh_config.j2
@@ -0,0 +1,29 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+#***********************************************************************
+# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
+# SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
+#***********************************************************************
+
+# This variable accepts the user name for which passwordless ssh needs to be set up
+# Eg. user_name: "user1,user2,user3"
+user_name: "{{ passwordless_ssh_config_ns.user_name }}" # Value auto populated by Omnia upgrade script
+
+# Variable indicating whether FreeIPA or LDAP is set up
+# It can be "freeipa" or "ldap"
+# Default value: ldap
+authentication_type: "{{ passwordless_ssh_config_ns.authentication_type | default('ldap') }}" # Value auto populated by Omnia upgrade script
+
diff --git a/upgrade/roles/import_input_parameters/templates/provision_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/provision_config.j2
similarity index 67%
rename from upgrade/roles/import_input_parameters/templates/provision_config.j2
rename to upgrade/roles/import_input_parameters/templates/1.7/provision_config.j2
index ac58c0b35..4ef00dfa4 100644
--- a/upgrade/roles/import_input_parameters/templates/provision_config.j2
+++ b/upgrade/roles/import_input_parameters/templates/1.7/provision_config.j2
@@ -13,10 +13,10 @@
 # limitations under the License.
 ---
 
-#***********************************************************************
+# ***********************************************************************
 # DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
 # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
-#***********************************************************************
+# ***********************************************************************
 
 #### Mandatory
 # Path where user has placed the iso image that needs to be provisioned on target nodes.
@@ -24,7 +24,7 @@
 # Value of iso_file_path should contain cluster_os_type and cluster_os_version values from software_config.json
 # When cluster_os_type is ubuntu and cluster_os_version is 22.04, by default omnia configures generic kernel.
# To customize kernel flavor in ubuntu 22.04, define ubuntu_kernel_flavor as 'hwe' or 'generic' -iso_file_path: "{{ iso_file_path }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +iso_file_path: "{{ provision_config_ns.iso_file_path }}" # Value auto populated by Omnia upgrade script #### Mandatory # Nodes present in mapping file, won't be considered for this node_name. @@ -32,7 +32,7 @@ iso_file_path: "{{ iso_file_path }}" # Value auto populated by Omnia 1.5-1.6 upg # Hostname = node_name + '0000x' + domain_name # Hostname <= 65 characters. # Example: servernode00001.Omnia.test , where node_name=servernode, domain_name=Omnia.test , 00001 used by Omnia -node_name: "{{ node_name }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +node_name: "{{ provision_config_ns.node_name }}" # Value auto populated by Omnia upgrade script #### Mandatory # Domain name the user intends to configure on the cluster. @@ -41,7 +41,7 @@ node_name: "{{ node_name }}" # Value auto populated by Omnia 1.5-1.6 upgrade scr # Hostname <= 65 characters. # Example: servernode00001.Omnia.test ,where node_name=servernode ,domain_name=Omnia.test ,00001 used by Omnia # Examples: abcd.test, ipa.test -domain_name: "{{ domain_name }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +domain_name: "{{ provision_config_ns.domain_name }}" # Value auto populated by Omnia upgrade script #### Optional # This depicts the path where user has kept the PXE mapping file. @@ -49,7 +49,7 @@ domain_name: "{{ domain_name }}" # Value auto populated by Omnia 1.5-1.6 upgrade # Ensure that admin IPs given in mapping file are within the admin_static_ranges. # A templates for mapping file exists in omnia/examples, namely, pxe_mapping_file.csv # Format of csv: SERVICE_TAG,ADMIN_MAC,HOSTNAME,ADMIN_IP,BMC_IP -pxe_mapping_file_path: "{{ pxe_mapping_file_path }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +pxe_mapping_file_path: "{{ provision_config_ns.pxe_mapping_file_path }}" # Value auto populated by Omnia upgrade script #### Mandatory # Variable indicates whether switch based discovery should be enabled to discover the nodes @@ -62,7 +62,7 @@ pxe_mapping_file_path: "{{ pxe_mapping_file_path }}" # Value auto populated by O # For enabling bmc discovery, set enable_switch_based to false and provide bmc_network details in network_spec.yml # It accepts true and false values # Default: false -enable_switch_based: {{ import_input_parameters_enable_switch_based | lower | regex_replace('["\']', '') }} +enable_switch_based: {{ provision_config_ns.enable_switch_based | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script #### Mandatory, when enabled_switch_based is set to true # Ports indicates the port no. 
where the servers are connected on switch @@ -72,9 +72,9 @@ enable_switch_based: {{ import_input_parameters_enable_switch_based | lower | re # Example2 with 2 switches: # - { ip: 172.96.28.12, ports: '1-48,49:3,50' } # - { ip: 172.96.28.14, ports: '1,2,3,5' } -switch_based_details: # Value auto populated by Omnia 1.5-1.6 upgrade script -{% for detail in switch_based_details %} - - { ip: "{{ detail.ip }}", ports: "{{ detail.ports }}" } +switch_based_details: # Value auto populated by Omnia upgrade script +{% for detail in provision_config_ns.switch_based_details %} + - { ip: {{ detail.ip }} , ports: '{{ detail.ports }}' } {% endfor %} #### Optional @@ -86,22 +86,22 @@ switch_based_details: # Value auto populated by Omnia 1.5-1.6 upgrade script # Example for disk_partition: # disk_partition: # - { mount_point: "/var", desired_capacity: "102400" } -disk_partition: # Value auto populated by Omnia 1.5-1.6 upgrade script -{% for detail in disk_partition %} +disk_partition: # Value auto populated by Omnia upgrade script +{% for detail in provision_config_ns.disk_partition %} - { mount_point: "{{ detail.mount_point }}", desired_capacity: "{{ detail.desired_capacity }}" } {% endfor %} #### Mandatory # Timezone that needs to be set during OS provisioning. -# Available timezones are provided in provision/roles/provision_validation/files/timezone.txt +# Available timezones are provided in discovery/roles/discovery_validations/common/files/timezone.txt # Default: "GMT" # Few accepted values: EST,CET,MST,CST6CDT,PST8PDT -timezone: "{{ timezone }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +timezone: "{{ provision_config_ns.timezone }}" # Value auto populated by Omnia upgrade script #### Mandatory # Language that needs to be set during OS provisioning. # Default language supported is "en-US" -language: "{{ language }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +language: "{{ provision_config_ns.language }}" # Value auto populated by Omnia upgrade script #### Mandatory # Default lease time needs to be used by DHCP @@ -109,5 +109,13 @@ language: "{{ language }}" # Value auto populated by Omnia 1.5-1.6 upgrade scrip # Min: 21600 # Default: 86400 # Max: 31536000 -default_lease_time: "{{ default_lease_time }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +default_lease_time: "{{ provision_config_ns.default_lease_time }}" # Value auto populated by Omnia upgrade script + +#### Mandatory +# The ntp_support variable controls whether the cluster will have a Network Time Protocol (NTP) server configured in the Omnia Infrastructure Manager. +# If ntp_support is set to true, NTP server will be configured in the Omnia Infrastructure Manager and the time will be synchronized to the cluster nodes. +# If ntp_support is set to false, NTP server will not be configured in the Omnia Infrastructure Manager and the time will not be synchronized to the cluster nodes. +# In a proxy environment or environment with restricted network access, setting up NTP server in Omnia Infrastructure Manager can result in failure due to unreachable public NTP pools and is not recommended. 
+# Default: true +ntp_support: true diff --git a/upgrade/roles/import_input_parameters/templates/provision_config_credentials.j2 b/upgrade/roles/import_input_parameters/templates/1.7/provision_config_credentials.j2 similarity index 65% rename from upgrade/roles/import_input_parameters/templates/provision_config_credentials.j2 rename to upgrade/roles/import_input_parameters/templates/1.7/provision_config_credentials.j2 index 7d5827bda..9086bc587 100644 --- a/upgrade/roles/import_input_parameters/templates/provision_config_credentials.j2 +++ b/upgrade/roles/import_input_parameters/templates/1.7/provision_config_credentials.j2 @@ -17,36 +17,36 @@ # Password that needs to be set during OS provisioning for root users. # Length >= 8 characters # Password must not contain -,\, '," -provision_password: "{{ provision_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +provision_password: "{{ provision_config_credentials_ns.provision_password }}" # Value auto populated by Omnia upgrade script #### Mandatory # Password that needs to be set for postgres database. # Length >= 8 characters # Password should contain only alphanumeric characters -postgresdb_password: "{{ postgresdb_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +postgresdb_password: "{{ provision_config_credentials_ns.postgresdb_password }}" # Value auto populated by Omnia upgrade script #### Mandatory # The credentials for idrac # The credentials should be same across all the servers # The credentials must not contain -,\, '," -bmc_username: "{{ bmc_username }}" # Value auto populated by Omnia 1.5-1.6 upgrade script -bmc_password: "{{ bmc_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +bmc_username: "{{ provision_config_credentials_ns.bmc_username }}" # Value auto populated by Omnia upgrade script +bmc_password: "{{ provision_config_credentials_ns.bmc_password }}" # Value auto populated by Omnia upgrade script #### Mandatory, when enabled_switch_based is set to true in provision_config.yml # Non-admin SNMPv3 credentials of the PXE switch # If multiple switches are provided, these credentials should be same across all the switches. # Configuring the switch with SNMPv3 credentials should be done prior to execution of discovery_provision.yml by the user # The credentials must not contain -,\, '," -switch_snmp3_username: "{{ switch_snmp3_username }}" # Value auto populated by Omnia 1.5-1.6 upgrade script -switch_snmp3_password: "{{ switch_snmp3_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +switch_snmp3_username: "{{ provision_config_credentials_ns.switch_snmp3_username }}" # Value auto populated by Omnia upgrade script +switch_snmp3_password: "{{ provision_config_credentials_ns.switch_snmp3_password }}" # Value auto populated by Omnia upgrade script # Username for Dockerhub account # This will be used for Docker login and a kubernetes secret will be created and patched to service account in default namespace. 
 # This kubernetes secret can be used to pull images from private repositories
 # This value is optional but suggested avoiding docker pull limit issues
-docker_username: "{{ docker_username }}" # Value auto populated by Omnia 1.5-1.6 upgrade script
+docker_username: "{{ provision_config_credentials_ns.docker_username }}" # Value auto populated by Omnia upgrade script
 
 # Password for Dockerhub account
 # This will be used for Docker login
 # This value is mandatory if docker username is provided
-docker_password: "{{ docker_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script
+docker_password: "{{ provision_config_credentials_ns.docker_password }}" # Value auto populated by Omnia upgrade script
diff --git a/upgrade/roles/import_input_parameters/templates/1.7/roce_plugin_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/roce_plugin_config.j2
new file mode 100644
index 000000000..22509fa9a
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/1.7/roce_plugin_config.j2
@@ -0,0 +1,43 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# This config file will be used for deploying the k8s roce plugin.
+# Roce plugin is supported only on Ubuntu OS.
+
+
+# This dictionary defines the network interfaces, and their respective IP ranges, which should be assigned inside the pod
+# The number of entries in this dictionary should equal the number of bcm roce interfaces to be used by the roce pod.
+# VLAN nics are not supported for roce plugin deployment
+# The deployed roce pod supports a limit of 8 roce interfaces.
+# name (Mandatory): The name of the interface of the roce nic
+# range (Mandatory): The IP range for this interface which should be assigned inside the roce pod. It is specified in CIDR notation.
+# range_start (Optional): This specifies the starting IP address within the defined range for assigning IPs to network interfaces.
+# range_end (Optional): This specifies the ending IP address within the defined range for assigning IPs to network interfaces.
+# gateway (Optional): This specifies the IP address of the gateway for the network.
+# route (Optional): This specifies additional routing rules for the network interface. Routes determine the paths that packets take to reach specific networks or hosts.
+# Omnia does not validate the inputs, it is the user's responsibility to provide inputs for the required parameters.
+# Eg: If user wants to restrict the IP range, range_start and range_end should be provided with range.
+# Eg2: If gateway is to be set, then user should provide gateway and route along with range.
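+# A filled-in example for reference (the interface name and addresses below are
+# purely illustrative, not defaults):
+# interfaces:
+# - name: eth2
+#   range: 192.168.103.0/24
+#   range_start: 192.168.103.10
+#   range_end: 192.168.103.100
+#   gateway: 192.168.103.1
+#   route: 192.168.0.0/16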
+interfaces: # Value auto populated by Omnia upgrade script +{% for interface in roce_plugin_config_ns.interfaces %} +- name: {{ interface.name }} + range: {{ interface.range }} + range_start: {{ interface.range_start | default('') }} + range_end: {{ interface.range_end | default('') }} + gateway: {{ interface.gateway | default('') }} + route: {{ interface.route | default('') }} +{% endfor %} + + diff --git a/upgrade/roles/import_input_parameters/templates/security_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/security_config.j2 similarity index 74% rename from upgrade/roles/import_input_parameters/templates/security_config.j2 rename to upgrade/roles/import_input_parameters/templates/1.7/security_config.j2 index 0740468f0..24f166c81 100644 --- a/upgrade/roles/import_input_parameters/templates/security_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/1.7/security_config.j2 @@ -25,32 +25,32 @@ # For FreeIPA - the format is omnia.test # For OpenLDAP - if dc=omnia,dc=test, Provide omnia.test # For OpenLDAP - If dc=dell,dc=omnia,dc=com Provide dell.omnia.com -domain_name: "{{ domain_name }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +domain_name: "{{ security_config_ns.domain_name }}" # Value auto populated by Omnia upgrade script # ---------------------------FreeIPA--------------------------- # A Kerberos realm is the domain over which a Kerberos authentication server has the authority to authenticate a user, host or service. # A realm name is often, but not always the upper case version of the name of the DNS domain over which it presides -realm_name: "{{ realm_name }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +realm_name: "{{ security_config_ns.realm_name }}" # Value auto populated by Omnia upgrade script # The directory server operations require an administrative user. # This user is referred to as the Directory Manager and has full access to the Directory for system management tasks # and will be added to the instance of directory server created for IPA. # The password must be at least 8 characters long # The password must not contain -,\, '," -directory_manager_password: "{{ directory_manager_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +directory_manager_password: "{{ security_config_ns.directory_manager_password }}" # Value auto populated by Omnia upgrade script # kerberos_admin_password used by IPA admin user in Rocky OS and used by 389-ds for kerberos admin password in leap OS # The IPA server requires an administrative user, named 'admin'. 
# This user is a regular system account used for IPA server administration # The password must be at least 8 characters long # The password must not contain -,\, '," -kerberos_admin_password: "{{ kerberos_admin_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +kerberos_admin_password: "{{ security_config_ns.kerberos_admin_password }}" # Value auto populated by Omnia upgrade script # ---------------------------OpenLDAP--------------------------- # Connection Type can be TLS, SSL # If TLS is given, valid server certificates are mandatory # If SSL is provided, then secure OpenLDAP connection occurs on the port 636 -ldap_connection_type: "{{ ldap_connection_type }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +ldap_connection_type: "{{ security_config_ns.ldap_connection_type }}" # Value auto populated by Omnia upgrade script # These variables accept Certificate file Paths # Make sure certificate is present in the path provided @@ -59,11 +59,11 @@ ldap_connection_type: "{{ ldap_connection_type }}" # Value auto populated by Omn # These variable are optional i.e. if not provided, self signed certificates will be created # The variables accepts values as follows # Certificate Authority(CA) issued certificate file path -tls_ca_certificate: "{{ ldap_ca_cert_path }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +tls_ca_certificate: "{{ security_config_ns.tls_ca_certificate }}" # Value auto populated by Omnia upgrade script # OpenLDAP Certificate file path -tls_certificate: "{{ import_input_parameters_tls_certificate }}" +tls_certificate: "{{ security_config_ns.tls_certificate }}" # OpenLDAP Certificate key file path -tls_certificate_key: "{{ import_input_parameters_tls_certificate_key }}" +tls_certificate_key: "{{ security_config_ns.tls_certificate_key }}" # OpenLDAP server is configured using three admin users for configuration, database, and monitor # By default the user for configuration and database is admin @@ -72,11 +72,11 @@ tls_certificate_key: "{{ import_input_parameters_tls_certificate_key }}" # Update this variable if these users are different from admin # The password must be at least 8 characters long # The password must not contain -,\, '," -openldap_db_username: "{{ import_input_parameters_openldap_db_username }}" -openldap_db_password: "{{ import_input_parameters_openldap_db_password }}" -openldap_config_username: "{{ import_input_parameters_openldap_config_username }}" -openldap_config_password: "{{ import_input_parameters_tls_certificate }}" -openldap_monitor_password: "{{ import_input_parameters_openldap_monitor_password }}" +openldap_db_username: "{{ security_config_ns.openldap_db_username }}" +openldap_db_password: "{{ security_config_ns.openldap_db_password }}" +openldap_config_username: "{{ security_config_ns.openldap_config_username }}" +openldap_config_password: "{{ security_config_ns.openldap_config_password }}" +openldap_monitor_password: "{{ security_config_ns.openldap_monitor_password }}" # OpenLDAP server is configured using organizations # These organizations and its units are necessary for user creation and group mapping @@ -84,5 +84,5 @@ openldap_monitor_password: "{{ import_input_parameters_openldap_monitor_password # Here user belongs to Omnia organization and is part of People unit # The default values are openldap_organization: "omnia", openldap_organizational_unit: "People" # Variables can be updated based om user requirements -openldap_organization: "{{ import_input_parameters_openldap_organization }}" 
-openldap_organizational_unit: "{{ import_input_parameters_openldap_organizational_unit }}"
+openldap_organization: "{{ security_config_ns.openldap_organization }}"
+openldap_organizational_unit: "{{ security_config_ns.openldap_organizational_unit }}"
diff --git a/upgrade/roles/import_input_parameters/templates/1.7/server_spec.j2 b/upgrade/roles/import_input_parameters/templates/1.7/server_spec.j2
new file mode 100644
index 000000000..9cdeb56b3
--- /dev/null
+++ b/upgrade/roles/import_input_parameters/templates/1.7/server_spec.j2
@@ -0,0 +1,42 @@
+---
+Categories: # Value auto populated by Omnia upgrade script
+{% for category in server_spec_ns.Categories %}
+{% for group, group_content in category.items() %}
+  - {{ group }}:
+{% for item in group_content %}
+      - network:
+{% for network in item.network %}
+{% for network_key, network_content in network.items() %}
+          - {{ network_key }}:
+{% for key, value in network_content.items() %}
+              {{ key }}: "{{ value }}"
+{% endfor %}
+{% endfor %}
+
+{% endfor %}
+{% endfor %}
+{% endfor %}
+{% endfor %}
+
+#**********************************************************************
+# Following are the templates for providing additional network and OS details.
+# Users may include the `os` or `network` sections individually if only one
+# of them needs to be configured, or both together as well.
+#
+# - Use space (' ') as a delimiter in case of multiple parameters for cmdline.
+#**********************************************************************
+#
+# Example for configuring only network settings:
+# Categories:
+#   - group-1:
+#       - network:
+#           - eno1:
+#               nicnetwork: "nic_network1"
+#               nictypes: "ethernet"
+#
+# Example for configuring only OS settings:
+# Categories:
+#   - group-2:
+#       - os:
+#           - kernel:
+#               - cmdline: "iommu=pt intel_iommu=off pci=realloc=off processor.max_cstate=0 intel_idle.max_cstate=0 intel_pstate=disable"
diff --git a/upgrade/roles/import_input_parameters/templates/storage_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/storage_config.j2
similarity index 75%
rename from upgrade/roles/import_input_parameters/templates/storage_config.j2
rename to upgrade/roles/import_input_parameters/templates/1.7/storage_config.j2
index 8d8f470ea..a4eb6f581 100644
--- a/upgrade/roles/import_input_parameters/templates/storage_config.j2
+++ b/upgrade/roles/import_input_parameters/templates/1.7/storage_config.j2
@@ -19,13 +19,14 @@
 #***********************************************************************
 
 # -----------------------------NFS------------------------------------------------
+
 # USER have to mount EXTERNAL NFS server, omnia will mount NFS client when nfs_server value is false.
 # omnia will setup NFS server and mount NFS client when nfs_server value is true.
 # This variable is used for creating NFS share on slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd nodes
 # Values should be entered in JSON format only.
 # If mount_option values are empty, NFS client will be mounted with these values "nosuid,rw,sync,hard,intr"
 # Its mandatory to provide atleast one entry in nfs_client_params
-# If user wants to setup NFS server on control plane "localhost" can be mentioned as server_ip or admin_nic_ip of control plane also can be provided.
+# If user wants to setup NFS server on Omnia Infrastructure Manager, "localhost" can be mentioned as server_ip, or the admin_nic_ip of the Omnia Infrastructure Manager can be provided.
# For the server which must be used as k8s server share for NFS external provisioner must be given k8s_share as true # For the server which must be used as slurm share, slurm_share must be given as true # For benchmarks, either slurm_share or k8s_share will be used. Higher precedence will be given to slurm_share @@ -41,51 +42,48 @@ # - { server_ip: 198.168.0.1, server_share_path: "/mnt/share1", client_share_path: "/mnt/mount1", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false , slurm_share: false, k8s_share: true} # - { server_ip: 198.168.0.2, server_share_path: "/mnt/share2", client_share_path: "/mnt/mount2", client_mount_options: "nosuid,rw,sync,hard", nfs_server: false , slurm_share: true, k8s_share: false} -nfs_client_params: # Value auto populated by Omnia 1.5-1.6 upgrade script -{% for params in nfs_client_params %} +nfs_client_params: # Value auto populated by Omnia upgrade script +{% for params in storage_config_ns.nfs_client_params %} {% if params.server_ip is not none and params.server_share_path is not none and params.client_share_path is not none and params.client_mount_options is not none %} -- { server_ip: "{{ params.server_ip }}", server_share_path: "{{ params.server_share_path }}", client_share_path: "{{ params.client_share_path }}", client_mount_options: "{{ params.client_mount_options }}", nfs_server: false, k8s_share: false, slurm_share: false } +- { server_ip: "{{ params.server_ip }}", server_share_path: "{{ params.server_share_path }}", client_share_path: "{{ params.client_share_path }}", client_mount_options: "{{ params.client_mount_options }}", nfs_server: {{ params.nfs_server | lower | regex_replace('["\']', '') }}, k8s_share: {{ params.k8s_share | lower | regex_replace('["\']', '') }}, slurm_share: {{ params.slurm_share | lower | regex_replace('["\']', '') }} } {% endif %} {% endfor %} -{% if enable_omnia_nfs %} -{% set slurm_present = 'slurm' in scheduler_type.split(',') %} -{% set k8s_present = 'k8s' in scheduler_type.split(',') %} -- { server_ip: localhost, server_share_path: "/mnt/omnia_home_share", client_share_path: "/home", client_mount_options: "nosuid,rw,sync,hard,intr", nfs_server: true, k8s_share: {{ k8s_present }}, slurm_share: {{ slurm_present }} } -{% endif %} + +# ----------------------------BEEGFS--------------------------------------------------- # This variable is used if user has RDMA-capable network hardware (e.g., InfiniBand) # Accepted values: true or false # Default value: false # To enable support for remote direct memory access (RDMA), set this value to true -beegfs_rdma_support: {{ beegfs_rdma_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +beegfs_rdma_support: {{ storage_config_ns.beegfs_rdma_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script # This variable if there are separate OFED kernel modules installed # Add path for kernel module # Default value is "/usr/src/ofa_kernel/default/include" # Please make sure this path is present on your nodes, else give appropriate path before execution -beegfs_ofed_kernel_modules_path: "{{ beegfs_ofed_kernel_modules_path }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +beegfs_ofed_kernel_modules_path: "{{ storage_config_ns.beegfs_ofed_kernel_modules_path }}" # Value auto populated by Omnia upgrade script # This variable is used for taking BeeGFS management server IP # Provide IP of beegfs management server # Required field -beegfs_mgmt_server: "{{ beegfs_mgmt_server }}" # Value 
auto populated by Omnia 1.5-1.6 upgrade script +beegfs_mgmt_server: "{{ storage_config_ns.beegfs_mgmt_server }}" # Value auto populated by Omnia upgrade script # Beegfs-client file system mount location # Default value is "/mnt/beegfs" # Make sure beegfs_unmount_client value is true, for changing BeeGFS mounts location -beegfs_mounts: "{{ beegfs_mounts }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +beegfs_mounts: "{{ storage_config_ns.beegfs_mounts }}" # Value auto populated by Omnia upgrade script # The value of beegfs_unmount_client should be true, if there is a change in beegfs_mounts value or beegfs_client_version # Accepted values: true or false # Default value: false # changing this value to true will unmount running instance of BeeGFS client -beegfs_unmount_client: {{ beegfs_unmount_client | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +beegfs_unmount_client: {{ storage_config_ns.beegfs_unmount_client | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script # For installing different version of BeeGFS client than the version which is already running on the BeeGFS cluster # Accepted values: true or false # Default value: false # If there is a need for installing different version of BeeGFS client then make this variable to true -beegfs_version_change: {{ beegfs_version_change | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +beegfs_version_change: {{ storage_config_ns.beegfs_version_change | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script # Beegfs secret storage file or authentication file # Required for Beegfs version >= 7.2.7 @@ -93,4 +91,4 @@ beegfs_version_change: {{ beegfs_version_change | lower | regex_replace('["\']', # Accepted value: Path along with filename for secret stoarge file configured # on beegfs server # If this is not provided, beegfs installation will fail -beegfs_secret_storage_filepath: "{{ beegfs_secret_storage_filepath }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +beegfs_secret_storage_filepath: "{{ storage_config_ns.beegfs_secret_storage_filepath }}" # Value auto populated by Omnia upgrade script diff --git a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 b/upgrade/roles/import_input_parameters/templates/1.7/telemetry_config.j2 similarity index 58% rename from upgrade/roles/import_input_parameters/templates/telemetry_config.j2 rename to upgrade/roles/import_input_parameters/templates/1.7/telemetry_config.j2 index 9932ed8e6..d5ece2312 100644 --- a/upgrade/roles/import_input_parameters/templates/telemetry_config.j2 +++ b/upgrade/roles/import_input_parameters/templates/1.7/telemetry_config.j2 @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,66 +20,82 @@
 
 # This variable is used to enable iDRAC telemetry support
 # Accepted values: true or false
-idrac_telemetry_support: {{ idrac_telemetry_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script
+idrac_telemetry_support: {{ telemetry_config_ns.idrac_telemetry_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script
 
 # This variable is used to start or stop omnia telemetry
 # Accepted values: true or false
 # If omnia_telemetry_support is true, then at least one of collect_regular_metrics or collect_health_check_metrics or collect_gpu_metrics should be true, to collect metrics
 # If omnia_telemetry_support is false, telemetry acquisition will be stopped
-omnia_telemetry_support: {{ omnia_telemetry_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script
+omnia_telemetry_support: {{ telemetry_config_ns.omnia_telemetry_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script
 
 # This variable is used to enable visualizations on grafana
 # Accepted values: true or false
-visualization_support: {{ visualization_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script
+visualization_support: {{ telemetry_config_ns.visualization_support | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script
 
-##### BELOW VARIABLES ARE MANDATORY IF EITHER idrac_telemetry_support OR omnia_telemetry_support OR visualization_support IS true
-###-----CONFIGURATIONS FOR KUBERNETES------
-# These addresses will be used by Loadbalancer for assigning External IPs to K8s services
+# This variable signifies support for k8s metric collection and Kube Prometheus deployment on kube_control_plane
+# Accepted values: true or false
+k8s_prometheus_support: false
+
+# This variable denotes the time interval at which prometheus collects metrics from its targets
+# This variable accepts input in seconds
+# Default value is 15
+prometheus_scrape_interval: 15
+
+# This variable signifies support for Intel Gaudi Habana metric collection using the Gaudi Prometheus metric exporter.
+# k8s_prometheus_support must be true for this metric support.
+# prometheus_gaudi_support is only available for cluster_os_type: ubuntu and cluster_os_version: 22.04 in software_config.json
+# Accepted values: true or false
+prometheus_gaudi_support: false
+
+##### BELOW VARIABLES ARE MANDATORY IF telemetry ENTRY is PRESENT in software_config.json AND EITHER idrac_telemetry_support OR omnia_telemetry_support OR visualization_support IS true
+###-----CONFIGURATIONS FOR KUBERNETES ON Omnia Infrastructure Manager FOR TELEMETRY SUPPORT------
+# These addresses will be used by Loadbalancer for assigning External IPs to K8s services running on Omnia Infrastructure Manager
 # Make sure the IP range is not assigned to any node in the cluster.
# If admin_nic network provided in network_spec.yml is in 10.11.0.0 network, then pod_external_ip_range should be in the same network, like "10.11.0.60-10.11.0.70" # Acceptable formats: "10.11.0.100-10.11.0.150" , "10.11.0.0/16" +# Provide a different IP range than the external IP range entered in omnia_config.yml # Mandatory Field -pod_external_ip_range: "{{ import_input_parameters_pod_external_ip_range }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +pod_external_ip_range: "{{ telemetry_config_ns.pod_external_ip_range }}" # Value auto populated by Omnia upgrade script # Kubernetes SDN network. # Accepted values: "calico" or "flannel". # Default value assigned is "calico". -k8s_cni: "{{ k8s_cni }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +k8s_cni: "{{ telemetry_config_ns.k8s_cni }}" # Value auto populated by Omnia upgrade script ###------ADVANCE CONFIGURATIONS FOR KUBERNETES------ # Kubernetes internal network for services. # This network must be unused in your network infrastructure. # Default value is "10.233.0.0/18" -k8s_service_addresses: "{{ import_input_parameters_k8s_service_addresses }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +k8s_service_addresses: "{{ telemetry_config_ns.k8s_service_addresses }}" # Value auto populated by Omnia upgrade script # Kubernetes pod network CIDR for internal network. When used, it will assign IP # addresses from this range to individual pods. # This network must be unused in your network infrastructure. # Default value is "10.233.64.0/18" -k8s_pod_network_cidr: "{{ k8s_pod_network_cidr }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +k8s_pod_network_cidr: "{{ telemetry_config_ns.k8s_pod_network_cidr }}" # Value auto populated by Omnia upgrade script # Username used for connecting to timescale db # The username must not contain -,\, '," # The Length of the username should be at least 2 characters. # Mandatory field -timescaledb_user: "{{ timescaledb_user }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +timescaledb_user: "{{ telemetry_config_ns.timescaledb_user }}" # Value auto populated by Omnia upgrade script # Password used for connecting to timescale db # The password must not contain -,\, ',",@ # The Length of the password should be at least 2 characters. # Mandatory field -timescaledb_password: "{{ timescaledb_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +timescaledb_password: "{{ telemetry_config_ns.timescaledb_password }}" # Value auto populated by Omnia upgrade script ##### BELOW VARIABLES ARE MANDATORY WHEN idrac_telemetry_support IS SET TO true # The username for idrac # The username must not contain -,\, '," # Required only if idrac_telemetry_support is true -idrac_username: "{{ idrac_username }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +idrac_username: "{{ telemetry_config_ns.idrac_username }}" # Value auto populated by Omnia upgrade script # Password used for idrac # The password must not contain -,\, '," # Required only if idrac_telemetry_support is true -idrac_password: "{{ idrac_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +idrac_password: "{{ telemetry_config_ns.idrac_password }}" # Value auto populated by Omnia upgrade script ### ADVANCE CONFIGURATIONS FOR IDRAC TELEMETRY ### # Username used for connecting to mysql db @@ -87,19 +103,19 @@ idrac_password: "{{ idrac_password }}" # Value auto populated by Omnia 1.5-1.6 u # The Length of the username should be at least 2 characters.
# The username should not be kept 'root'. # Mandatory field when idrac_telemetry_support is true -mysqldb_user: "{{ mysqldb_user }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +mysqldb_user: "{{ telemetry_config_ns.mysqldb_user }}" # Value auto populated by Omnia upgrade script # Password used for connecting to mysql db # The password must not contain -,\, '," # The Length of the password should be at least 2 characters. # Mandatory field when idrac_telemetry_support is true -mysqldb_password: "{{ mysqldb_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +mysqldb_password: "{{ telemetry_config_ns.mysqldb_password }}" # Value auto populated by Omnia upgrade script # Password used for connecting to mysql db for root user # The password must not contain -,\, '," # The Length of the password should be at least 2 characters. # Mandatory field when idrac_telemetry_support is true -mysqldb_root_password: "{{ mysqldb_root_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +mysqldb_root_password: "{{ telemetry_config_ns.mysqldb_root_password }}" # Value auto populated by Omnia upgrade script ##### BELOW VARIABLES ARE MANDATORY WHEN omnia_telemetry_support IS SET TO true # This variable denotes the time interval of telemetry data collection from required compute nodes @@ -108,19 +124,19 @@ mysqldb_root_password: "{{ mysqldb_root_password }}" # Value auto populated by O # Example 1: omnia_telemetry_collection_interval: 300 # Example 2: omnia_telemetry_collection_interval: 600 # Valid range: minimum 60 seconds and maximum 3600 seconds (i.e. 1 minute to 1 hour) -omnia_telemetry_collection_interval: {{ omnia_telemetry_collection_interval | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +omnia_telemetry_collection_interval: {{ telemetry_config_ns.omnia_telemetry_collection_interval | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script # This variable is used to enable metric collection part of the regular metric group # Accepted values: true or false -collect_regular_metrics: {{ collect_regular_metrics | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +collect_regular_metrics: {{ telemetry_config_ns.collect_regular_metrics | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script # This variable is used to enable metric collection related to health check # Accepted values: true or false -collect_health_check_metrics: {{ collect_health_check_metrics | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +collect_health_check_metrics: {{ telemetry_config_ns.collect_health_check_metrics | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script # This variable is used to enable metric collection related to GPU # Accepted values: true or false -collect_gpu_metrics: {{ collect_gpu_metrics | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +collect_gpu_metrics: {{ telemetry_config_ns.collect_gpu_metrics | lower | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script ### ADVANCE CONFIGURATIONS FOR OMNIA TELEMETRY ### # This variable is used to set an appropriate time interval for all compute nodes so that they do not congest the admin network @@ -131,7 +147,7 @@ collect_gpu_metrics: {{ collect_gpu_metrics | lower | regex_replace('["\']', '') # Example 2: fuzzy_offset: 100 # For larger clusters
this parameter can be set to a higher value # This value should be between 60 and omnia_telemetry_collection_interval value -fuzzy_offset: {{ fuzzy_offset | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +fuzzy_offset: {{ telemetry_config_ns.fuzzy_offset | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script # This variable is used to define data collection timeout period # This variable accepts input in seconds @@ -139,22 +155,22 @@ fuzzy_offset: {{ fuzzy_offset | regex_replace('["\']', '') }} # Value auto popul # Example 1: metric_collection_timeout: 5 # Example 2: metric_collection_timeout: 10 # This value should be greater than 0 and less than omnia_telemetry_collection_interval value -metric_collection_timeout: {{ metric_collection_timeout | regex_replace('["\']', '') }} # Value auto populated by Omnia 1.5-1.6 upgrade script +metric_collection_timeout: {{ telemetry_config_ns.metric_collection_timeout | regex_replace('["\']', '') }} # Value auto populated by Omnia upgrade script ##### BELOW VARIABLES ARE MANDATORY WHEN visualization_support IS SET TO true # The username for grafana UI # The length of username should be at least 5 # The username must not contain -,\, '," -grafana_username: "{{ grafana_username }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +grafana_username: "{{ telemetry_config_ns.grafana_username }}" # Value auto populated by Omnia upgrade script # Password used for grafana UI # The length of the password should be at least 5 # The password must not contain -,\, '," # The password should not be kept 'admin' -grafana_password: "{{ grafana_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +grafana_password: "{{ telemetry_config_ns.grafana_password }}" # Value auto populated by Omnia upgrade script # At this location grafana persistent volume will be created. # If using telemetry, all telemetry related files will also be stored and # both timescale and mysql databases will be mounted to this location. # '/' is mandatory at the end of the path.
-mount_location: "{{ mount_location }}" # Value auto populated by Omnia 1.5-1.6 upgrade script +mount_location: "{{ telemetry_config_ns.mount_location }}" # Value auto populated by Omnia upgrade script diff --git a/upgrade/roles/import_input_parameters/templates/network_spec.j2 b/upgrade/roles/import_input_parameters/templates/network_spec.j2 deleted file mode 100644 index 9fb234b64..000000000 --- a/upgrade/roles/import_input_parameters/templates/network_spec.j2 +++ /dev/null @@ -1,41 +0,0 @@ ---- -Networks: - - admin_network: - nic_name: "{{ admin_nic }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - netmask_bits: "{{ import_input_parameters_admin_network_netmask_bits }}" - static_range: "{{ import_input_parameters_admin_network_static_range }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - dynamic_range: "{{ import_input_parameters_admin_network_dynamic_range }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - correlation_to_admin: {{ "true" }} - admin_uncorrelated_node_start_ip: "{{ import_input_parameters_uncorrelated_node_start_ip }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - network_gateway: "{{ import_input_parameters_admin_network_network_gateway }}" - DNS: "{{ import_input_parameters_admin_network_dns }}" - MTU: "{{ import_input_parameters_admin_network_mtu }}" - - - bmc_network: - nic_name: "{{ bmc_network_nic }}" - netmask_bits: "{{ import_input_parameters_bmc_network_netmask_bits }}" - static_range: "{{ import_input_parameters_bmc_network_static_range }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - dynamic_range: "{{ import_input_parameters_bmc_network_dynamic_range }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - reassignment_to_static: {{ "true" }} - discover_range: "{{ import_input_parameters_bmc_network_discover_range }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - network_gateway: "{{ import_input_parameters_bmc_network_network_gateway }}" - MTU: "{{ import_input_parameters_bmc_network_mtu }}" - -#********************************************************************** -# Following are the templates for providing additional network details -#********************************************************************** - -# - thor_network1: -# netmask_bits: "{{ import_input_parameters_thor_network1_netmask_bits }}" -# CIDR: "{{ import_input_parameters_thor_network1_cidr }}" -# network_gateway: "{{ import_input_parameters_thor_network1_network_gateway }}" -# MTU: "{{ import_input_parameters_thor_network1_mtu }}" -# VLAN: "{{ import_input_parameters_thor_network1_vlan }}" - -# - thor_network2: -# netmask_bits: "{{ import_input_parameters_thor_network2_netmask_bits }}" -# static_range: "{{ import_input_parameters_thor_network2_static_range }}" -# network_gateway: "{{ import_input_parameters_thor_network2_network_gateway }}" -# MTU: "{{ import_input_parameters_thor_network2_mtu }}" -# VLAN: "{{ import_input_parameters_thor_network2_vlan }}" - diff --git a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 b/upgrade/roles/import_input_parameters/templates/omnia_config.j2 deleted file mode 100644 index 65aed4cf7..000000000 --- a/upgrade/roles/import_input_parameters/templates/omnia_config.j2 +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. -# SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** - -# Path to directory hosting ansible config file (ansible.cfg file) -# Default value is "/etc/ansible" -# This directory is on the host running ansible, if ansible is installed using dnf -# If ansible is installed using pip, this path should be set -ansible_config_file_path: "{{ ansible_config_file_path }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - -# -----------------------------SLURM------------------------------------------------ - -# Password used for Slurm database. -# The Length of the password should be at least 8. -# The password must not contain -,\, '," -mariadb_password: "{{ mariadb_password }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - -# This variable accepts whether slurm installation is supported in configless mode or slurm in nfs -# Default value is "configless" -# If the value is "nfs_share", then share_path has to be mentioned -# Slurm should be installed in share_path when slurm_installation_type is set to nfs_share -slurm_installation_type: "configless" # Value auto populated by Omnia 1.5-1.6 upgrade script - -# Variable indicates whether slurm control node services(slurmctld) should be restarted or not -# If restart_slurm_services is set to true, slurmctld services will be restarted on every execution of omnia.yml. -# It accepts true and false values -# Default value is true -restart_slurm_services: {{ import_input_parameters_restart_slurm_services | lower | regex_replace('["\']', '') }} - -#----------------------------K8S------------------------------------------------------ - -# Kubernetes SDN network. -# It can either be "calico" or "flannel". -# Default value assigned is "calico". -k8s_cni: "{{ k8s_cni }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - -# These addresses will be used by Loadbalancer for assigning External IPs to K8s services -# Make sure the IP range is not assigned to any node in the cluster. -# Acceptable formats: "10.11.0.100-10.11.0.150" , "10.11.0.0/16" -# Mandatory Field -pod_external_ip_range: "{{ import_input_parameters_pod_external_ip_range }}" # Value auto populated by Omnia 1.5-1.6 upgrade script - - -###------ADVANCE CONFIGURATIONS FOR KUBERNETES------ -# Kubernetes internal network for services. -# This network must be unused in your network infrastructure. -# Default value is "10.233.0.0/18" -k8s_service_addresses: "{{ import_input_parameters_k8s_service_addresses }}" - -# Kubernetes pod network CIDR for internal network. When used, it will assign IP -# addresses from this range to individual pods. -# This network must be unused in your network infrastructure. 
-# Default value is "10.233.64.0/18" -k8s_pod_network_cidr: "{{ k8s_pod_network_cidr }}" # Value auto populated by Omnia 1.5-1.6 upgrade script diff --git a/upgrade/roles/import_input_parameters/vars/main.yml b/upgrade/roles/import_input_parameters/vars/main.yml index 97dac2c1b..19ae733e6 100644 --- a/upgrade/roles/import_input_parameters/vars/main.yml +++ b/upgrade/roles/import_input_parameters/vars/main.yml @@ -13,44 +13,52 @@ # limitations under the License. ## File arrays -# Usage: read_parameters.yml +# Usage: generate_new_input_files.yml import_input_parameters_config_file_array: - file: "omnia_config.yml" vault_password_file: ".omnia_vault_key" - file: "provision_config.yml" - vault_password_file: ".provision_vault_key" + vault_password_file: "" + - file: "provision_config_credentials.yml" + vault_password_file: ".provision_credential_vault_key" + - file: "network_spec.yml" + vault_password_file: "" - file: "network_config.yml" vault_password_file: "" - file: "security_config.yml" vault_password_file: ".security_vault.key" - file: "storage_config.yml" vault_password_file: "" + - file: "local_repo_config.yml" + vault_password_file: "" - file: "telemetry_config.yml" vault_password_file: ".telemetry_vault_key" - - file: "accelerator_config.yml" + - file: "k8s_access_config.yml" vault_password_file: "" - file: "login_node_security_config.yml" vault_password_file: "" - file: "passwordless_ssh_config.yml" vault_password_file: "" - - file: "rhsm_config.yml" + - file: "roce_plugin_config.yml" + vault_password_file: "" + - file: "server_spec.yml" vault_password_file: "" + import_input_parameters_new_config_file_array: [] import_input_parameters_config_encrypt_array: - file: "omnia_config.yml" vault_password_file: ".omnia_vault_key" - - file: "provision_config.yml" - vault_password_file: ".provision_vault_key" + - file: "provision_config_credentials.yml" + vault_password_file: ".provision_credential_vault_key" - file: "security_config.yml" vault_password_file: ".security_vault.key" - file: "telemetry_config.yml" vault_password_file: ".telemetry_vault_key" -read_parameters_failed_msg: "Failed to read Omnia v1.5 input config files" +read_parameters_failed_msg: "Failed to read installed Omnia input config files: {{ failed_input_files }}. Please check for any syntax errors.
" -# Usage: generate_new_input_files.yml import_input_parameters_config_template_array: - src: 'omnia_config.j2' dest: 'omnia_config.yml' @@ -66,77 +74,35 @@ import_input_parameters_config_template_array: dest: 'storage_config.yml' - src: 'telemetry_config.j2' dest: 'telemetry_config.yml' + - src: 'local_repo_config.j2' + dest: 'local_repo_config.yml' - src: 'network_spec.j2' dest: 'network_spec.yml' + - src: 'k8s_access_config.j2' + dest: 'k8s_access_config.yml' + - src: 'login_node_security_config.j2' + dest: 'login_node_security_config.yml' + - src: 'passwordless_ssh_config.j2' + dest: 'passwordless_ssh_config.yml' + - src: 'roce_plugin_config.j2' + dest: 'roce_plugin_config.yml' + - src: 'server_spec.j2' + dest: 'server_spec.yml' -input_file_perm: "0644" - -## Template variables -# Usage: omnia_config.j2 -import_input_parameters_restart_slurm_services: true -import_input_parameters_k8s_service_addresses: "10.233.0.0/18" - -# Usage: network_spec.j2, set_network_spec_variables.yml -import_input_parameters_admin_network_netmask_bits: "16" -import_input_parameters_admin_network_static_range: "" -import_input_parameters_admin_network_dynamic_range: "" -import_input_parameters_admin_network_correlation_to_admin: true -import_input_parameters_admin_network_admin_uncorrelated_node_start_ip: "" -import_input_parameters_admin_network_network_gateway: "" -import_input_parameters_admin_network_dns: "" -import_input_parameters_admin_network_mtu: "1500" -import_input_parameters_admin_network_vlan: "" - -bmc_network_nic: "" -import_input_parameters_bmc_network_netmask_bits: "" -import_input_parameters_bmc_network_static_range: "" -import_input_parameters_bmc_network_discover_range: "" -import_input_parameters_bmc_network_dynamic_range: "" -import_input_parameters_bmc_network_reassignment_to_static: true -import_input_parameters_bmc_network_network_gateway: "" -import_input_parameters_bmc_network_mtu: "1500" -import_input_parameters_bmc_network_vlan: "" - -import_input_parameters_ib_network1_netmask_bits: "16" -import_input_parameters_ib_network1_cidr: "" -import_input_parameters_ib_network1_static_range: "" -import_input_parameters_ib_network1_network_gateway: "" -import_input_parameters_ib_network1_mtu: "1500" -import_input_parameters_ib_network1_vlan: "1" -import_input_parameters_thor_network1_netmask_bits: "20" -import_input_parameters_thor_network1_cidr: "10.10.16.0" -import_input_parameters_thor_network1_static_range: "" -import_input_parameters_thor_network1_network_gateway: "" -import_input_parameters_thor_network1_mtu: "1500" -import_input_parameters_thor_network1_vlan: "" +file_perm: "0644" -import_input_parameters_thor_network2_netmask_bits: "20" -import_input_parameters_thor_network2_cidr: "" -import_input_parameters_thor_network2_static_range: "10.10.1.1-10.10.15.254" -import_input_parameters_thor_network2_network_gateway: "" -import_input_parameters_thor_network2_mtu: "1500" -import_input_parameters_thor_network2_vlan: "1" - -# Usage: provision_config.j2 -import_input_parameters_enable_switch_based: false - -# Usage: security_config.j2 -import_input_parameters_tls_certificate: "" -import_input_parameters_tls_certificate_key: "" -import_input_parameters_user_home_dir: "/home" -import_input_parameters_openldap_db_username: "admin" -import_input_parameters_openldap_db_password: "" -import_input_parameters_openldap_config_username: "admin" -import_input_parameters_openldap_config_password: "" -import_input_parameters_openldap_monitor_password: "" 
-import_input_parameters_openldap_organization: "omnia" -import_input_parameters_openldap_organizational_unit: "People" - - -## Messages displayed to user # Usage: import_input_parameters.yml import_input_parameters_successful_msg: "Prepare upgrade completed. Kindly confirm migrated values and provide input for newly introduced values." -# Usage: encrypt_1_5_inputs.yml -vault_file_perm: '0644' + +input_folder: "{{ role_path }}/../../../input" +input_config_files_array: + - file: "omnia_config.yml" + vault_password_file: ".omnia_vault_key" + - file: "provision_config_credentials.yml" + vault_password_file: ".provision_credential_vault_key" + - file: "security_config.yml" + vault_password_file: ".security_vault.key" + - file: "telemetry_config.yml" + vault_password_file: ".telemetry_vault_key" diff --git a/upgrade/roles/metadata_update/tasks/include_network_spec.yml b/upgrade/roles/metadata_update/tasks/include_network_spec.yml deleted file mode 100644 index e8470767b..000000000 --- a/upgrade/roles/metadata_update/tasks/include_network_spec.yml +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Include network_spec.yml - block: - - name: Include network_spec file - ansible.builtin.include_vars: "{{ network_spec }}" - register: include_network_spec - no_log: true - tags: init - rescue: - - name: Failed to include network_spec.yml - ansible.builtin.fail: - msg: "{{ network_spec_syntax_fail_msg }} Error: {{ include_network_spec.message }}" - -- name: Parse network_spec data - ansible.builtin.set_fact: - network_data: "{{ network_data | default({}) | combine({item.key: item.value}) }}" - with_dict: "{{ Networks }}" - -- name: Set pxe nic static start and end ranges - ansible.builtin.set_fact: - pxe_nic_start_range: "{{ network_data.admin_network.static_range | split('-') | first }}" - pxe_nic_end_range: "{{ network_data.admin_network.static_range | split('-') | last }}" - -- name: Set pxe nic dynamic start and end ranges - ansible.builtin.set_fact: - pxe_nic_dynamic_start_range: "{{ network_data.admin_network.dynamic_range | split('-') | first }}" - pxe_nic_dynamic_end_range: "{{ network_data.admin_network.dynamic_range | split('-') | last }}" - -- name: Set default admin uncorrelated ip - ansible.builtin.set_fact: - admin_uncorrelated_node_start_ip: "{{ pxe_nic_start_range }}" - when: network_data.admin_network.admin_uncorrelated_node_start_ip | default("", true) | length == 0 - -- name: Set admin uncorrelated ip - ansible.builtin.set_fact: - admin_uncorrelated_node_start_ip: "{{ network_data.admin_network.admin_uncorrelated_node_start_ip }}" - when: network_data.admin_network.admin_uncorrelated_node_start_ip | default("", true) | length > 0 - -- name: Check admin network details are valid - block: - - name: Execute Python script to validate network address - ansible.builtin.command: "{{ python_version }} {{ network_address_script }} admin_network" - register: script_output - 
changed_when: false - environment: - net_data: "{{ network_data | to_json }}" - rescue: - - name: Failed, Invalid admin network details - ansible.builtin.fail: - msg: "{{ fail_msg_admin_nic_details }} {{ fail_msg_admin_nic_ip_details }}" - -- name: Validate admin nic ip status - ansible.builtin.assert: - that: - - script_output.stdout | length > 0 - - script_output.rc == 0 - fail_msg: "{{ fail_msg_admin_nic_details }} {{ fail_msg_admin_nic_ip_details }}" - -- name: Set pxe_nic_ip - ansible.builtin.set_fact: - admin_nic_ip: "{{ script_output.stdout }}" - admin_nic: "{{ network_data.admin_network.nic_name }}" - -- name: Set default admin uncorrelated ip - ansible.builtin.set_fact: - admin_uncorrelated_node_start_ip: "{{ pxe_nic_start_range }}" - when: network_data.admin_network.admin_uncorrelated_node_start_ip | default("", true) | length == 0 - -- name: Set admin uncorrelated ip - ansible.builtin.set_fact: - admin_uncorrelated_node_start_ip: "{{ network_data.admin_network.admin_uncorrelated_node_start_ip }}" - when: network_data.admin_network.admin_uncorrelated_node_start_ip | default("", true) | length > 0 - -- name: Validate admin correlation - ansible.builtin.assert: - that: - - network_data.admin_network.correlation_to_admin == true or - network_data.admin_network.correlation_to_admin == false - fail_msg: "{{ admin_correlation_fail_msg }}" - when: network_data.admin_network.correlation_to_admin != None - -- name: Set default correlation_to_admin status - ansible.builtin.set_fact: - correlation_status: false - when: network_data.admin_network.correlation_to_admin == None - -- name: Set correlation_to_admin status - ansible.builtin.set_fact: - correlation_status: "{{ network_data.admin_network.correlation_to_admin }}" - when: network_data.admin_network.correlation_to_admin != None - -- name: Assign values to pxe nic - ansible.builtin.set_fact: - pxe_nic: "{{ admin_nic }}" - pxe_nic_ip: "{{ admin_nic_ip }}" - pxe_nic_subnet: "{{ (admin_nic_ip + '/' + network_data.admin_network.netmask_bits) | ansible.utils.ipaddr('network') }}" - pxe_nic_netmask: "{{ (admin_nic_ip + '/' + network_data.admin_network.netmask_bits) | ansible.utils.ipaddr('netmask') }}" - -- name: Gather the MAC address of admin nic - ansible.builtin.set_fact: - pxe_mac_address: "{{ ansible_facts[pxe_nic]['macaddress'] | default(None) }}" diff --git a/upgrade/roles/metadata_update/tasks/main.yml b/upgrade/roles/metadata_update/tasks/main.yml deleted file mode 100644 index 577eec365..000000000 --- a/upgrade/roles/metadata_update/tasks/main.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
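The deleted include_network_spec.yml above leaned on two small idioms worth noting: a "start-end" range string is split into separate facts, and ansible.utils.ipaddr derives the subnet and netmask from the NIC IP plus netmask bits. A standalone sketch of both, with made-up values used purely for illustration:

- name: Split a "start-end" range and derive subnet/netmask (illustrative values)
  vars:
    static_range: "10.11.0.100-10.11.0.150"
    nic_ip: "10.11.0.1"
    netmask_bits: "16"
  ansible.builtin.set_fact:
    range_start: "{{ static_range | split('-') | first }}"    # -> "10.11.0.100"
    range_end: "{{ static_range | split('-') | last }}"       # -> "10.11.0.150"
    nic_subnet: "{{ (nic_ip + '/' + netmask_bits) | ansible.utils.ipaddr('network') }}"   # -> "10.11.0.0"
    nic_netmask: "{{ (nic_ip + '/' + netmask_bits) | ansible.utils.ipaddr('netmask') }}"  # -> "255.255.0.0"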
---- -# Commented 'Remove old metadata' task since upgrade_xcat requires 1.5.1 param present in metadata file -# - name: Remove old metadata -# ansible.builtin.import_tasks: remove_old_metadata.yml - -- name: Read network spec - ansible.builtin.import_tasks: include_network_spec.yml - -- name: Set BMC NIC details - ansible.builtin.import_tasks: set_bmc_nic_vars.yml - -- name: Create metadata - ansible.builtin.import_tasks: update_metadata.yml diff --git a/upgrade/roles/metadata_update/tasks/remove_old_metadata.yml b/upgrade/roles/metadata_update/tasks/remove_old_metadata.yml deleted file mode 100644 index ec0d98d71..000000000 --- a/upgrade/roles/metadata_update/tasks/remove_old_metadata.yml +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Check metadata.yml file existence - ansible.builtin.stat: - path: "{{ meta_path }}" - register: metadata_status - -- name: Delete metadata.yml file if it exists - ansible.builtin.file: - path: "{{ meta_path }}" - state: absent - when: metadata_status.stat.exists diff --git a/upgrade/roles/metadata_update/tasks/set_bmc_nic_vars.yml b/upgrade/roles/metadata_update/tasks/set_bmc_nic_vars.yml deleted file mode 100644 index fd5b8e03e..000000000 --- a/upgrade/roles/metadata_update/tasks/set_bmc_nic_vars.yml +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Set BMC details status - ansible.builtin.set_fact: - bmc_details_status: false - -- name: Checking BMC network input - ansible.builtin.set_fact: - bmc_details_status: true - when: - - network_data.bmc_network.nic_name | default("", true) | length > 0 - - network_data.bmc_network.netmask_bits | default("", true) | length > 0 - - network_data.bmc_network.static_range | default("", true) | length > 0 - - network_data.bmc_network.dynamic_range | default("", true) | length > 0 - -- name: Set BMC network details - when: - - bmc_details_status - block: - - name: Set the BMC dynamic start and end range - ansible.builtin.set_fact: - bmc_dynamic_start_range: "{{ network_data.bmc_network.dynamic_range | split('-') | first }}" - bmc_dynamic_end_range: "{{ network_data.bmc_network.dynamic_range | split('-') | last }}" - - - name: Set the BMC static start and end range - ansible.builtin.set_fact: - bmc_static_start_range: "{{ network_data.bmc_network.static_range | split('-') | first }}" - bmc_static_end_range: "{{ network_data.bmc_network.static_range | split('-') | last }}" - - - name: Initialize variables - ansible.builtin.set_fact: - bmc_dynamic_status: true - bmc_static_status: false - bmc_discover_range_status: "{{ network_data.bmc_network.discover_ranges | default('', true) | length > 1 }}" - - - name: Update bmc_static_status when static range is provided - ansible.builtin.set_fact: - bmc_static_status: true - when: - - bmc_static_start_range | default("", true) | length > 1 - - bmc_static_end_range | default("", true) | length > 1 - - - name: Check bmc network details - block: - - name: Execute Python script to validate network address - ansible.builtin.command: "{{ python_version }} {{ network_address_script }} bmc_network" - register: script_output - changed_when: false - environment: - net_data: "{{ network_data | to_json }}" - rescue: - - name: Failed, Invalid bmc network details - ansible.builtin.fail: - msg: "{{ fail_msg_bmc_nic_details }}" - - - name: Validate BMC nic ip status - ansible.builtin.assert: - that: - - script_output.stdout | length > 0 - - script_output.rc == 0 - fail_msg: "{{ fail_msg_bmc_nic_details }}" - - - name: Set bmc_ip - ansible.builtin.set_fact: - bmc_nic_ip: "{{ script_output.stdout }}" - bmc_nic: "{{ network_data.bmc_network.nic_name }}" - when: - - script_output.rc == 0 - - - name: Assign values to BMC nic - ansible.builtin.set_fact: - bmc_nic_subnet: "{{ (bmc_nic_ip + '/' + network_data.bmc_network.netmask_bits) | ansible.utils.ipaddr('network') }}" - bmc_nic_netmask: "{{ (bmc_nic_ip + '/' + network_data.bmc_network.netmask_bits) | ansible.utils.ipaddr('netmask') }}" diff --git a/upgrade/roles/metadata_update/tasks/update_metadata.yml b/upgrade/roles/metadata_update/tasks/update_metadata.yml deleted file mode 100644 index 6b28406fc..000000000 --- a/upgrade/roles/metadata_update/tasks/update_metadata.yml +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Check metadata.yml file existence - ansible.builtin.stat: - path: "{{ meta_path }}" - register: metadata_status - -- name: Create metadata.yml file if it doesn't exists - ansible.builtin.file: - path: "{{ meta_path }}" - state: touch - mode: "{{ conf_file_mode }}" - group: root - owner: root - when: not metadata_status.stat.exists - -- name: Update netmask for admin and bmc network in metadata.yml file - block: - - name: Update netmask_bits - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_netmask_bits:(.*)$' - insertafter: "EOF" - state: present - line: 'md_netmask_bits: {{ network_data.admin_network.netmask_bits }}' - -- name: Update md_admin_static_start_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_admin_static_start_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_admin_static_start_range: {{ pxe_nic_start_range }}' - -- name: Update md_admin_static_end_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_admin_static_end_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_admin_static_end_range: {{ pxe_nic_end_range }}' - -- name: Update md_admin_dynamic_start_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_admin_dynamic_start_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_admin_dynamic_start_range: {{ pxe_nic_dynamic_start_range }}' - -- name: Update md_admin_dynamic_end_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_admin_dynamic_end_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_admin_dynamic_end_range: {{ pxe_nic_dynamic_end_range }}' - -- name: Update md_admin_nic_subnet - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_admin_nic_subnet:(.*)$' - insertafter: "EOF" - state: present - line: 'md_admin_nic_subnet: {{ pxe_nic_subnet }}' - -- name: Update md_admin_nic_ip - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_admin_nic_ip:(.*)$' - insertafter: "EOF" - state: present - line: 'md_admin_nic_ip: {{ admin_nic_ip }}' - -- name: Update the mapping file status - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_discovery_mech_mapping:(.*)$' - insertafter: "EOF" - state: present - line: 'md_discovery_mech_mapping: {{ discovery_mech_mapping }}' - -- name: Update metadata.yml file for bmc discovery_mechanism - when: bmc_details_status - block: - - name: Update md_bmc_static_start_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_bmc_static_start_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_bmc_static_start_range: {{ bmc_static_start_range }}' - - - name: Update md_bmc_static_end_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_bmc_static_end_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_bmc_static_end_range: {{ bmc_static_end_range }}' - - - name: Update md_bmc_dynamic_start_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_bmc_dynamic_start_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_bmc_dynamic_start_range: {{ bmc_dynamic_start_range }}' - - - name: Update md_bmc_dynamic_end_range - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_bmc_dynamic_end_range:(.*)$' - insertafter: "EOF" - state: present - line: 'md_bmc_dynamic_end_range: {{ bmc_dynamic_end_range }}' - - 
- name: Update md_bmc_nic_ip - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_bmc_nic_ip:(.*)$' - insertafter: "EOF" - state: present - line: 'md_bmc_nic_ip: {{ bmc_nic_ip }}' - - - name: Update md_bmc_nic_subnet - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_bmc_nic_subnet:(.*)$' - insertafter: "EOF" - state: present - line: 'md_bmc_nic_subnet: {{ bmc_nic_subnet }}' - -- name: Update md_discovery_mech_bmc - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_discovery_mech_bmc:(.*)$' - insertafter: "EOF" - state: present - line: 'md_discovery_mech_bmc: {{ discovery_mech_bmc }}' - -- name: Update md_discovery_mech_bmc - ansible.builtin.lineinfile: - path: "{{ meta_path }}" - regexp: '^md_discovery_mech_switch_based:(.*)$' - insertafter: "EOF" - state: present - line: 'md_discovery_mech_switch_based: {{ discovery_mech_switch_based }}' diff --git a/upgrade/roles/metadata_update/vars/main.yml b/upgrade/roles/metadata_update/vars/main.yml deleted file mode 100644 index 70252c550..000000000 --- a/upgrade/roles/metadata_update/vars/main.yml +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -provision_metadata_path: "/opt/omnia/.data/metadata.yml" - -# Usage: update_metadata.yml -meta_path: "/opt/omnia/.data/metadata.yml" -conf_file_mode: "0644" - -# Usage: include_network_spec.yml -network_spec: "{{ role_path }}/../../../input/network_spec.yml" -network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml. Fix errors and re-run playbook again." -python_version: python3.9 -network_address_script: "{{ role_path }}/../../../discovery/roles/discovery_validations/common/files/validate_network_address.py" -success_msg_nic: "nic successfully validated" -fail_msg_admin_nic_details: "Failed. Invalid admin_nic details (nic_name, netmask_bits, static_range or dynamic_range) in network_spec file." -fail_msg_admin_nic_ip_details: "Ensure admin nic is configured with ip address." -validate_nic_status: "Failed, please check the network interface status should be UP" -admin_nic_netmask_fail_msg: "Failed, Admin nic netmask should be same as netmask in network_spec file." -admin_correlation_fail_msg: "Failed. Invalid details provided, correlation_to_admin should true or false." -fail_msg_bmc_nic_details: "Failed. Invalid bmc_network details (nic_name, netmask_bits, static_range or dynamic_range) in network_spec file." -discovery_mech_mapping: true -discovery_mech_bmc: false -discovery_mech_switch_based: false diff --git a/upgrade/roles/omnia_telemetry/tasks/check_and_revert.yml b/upgrade/roles/omnia_telemetry/tasks/check_and_revert.yml new file mode 100644 index 000000000..bc9df9c9b --- /dev/null +++ b/upgrade/roles/omnia_telemetry/tasks/check_and_revert.yml @@ -0,0 +1,65 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check omnia_telemetry service status on nodes where it was running previously + ansible.builtin.service_facts: + register: service_status + delegate_to: "{{ item }}" + with_items: "{{ running_nodes }}" + +- name: Get list of nodes where service is not running + ansible.builtin.set_fact: + not_running_nodes: "{{ not_running_nodes | default([]) + [item.item] }}" + when: + - "'omnia_telemetry.service' in item.ansible_facts.services and item.ansible_facts.services['omnia_telemetry.service'].state not in ['running', 'started']" + with_items: "{{ service_status.results }}" + loop_control: + label: "{{ item.item }}" + + +- name: Warn and revert to old binaries if service failed to start + when: not_running_nodes is defined and (not_running_nodes | length > 0) + block: + - name: Warn if omnia_telemetry service failed to start on any node + ansible.builtin.debug: + msg: "{{ warning_msg_omnia_telemetry.split('\n') }}" + + - name: Replace original binaries on all nodes as service failed to start + ansible.builtin.copy: + src: "{{ old_binary_files_path }}" + dest: "{{ omnia_telemetry_dest }}" + force: true + owner: root + group: root + mode: "{{ binary_mode }}" + delegate_to: "{{ item }}" + with_items: "{{ running_nodes }}" + + - name: Start omnia_telemetry service on nodes after replacing old binaries + ansible.builtin.service: + name: omnia_telemetry + state: started + delegate_to: "{{ item }}" + with_items: "{{ running_nodes }}" + +- name: Sync new binaries to /opt/omnia/telemetry + ansible.posix.synchronize: + src: "{{ new_binaries_dir }}/" + dest: "{{ omnia_telemetry_dest }}/" + mode: push + rsync_opts: + - "-a" + - "-v" + - "-h" diff --git a/upgrade/roles/omnia_telemetry/tasks/create_binary.yml b/upgrade/roles/omnia_telemetry/tasks/create_binary.yml new file mode 100644 index 000000000..171d29d61 --- /dev/null +++ b/upgrade/roles/omnia_telemetry/tasks/create_binary.yml @@ -0,0 +1,40 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
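check_and_revert.yml samples each node's unit state once via service_facts; a stricter variant would poll until the unit is actually active before deciding whether to revert. A hedged sketch of that hardening, not part of the role as committed:

- name: Wait until omnia_telemetry is active on each node (illustrative hardening)
  ansible.builtin.systemd:
    name: omnia_telemetry
    state: started
  register: telemetry_svc
  retries: 5
  delay: 10
  until: telemetry_svc.status.ActiveState | default('') == 'active'
  delegate_to: "{{ item }}"
  with_items: "{{ running_nodes }}"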
+--- + + +- name: Install python pyinstaller + ansible.builtin.command: "{{ python_version }} -m pip install {{ pyinstaller_python_package }}" + changed_when: true + +- name: Install python psutil + ansible.builtin.command: "{{ python_version }} -m pip install {{ psutil_python_package }}" + changed_when: true + +- name: Create telemetry upgrade temp directory + ansible.builtin.file: + path: "{{ upgrade_folder_path }}/telemetry" + state: directory + mode: "{{ directory_permissions }}" + +- name: Telemetry binary creation + ansible.builtin.command: | + pyinstaller -F collector.py + --distpath {{ telemetry_binary.dist_path }} + --workpath {{ telemetry_binary.build_path }} + --specpath {{ telemetry_binary.spec_path }} + -n {{ telemetry_binary.binary_name }} + changed_when: true + args: + chdir: "{{ telemetry_binary.python_file_path }}" diff --git a/upgrade/roles/upgrade_xcat/tasks/update_xcat_tables.yml b/upgrade/roles/omnia_telemetry/tasks/main.yml similarity index 72% rename from upgrade/roles/upgrade_xcat/tasks/update_xcat_tables.yml rename to upgrade/roles/omnia_telemetry/tasks/main.yml index 6807b679c..19c092ad9 100644 --- a/upgrade/roles/upgrade_xcat/tasks/update_xcat_tables.yml +++ b/upgrade/roles/omnia_telemetry/tasks/main.yml @@ -13,9 +13,8 @@ # limitations under the License. --- -- name: Include metdata file - ansible.builtin.include_vars: "{{ metadata_path }}" +- name: Replace binary and start omnia_telemetry service + ansible.builtin.include_tasks: replace_and_start.yml -- name: Configure site table - ansible.builtin.command: chdef -t site nameservers="{{ md_pxe_nic_ip }}" - changed_when: true +- name: Check and revert if service has not started after replace + ansible.builtin.include_tasks: check_and_revert.yml diff --git a/upgrade/roles/omnia_telemetry/tasks/replace_and_start.yml b/upgrade/roles/omnia_telemetry/tasks/replace_and_start.yml new file mode 100644 index 000000000..e9f450607 --- /dev/null +++ b/upgrade/roles/omnia_telemetry/tasks/replace_and_start.yml @@ -0,0 +1,32 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Transfer all binaries to compute nodes + ansible.builtin.copy: + src: "{{ binary_files_path }}" + dest: "{{ omnia_telemetry_dest }}" + force: true + owner: root + group: root + mode: "{{ binary_mode }}" + delegate_to: "{{ item }}" + loop: "{{ running_nodes }}" + +- name: Start omnia_telemetry service on nodes where it was stopped + ansible.builtin.service: + name: omnia_telemetry + state: started + delegate_to: "{{ item }}" + loop: "{{ running_nodes }}" diff --git a/upgrade/roles/omnia_telemetry/tasks/stop_omnia_telemetry.yml b/upgrade/roles/omnia_telemetry/tasks/stop_omnia_telemetry.yml new file mode 100644 index 000000000..17f249263 --- /dev/null +++ b/upgrade/roles/omnia_telemetry/tasks/stop_omnia_telemetry.yml @@ -0,0 +1,49 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Initialize list of omnia_telemetry service running nodes + ansible.builtin.set_fact: + running_nodes: [] + host_list: [] + +- name: Get provisioned nodes from /opt/omnia/omnia_inventory + ansible.builtin.set_fact: + host_list: "{{ groups['compute_hostname_ip'] }}" + when: + - groups['compute_hostname_ip'] is defined + - groups['compute_hostname_ip'] | length > 0 + +- name: Get service facts from nodes + ansible.builtin.service_facts: + delegate_to: "{{ item }}" + with_items: "{{ host_list }}" + register: service_status + +- name: Add node to running list if service is running + ansible.builtin.set_fact: + running_nodes: "{{ running_nodes + [item.item] }}" + when: + - service_status is defined + - "'omnia_telemetry.service' in item.ansible_facts.services and item.ansible_facts.services['omnia_telemetry.service'].state in ['running', 'started']" + with_items: "{{ service_status.results }}" + loop_control: + label: "{{ item.item }}" + +- name: Stop omnia_telemetry service on running nodes + ansible.builtin.service: + name: omnia_telemetry + state: stopped + delegate_to: "{{ item }}" + with_items: "{{ running_nodes }}" diff --git a/upgrade/roles/omnia_telemetry/vars/main.yml b/upgrade/roles/omnia_telemetry/vars/main.yml new file mode 100644 index 000000000..f7e5ca381 --- /dev/null +++ b/upgrade/roles/omnia_telemetry/vars/main.yml @@ -0,0 +1,44 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
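Because stop_omnia_telemetry.yml runs service_facts under delegate_to in a loop, each node's facts come back inside the registered loop results rather than in hostvars, so per-node state has to be read from service_status.results. A small illustrative task showing that access pattern (debug output only, not part of the role):

- name: Show omnia_telemetry state reported by each node (illustrative)
  ansible.builtin.debug:
    msg: "{{ item.item }}: {{ item.ansible_facts.services['omnia_telemetry.service'].state | default('unit not present') }}"
  with_items: "{{ service_status.results }}"
  loop_control:
    label: "{{ item.item }}"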
+--- + +# Usage: python_package_installation.yml +python_version: "{{ ansible_python_interpreter }}" +pyinstaller_python_package: pyinstaller +psutil_python_package: psutil + +# Usage: telemetry_binary_creation.yml +temp_download_dir: "/tmp" +max_retries: 10 +directory_permissions: "0755" +upgrade_folder_path: "{{ temp_download_dir }}/upgrade" +telemetry_binary: + dist_path: "{{ upgrade_folder_path }}/telemetry/dist" + build_path: "{{ upgrade_folder_path }}/telemetry/build" + spec_path: "{{ upgrade_folder_path }}/telemetry/" + binary_name: omnia_telemetry + python_file_path: "{{ role_path }}/../../../prepare_oim/roles/omnia_telemetry_oim/files" + +# Usage: replace_and_start.yml +binary_files_path: "{{ upgrade_folder_path }}/telemetry/dist/omnia_telemetry" +omnia_telemetry_dest: "/opt/omnia/telemetry" +binary_mode: "0500" + +# Usage: check_and_revert.yml +old_binary_files_path: "/opt/omnia/telemetry/dist/omnia_telemetry" +new_binaries_dir: "{{ upgrade_folder_path }}/telemetry" +warning_msg_omnia_telemetry: | + "[Warning] Omnia_telemetry binary of Omnia {{ upgrade_omnia_version }} is not replaced on compute nodes because + the omnia_telemetry service failed to start on {{ not_running_nodes }}. + After a successful upgrade, run telemetry.yml to deploy the binary from Omnia {{ upgrade_omnia_version }}." diff --git a/upgrade/roles/uninstall_open_ldap/vars/main.yml b/upgrade/roles/post_upgrade/tasks/main.yml similarity index 67% rename from upgrade/roles/uninstall_open_ldap/vars/main.yml rename to upgrade/roles/post_upgrade/tasks/main.yml index 193836fe5..a206b6026 100644 --- a/upgrade/roles/uninstall_open_ldap/vars/main.yml +++ b/upgrade/roles/post_upgrade/tasks/main.yml @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -# Usage: main.yml -software_config_json_file: "{{ role_path }}/../../../input/software_config.json" -uninstall_open_ldap_packages: - - openldap-clients - - sssd - - sssd-ldap - - oddjob-mkhomedir - - openssl-perl -uninstall_open_ldap_conf_dest: "/etc/openldap/ldap.conf" + +- name: Update installed_version + ansible.builtin.lineinfile: + path: "{{ meta_path }}" + regexp: '^installed_version:(.*)$' + insertafter: "EOF" + state: present + line: 'installed_version: "1.7"' + +- name: Print user message + ansible.builtin.debug: + msg: "{{ user_msg_upgrade.split('\n') }}" diff --git a/upgrade/roles/post_upgrade/vars/main.yml b/upgrade/roles/post_upgrade/vars/main.yml new file mode 100644 index 000000000..4dfddbbf4 --- /dev/null +++ b/upgrade/roles/post_upgrade/vars/main.yml @@ -0,0 +1,53 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
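The installed_version task above relies on lineinfile's update-or-append behavior: when regexp matches an existing line it is rewritten in place, and insertafter: EOF appends the line when nothing matches, which makes repeated runs idempotent. A self-contained sketch of the same idiom against a scratch file (the path is illustrative only):

- name: Upsert a key in a metadata-style file (illustrative path)
  ansible.builtin.lineinfile:
    path: /tmp/metadata_demo.yml
    regexp: '^installed_version:(.*)$'
    line: 'installed_version: "1.7"'
    insertafter: "EOF"
    state: present
    create: true   # create the scratch file on first run

The first run appends the line; every later run rewrites the matched line, so the file never accumulates duplicates.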
+--- + +meta_path: "/opt/omnia/.data/metadata.yml" + +user_msg_upgrade: "{{ unsupported_os_user_msg_upgrade if ansible_distribution_version in ['8.6', '8.7'] else supported_os_user_msg_upgrade }}" + +supported_os_user_msg_upgrade: | + "************************************************************************************** + upgrade_oim.yml completed SUCCESSFULLY. + The Omnia Infrastructure Manager has been upgraded to Omnia {{ upgrade_omnia_version }} + + This has not upgraded any software versions on the compute nodes. + *************************************************************************************** + + Note: + 1. To execute Omnia 1.7 playbooks, activate omnia17_venv using the command + #source /opt/omnia/omnia17_venv/bin/activate + 2. Make sure that all required entries and version details are present in input/software_config.json. + 3. After the upgrade, execute local_repo.yml for the required software versions. + 4. The below software versions are supported with omnia161_venv only: + Kubernetes - 1.26.12 + KServe - 0.11.2 + Kubeflow - 1.8.0 + For the above software versions: + 1. Activate omnia161_venv using the command #source /opt/omnia/omnia161_venv/bin/activate + 2. Replace k8s.json, kserve.json, kubeflow.json in input/config// from the Omnia 1.6.1 source" + +unsupported_os_user_msg_upgrade: | + "************************************************************************************** + upgrade_oim.yml completed SUCCESSFULLY. + The Omnia Infrastructure Manager has been upgraded to Omnia {{ upgrade_omnia_version }} + + This has not upgraded any software versions on the compute nodes. + *************************************************************************************** + + Note: + 1. RHEL/Rocky 8.6 and 8.7 are unsupported operating systems for Omnia 1.7. + 2. To execute Omnia 1.7 playbooks, activate omnia161_venv using the command + #source /opt/omnia/omnia161_venv/bin/activate + 3. Omnia 1.7 new features are not supported on RHEL/Rocky 8.6 or 8.7." diff --git a/upgrade/roles/preinstall_cluster_cleanup/tasks/cleanup_15_nfs.yml b/upgrade/roles/preinstall_cluster_cleanup/tasks/cleanup_15_nfs.yml deleted file mode 100644 index 7ae4a7ec0..000000000 --- a/upgrade/roles/preinstall_cluster_cleanup/tasks/cleanup_15_nfs.yml +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
---- - -- name: Read omnia config file - delegate_to: localhost - run_once: true - block: - - name: Read file path parameters from upgrade_config.yml - ansible.builtin.include_vars: - file: upgrade_config.yml - changed_when: false - - - name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - - name: Read the omnia config from 1.5 which is encrypted - ansible.builtin.command: cat "{{ old_input_location }}/{{ omnia_config_file_name }}" - changed_when: false - register: omnia_config_content - no_log: true - - - name: Decrypt config files - ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ old_input_location }}/{{ omnia_config_file_name }} - --vault-password-file {{ old_input_location }}/{{ omnia_15_config_vault_key }} - when: ansible_vault_search_key in omnia_config_content.stdout - changed_when: false - - - name: Fetch v1.5 omnia_config.yml - block: - - name: Fetch v1.5 omnia_config.yml - ansible.builtin.set_fact: - fetch_var: "{{ lookup('file', old_input_location + '/' + omnia_config_file_name) | from_yaml }}" - rescue: - - name: Failed to fetch v1.5 omnia_config.yml - ansible.builtin.fail: - msg: "{{ omnia_config_syntax_fail_msg }}" - - - name: Encrypt config files - ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ old_input_location }}/{{ omnia_config_file_name }} - --vault-password-file {{ old_input_location }}/{{ omnia_15_config_vault_key }} - changed_when: false - - -- name: Stop the nfs service - ansible.builtin.service: - name: nfs-server - state: stopped - enabled: false - changed_when: false - when: "'manager' in group_names" - -- name: Comment the nfs share entries in "{{ exports_file_path }}" - ansible.builtin.lineinfile: - path: "{{ exports_file_path }}" - regexp: "{{ fetch_var.omnia_usrhome_share }}" - state: absent - changed_when: false - when: "'manager' in group_names" - -- name: Unmount nfs share directory - ansible.posix.mount: - path: "{{ fetch_var.omnia_usrhome_share }}" - state: unmounted - become: true - when: "'compute' in group_names" - -- name: Remove nfs share directory - ansible.builtin.file: - path: "{{ fetch_var.omnia_usrhome_share }}" - state: absent - become: true - -- name: Read the contents of file /etc/fstab - ansible.builtin.command: cat "{{ fstab_file_path }}" - register: fstab_content - no_log: true - changed_when: false - when: "'compute' in group_names" - -- name: Comment the nfs share entry in /etc/fstab file - ansible.builtin.lineinfile: - path: "{{ fstab_file_path }}" - regexp: "{{ fetch_var.omnia_usrhome_share }}" - state: absent - changed_when: false - when: "'compute' in group_names" diff --git a/upgrade/roles/preinstall_cluster_cleanup/vars/main.yml b/upgrade/roles/preinstall_cluster_cleanup/vars/main.yml deleted file mode 100644 index 5a310f5be..000000000 --- a/upgrade/roles/preinstall_cluster_cleanup/vars/main.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. ---- - -omnia_config_file_path: "{{ role_path }}/../../../input/omnia_config.yml" -omnia_config_vault_key: "{{ role_path }}/../../../input/.omnia_vault_key" -binary_files_path: "/opt/omnia/telemetry/dist/omnia_telemetry" -ansible_vault_search_key: "$ANSIBLE_VAULT;" - -# variables for the 1.5 nfs cleanup -omnia_config_file_name: "omnia_config.yml" -omnia_15_config_vault_key: ".omnia_vault_key" -exports_file_path: "/etc/exports" -fstab_file_path: "/etc/fstab" -spool_clustername_file: "/var/spool/clustername" -omnia_config_syntax_fail_msg: "Failed.Syntax errors present in the v1.5 omnia_config.yml.File Path is: {{ old_input_location }}/{{ omnia_config_file_name }}.Re-run the playbook after fixing syntax error." # noqa: yaml[line-length] diff --git a/upgrade/roles/prepare_cp_for_upgrade/files/doxcat.patch b/upgrade/roles/prepare_cp_for_upgrade/files/doxcat.patch deleted file mode 100644 index 25e30aed9..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/files/doxcat.patch +++ /dev/null @@ -1,17 +0,0 @@ ---- doxcat 2023-09-11 09:55:53.739008109 +0530 -+++ updatedoxcat 2023-09-11 09:55:36.110898984 +0530 -@@ -423,12 +423,14 @@ - if [ $IPMI_SUPPORT -ne 0 ]; then - # Set boot from network will cause OpenPOWER server wait at petitboot menu, so do nothing here - if uname -m | grep x86_64; then -+ ipmitool raw 0x00 0x08 0x03 0x08 - ipmitool chassis bootdev pxe - fi - fi - reboot -f - elif [ "$dest" = "install" -o "$dest" = "netboot" ]; then - if [ $IPMI_SUPPORT -ne 0 ]; then -+ ipmitool raw 0x00 0x08 0x03 0x08 - ipmitool chassis bootdev pxe - fi - logger -s -t $log_label -p local4.info "Reboot..." diff --git a/upgrade/roles/prepare_cp_for_upgrade/files/encrypt_pwd.py b/upgrade/roles/prepare_cp_for_upgrade/files/encrypt_pwd.py deleted file mode 100644 index 067352707..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/files/encrypt_pwd.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env python3 - -''' - This module contains tasks required for database update - The query should be created along with timestamp before updating - the database. 
-''' -import sys -from cryptography.fernet import Fernet - -db_password = sys.argv[1] - -def encrypt_config_file(): - ''' - This module encrypts the config file - ''' - key = Fernet.generate_key() - with open('/opt/omnia/.postgres/.postgres_pass.key', 'wb') as filekey: - filekey.write(key) - - with open('/opt/omnia/.postgres/.postgres_pass.key', 'rb') as filekey: - key = filekey.read() - fernet = Fernet(key) - db_password_bytes = db_password.encode() - encrypted = fernet.encrypt(db_password_bytes) - - with open('/opt/omnia/.postgres/.encrypted_pwd', 'wb') as encrypted_file: - encrypted_file.write(encrypted) - -def main(): - ''' - This module initiates config encryption - ''' - encrypt_config_file() - -if __name__ == '__main__': - main() diff --git a/upgrade/roles/prepare_cp_for_upgrade/files/postgresql.conf b/upgrade/roles/prepare_cp_for_upgrade/files/postgresql.conf deleted file mode 100644 index f6387047d..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/files/postgresql.conf +++ /dev/null @@ -1,2 +0,0 @@ -[Unit] -After=network.target network-online.target diff --git a/upgrade/roles/prepare_cp_for_upgrade/files/xcat-cmdline.patch b/upgrade/roles/prepare_cp_for_upgrade/files/xcat-cmdline.patch deleted file mode 100644 index 529130600..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/files/xcat-cmdline.patch +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/xCAT-genesis-builder/xcat-cmdline.sh b/xCAT-genesis-builder/xcat-cmdline.sh -index 7c7eb7fb2..15ead2175 100755 ---- a/xCAT-genesis-builder/xcat-cmdline.sh -+++ b/xCAT-genesis-builder/xcat-cmdline.sh -@@ -75,6 +75,14 @@ if [[ ${ARCH} =~ ppc64 ]]; then - done - # wait 2+number_of_nics seconds for all the LINKed NICs to be UP - sleep $waittime -+elif [[ ${ARCH} =~ x86_64 ]]; then -+ # load all network driver modules listed in /lib/modules//modules.dep file -+ KERVER=`uname -r` -+ for line in `cat /lib/modules/$KERVER/modules.dep |grep -vE 'tunnel|ieee|ifb|bond|dummy|fjes|hv_netvsc|ntb_netdev|xen-netfront|hdlc_fr|dlci'| awk -F: '{print \$1}' | sed -e "s/\(.*\)\.ko.*/\1/"`; do -+ if [[ $line =~ "kernel/drivers/net" ]]; then -+ modprobe `basename $line` -+ fi -+ done - fi - - while :; do screen -dr doxcat || screen -S doxcat -L -ln doxcat; done diff --git a/upgrade/roles/prepare_cp_for_upgrade/files/xcat-genesis-base-x86_64.tar.gz b/upgrade/roles/prepare_cp_for_upgrade/files/xcat-genesis-base-x86_64.tar.gz deleted file mode 100644 index e08a28662..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/files/xcat-genesis-base-x86_64.tar.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd5cdcba0e1d729ed0ba1657ab2d8ced5ce41dd4ef88e89a28df0f6b55376e8c -size 142607880 diff --git a/upgrade/roles/prepare_cp_for_upgrade/tasks/configure_synclist_files.yml b/upgrade/roles/prepare_cp_for_upgrade/tasks/configure_synclist_files.yml deleted file mode 100644 index 5abd93fa7..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/tasks/configure_synclist_files.yml +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Read local repo config - ansible.builtin.include_vars: - file: "{{ local_repo_config_file }}" - -- name: Configure synclist files - ansible.builtin.include_role: - name: "{{ role_path }}/../../../discovery/roles/configure_synclist" # noqa:role-name[path] - -- name: Running xCAT updatenode - block: - - name: Get booted nodes details from DB - community.postgresql.postgresql_query: - db: omniadb - login_user: postgres - query: SELECT node FROM cluster.nodeinfo where (status='booted') AND (node!= 'control_plane'); - login_password: "{{ postgresdb_password }}" - become_user: postgres - register: node_query_status - no_log: true - - - name: List booted nodes - ansible.builtin.set_fact: - booted_nodes: "{{ node_query_status.query_result | map(attribute='node') | list }}" - # no_of_booted_nodes: "{{ node_query_status.rowcount }}" - - - name: Concatenate booted nodes - ansible.builtin.set_fact: - booted_nodes_str: "{{ booted_nodes | default([]) | join(',') if booted_nodes | length > 1 else booted_nodes[0] }}" - when: booted_nodes | length > 0 - - - name: Updatenode - ansible.builtin.command: updatenode "{{ booted_nodes_str }}" -F - when: booted_nodes | length > 0 - changed_when: true diff --git a/upgrade/roles/prepare_cp_for_upgrade/tasks/main.yml b/upgrade/roles/prepare_cp_for_upgrade/tasks/main.yml deleted file mode 100644 index 7265e798d..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/tasks/main.yml +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Fetching cluster os - ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" - -- name: Setting admin nic details - ansible.builtin.include_tasks: set_admin_nic_vars.yml - -- name: Include local repo related config files - ansible.builtin.include_role: - name: "{{ role_path }}/../../../discovery/roles/discovery_validations/common" # noqa:role-name[path] - tasks_from: include_local_repo_config.yml - -- name: Invoking xCAT operations - environment: - XCATROOT: "{{ xcat_root_env }}" - PATH: "{{ ansible_env.PATH }}:{{ xcat_path_env }}" - MANPATH: "{{ xcat_manpath_env }}" - PERL_BADLANG: "{{ perl_badlang_env }}" - block: - - name: Configure postgres - ansible.builtin.include_tasks: configure_postgres.yml - - - name: Fetch os image - ansible.builtin.include_role: - name: "{{ role_path }}/../../../discovery/roles/configure_xcat/common" # noqa:role-name[path] - tasks_from: fetch_osimage.yml - - - name: Configure synclist files - ansible.builtin.include_tasks: configure_synclist_files.yml - -- name: Initiate monitor thread - ansible.builtin.include_role: - name: "{{ role_path }}/../../../discovery/roles/monitor_thread" # noqa:role-name[path] - tasks_from: initiate_monitor_status.yml - -- name: Restart omnia service - ansible.builtin.systemd: - name: omnia - state: restarted - enabled: true diff --git a/upgrade/roles/prepare_cp_for_upgrade/tasks/set_admin_nic_vars.yml b/upgrade/roles/prepare_cp_for_upgrade/tasks/set_admin_nic_vars.yml deleted file mode 100644 index 51059ac78..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/tasks/set_admin_nic_vars.yml +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Fetch the network interfaces in UP state in the system - ansible.builtin.shell: | - set -o pipefail - /usr/sbin/ip a | awk '/state UP/{print $2}' - register: nic_addr_up - changed_when: false - -- name: Include network_spec.yml - block: - - name: Include network_spec file - ansible.builtin.include_vars: "{{ network_spec }}" - register: include_network_spec - no_log: true - rescue: - - name: Failed to include network_spec.yml - ansible.builtin.fail: - msg: "{{ network_spec_syntax_fail_msg }} Error: {{ include_network_spec.message }}" - -- name: Parse network_spec data - ansible.builtin.set_fact: - network_data: "{{ network_data | default({}) | combine({item.key: item.value}) }}" - with_dict: "{{ Networks }}" - -- name: Check admin network details are valid - block: - - name: Execute Python script to validate network address - ansible.builtin.command: "{{ python_version }} {{ network_address_script }} admin_network" - register: script_output - changed_when: false - environment: - net_data: "{{ network_data | to_json }}" - rescue: - - name: Failed, Invalid admin network details - ansible.builtin.fail: - msg: "{{ fail_msg_admin_nic_details }} {{ fail_msg_admin_nic_ip_details }}" - -- name: Validate admin nic ip status - ansible.builtin.assert: - that: - - script_output.stdout | length > 0 - - script_output.rc == 0 - fail_msg: "{{ fail_msg_admin_nic_details }} {{ fail_msg_admin_nic_ip_details }}" - -- name: Set admin_nic_ip - ansible.builtin.set_fact: - admin_nic_ip: "{{ script_output.stdout }}" - admin_nic: "{{ network_data.admin_network.nic_name }}" - -- name: Assert admin nic value - ansible.builtin.assert: - that: - - admin_nic in nic_addr_up.stdout - success_msg: " Admin {{ success_msg_nic }}" - fail_msg: "{{ validate_nic_status }} for admin nic" diff --git a/upgrade/roles/prepare_cp_for_upgrade/vars/main.yml b/upgrade/roles/prepare_cp_for_upgrade/vars/main.yml deleted file mode 100644 index 8d6f360de..000000000 --- a/upgrade/roles/prepare_cp_for_upgrade/vars/main.yml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Usage: set_admin_nic_vars.yml -python_version: python3.9 -network_address_script: "{{ role_path }}/../../../discovery/roles/discovery_validations/common/files/validate_network_address.py" -network_spec: "{{ role_path }}/../../../input/network_spec.yml" - -success_msg_nic: "nic successfully validated" -network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml. Fix errors and re-run playbook again." -fail_msg_admin_nic_details: "Failed. Invalid admin_nic details (nic_name, netmask_bits, static_range or dynamic_range) in network_spec file." -fail_msg_admin_nic_ip_details: "Ensure admin nic is configured with ip address." -validate_nic_status: "Failed, please check the network interface status should be UP" -admin_nic_netmask_fail_msg: "Failed, Admin nic netmask should be same as netmask in network_spec file." 
- -# Usage: configure_synclist_files.yml -local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml" - -# Usage: configure_postgres.yml -xcat_services: - - xcatd - - postgresql - - firewalld - - snmpd -postgresql_conf_dir: /etc/systemd/system/postgresql.service.d -postgresql_conf_dest: "{{ postgresql_conf_dir }}/postgresql.conf" - -pg_hba_conf_path: /var/lib/pgsql/data/pg_hba.conf -postgres_file_path: "/opt/omnia/.postgres/" -directory_permissions: "0600" -utility_path: "{{ role_path }}/files/encrypt_pwd.py" -encrypted_file_path: "/opt/omnia/.postgres/.encrypted_pwd" - -postgresql_conf_src: "{{ role_path }}/files/postgresql.conf" -pgsqlsetup_path: /opt/xcat/bin/pgsqlsetup -file_permission: "0755" - -# Usage: main.yml -xcat_root_env: "/opt/xcat" -xcat_path_env: "/opt/xcat/bin:/opt/xcat/sbin:/opt/xcat/share/xcat/tools" -xcat_manpath_env: "/opt/xcat/share/man:$MANPATH" -perl_badlang_env: 0 diff --git a/utils/roles/control_plane_cleanup/tasks/clean_telemetry_setup.yml b/upgrade/roles/restore_oim/tasks/cleanup_k8s.yml similarity index 61% rename from utils/roles/control_plane_cleanup/tasks/clean_telemetry_setup.yml rename to upgrade/roles/restore_oim/tasks/cleanup_k8s.yml index 7f5fea312..81be1800b 100644 --- a/utils/roles/control_plane_cleanup/tasks/clean_telemetry_setup.yml +++ b/upgrade/roles/restore_oim/tasks/cleanup_k8s.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -- name: Set control_plane_os + +- name: Set oim_os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" - name: Reset kubeadm ansible.builtin.command: kubeadm reset --cri-socket={{ crio_socket }} -f @@ -33,6 +34,18 @@ state: absent failed_when: false +- name: Stop etcd service + ansible.builtin.service: + name: etcd + state: stopped + failed_when: false + +- name: Remove etcd service + ansible.builtin.file: + path: "/etc/systemd/system/etcd.service" + state: absent + failed_when: false + - name: Check if crictl is present ansible.builtin.stat: path: "{{ bin_dir }}/crictl" @@ -97,26 +110,6 @@ failed_when: false changed_when: true -- name: Get container ID for buildkitd - ansible.builtin.shell: > - set -o pipefail - && nerdctl ps -q -a -f name=buildkitd | head -n 1 - register: buildkit_container_id - changed_when: false - failed_when: false - -- name: Stop buildkitd container - ansible.builtin.command: nerdctl stop {{ buildkit_container_id.stdout }} - when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' - changed_when: false - failed_when: false - -- name: Remove buildkitd container - ansible.builtin.command: nerdctl rm {{ buildkit_container_id.stdout }} - when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' - changed_when: false - failed_when: false - - name: Remove kubernetes folder ansible.builtin.file: path: "{{ kube_folder_path }}" @@ -128,79 +121,51 @@ state: absent failed_when: false -- name: Remove /etc/exports entries - ansible.builtin.lineinfile: - path: "{{ exports_path }}" - regexp: "{{ item }}" - state: absent - with_items: "{{ exports_regexp }}" - -- name: Exporting the shared directories - ansible.builtin.command: /usr/sbin/exportfs -r - changed_when: true - failed_when: false - -- name: Include telemetry_config file - ansible.builtin.include_tasks: include_telemetry_config.yml - -- name: Remove grafana persistent data - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ grafana_folders }}" - -- name: Remove telemetry github data - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ telemetry_folders }}" - -- name: Remove telemetry database persistent data - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ database_folders }}" - tags: database - - name: Remove metallb data ansible.builtin.file: path: "{{ item }}" state: absent with_items: "{{ metallb_files }}" -- name: Stop docker service - ansible.builtin.service: - name: docker.service - state: stopped - enabled: false - failed_when: false - - name: Reload systemd ansible.builtin.systemd: daemon_reload: true changed_when: false failed_when: false -- name: Remove docker packages for RHEL/Rocky - ansible.builtin.command: dnf remove -y {{ docker_packages }} - changed_when: true - failed_when: false - when: control_plane_os in control_plane_os_redhat or - control_plane_os in control_plane_os_rocky - -- name: Remove docker packages for Ubuntu - ansible.builtin.command: apt remove -y {{ docker_packages }} - changed_when: true - failed_when: false - when: control_plane_os in control_plane_os_ubuntu - -- name: Remove docker files +- name: Remove k8s bin file ansible.builtin.file: path: "{{ item }}" state: absent - with_items: "{{ docker_del_files }}" + with_items: "{{ k8s_bin_files }}" + failed_when: false -- name: Remove docker.list file +- name: Remove k8s config file ansible.builtin.file: - path: "{{ 
docker_list_path }}" + path: "{{ item }}" state: absent + with_items: "{{ k8s_del_files }}" + failed_when: false + +- name: Free Omnia Infrastructure Manager k8s by killing related processes + block: + - name: Check for Kubernetes-related processes occupying ports + ansible.builtin.shell: > + set -o pipefail && \ + netstat -tulpn | grep 'kube' + register: kube_processes + changed_when: false + failed_when: false + + - name: Extract PIDs of Kubernetes processes using ports + ansible.builtin.set_fact: + kube_pids: "{{ kube_processes.stdout_lines | map('regex_search', '\\s(\\d+)/', '\\1') | flatten }}" + when: kube_processes.stdout_lines | default("", true) | length > 1 + failed_when: false + + - name: Kill Kubernetes processes to free the ports + ansible.builtin.command: "kill -9 {{ item }}" + with_items: "{{ kube_pids }}" + when: kube_pids | default("", true) | length > 1 + changed_when: false + failed_when: false diff --git a/upgrade/roles/restore_oim/tasks/etcd.yml b/upgrade/roles/restore_oim/tasks/etcd.yml new file mode 100644 index 000000000..2a9194ade --- /dev/null +++ b/upgrade/roles/restore_oim/tasks/etcd.yml @@ -0,0 +1,114 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Wait for all pods to be Running after deploying k8s + ansible.builtin.shell: | + set -o pipefail; + kubectl get pods --all-namespaces | grep -v "Running" | awk 'NR>1 {print $2}' | wc -l + register: pod_count_before_restore + until: pod_count_before_restore.stdout == "0" + changed_when: false + retries: 30 + delay: 10 + +- name: Stop etcd service + ansible.builtin.systemd: + name: etcd + state: stopped + +- name: Stop kubelet service + ansible.builtin.systemd: + name: kubelet + state: stopped + +- name: Extract the last directory name from the mount location + ansible.builtin.set_fact: + mount_location_last_part: "{{ mount_location.rstrip('/') | basename }}" + +- name: Set base path from mount_location + ansible.builtin.set_fact: + base_dir_path: "{{ mount_location.rstrip('/') | dirname }}" + +- name: Delete existing mount_location + ansible.builtin.file: + state: absent + path: "{{ mount_location }}" + +- name: Copy k8s_backup_location to mount location data folder + ansible.builtin.copy: + src: "{{ k8s_backup_location }}/{{ mount_location_last_part }}" + dest: "{{ base_dir_path }}" + mode: "{{ directory_mode }}" + +- name: Delete etcd restore data dir if present already + ansible.builtin.file: + state: absent + path: "{{ etcd_restore_data_dir }}" + +- name: Restore etcd snapshot + ansible.builtin.command: + cmd: "etcdctl snapshot restore {{ snapshot_db_name }} --data-dir={{ etcd_restore_data_dir }}" + changed_when: false + +- name: Update etcd data dir in "{{ etcd_env_file }}" + ansible.builtin.replace: + path: "{{ etcd_env_file }}" + regexp: '^ETCD_DATA_DIR=.*' + replace: "ETCD_DATA_DIR={{ etcd_restore_data_dir }}" + +- name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + +- name: Start etcd service + 
ansible.builtin.systemd:
+    name: etcd
+    state: started
+    enabled: true
+
+- name: Start kubelet service
+  ansible.builtin.systemd:
+    name: kubelet
+    state: started
+    enabled: true
+
+- name: Please wait, Kubernetes etcd restoration is in progress
+  ansible.builtin.wait_for:
+    timeout: "{{ etcd_wait_time }}"
+
+- name: Wait for all pods to be Running after restoring etcd
+  ansible.builtin.shell: |
+    set -o pipefail;
+    kubectl get pods --all-namespaces | grep -v "Running" | awk 'NR>1 {print $2}' | wc -l
+  register: pod_count_after_restore
+  until: pod_count_after_restore.stdout == "0"
+  changed_when: false
+  retries: 30
+  delay: 10
+
+- name: Get kube-proxy pod name
+  ansible.builtin.command: 'kubectl get pod -n "{{ kube_system_namespace }}" -l k8s-app=kube-proxy -o jsonpath="{.items[0].metadata.name}"'
+  register: kube_proxy_pod_name
+  changed_when: false
+  failed_when: false
+
+- name: Delete kube-proxy pod
+  ansible.builtin.command: 'kubectl delete pod "{{ kube_proxy_pod_name.stdout }}" -n "{{ kube_system_namespace }}"'
+  changed_when: false
+  failed_when: false
+
+- name: Wait for kube-proxy pod to come to ready state
+  ansible.builtin.command: 'kubectl wait --for=condition=ready --timeout=11m -n "{{ kube_system_namespace }}" pod -l k8s-app=kube-proxy'
+  changed_when: false
diff --git a/upgrade/roles/telemetry_uninstall/tasks/include_telemetry_config.yml b/upgrade/roles/restore_oim/tasks/include_telemetry_config.yml
similarity index 97%
rename from upgrade/roles/telemetry_uninstall/tasks/include_telemetry_config.yml
rename to upgrade/roles/restore_oim/tasks/include_telemetry_config.yml
index 4e42a6105..e1b809bb9 100644
--- a/upgrade/roles/telemetry_uninstall/tasks/include_telemetry_config.yml
+++ b/upgrade/roles/restore_oim/tasks/include_telemetry_config.yml
@@ -1,4 +1,4 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+
 - name: Check telemetry_config.yml file is encrypted
   ansible.builtin.command: cat {{ telemetry_config_file }}
   changed_when: false
diff --git a/upgrade/roles/restore_oim/tasks/main.yml b/upgrade/roles/restore_oim/tasks/main.yml
new file mode 100644
index 000000000..687353022
--- /dev/null
+++ b/upgrade/roles/restore_oim/tasks/main.yml
@@ -0,0 +1,90 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
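Between the etcd restore above and the pod checks that follow, the restored member can also be probed directly. A minimal sketch, assuming etcdctl (v3 API) is on the PATH and answers on the default local endpoint -- both assumptions, not guaranteed by this role:

```yaml
# Illustrative only: confirm the restored etcd member passes a health check.
- name: Verify etcd is healthy after the restore
  ansible.builtin.command: etcdctl endpoint health
  register: etcd_health
  changed_when: false
  failed_when: "'is healthy' not in etcd_health.stdout + etcd_health.stderr"
```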
+---
+
+- name: Validate venv
+  ansible.builtin.include_tasks: validate_venv.yml
+
+- name: Validate upgrade_config.yml
+  ansible.builtin.include_tasks: validate_upgrade_config.yml
+
+- name: Precheck for restore
+  ansible.builtin.include_tasks: restore_precheck.yml
+
+- name: Verify prerequisites for K8s
+  ansible.builtin.include_tasks: prerequiste.yml
+
+- name: Include telemetry config vars
+  ansible.builtin.include_tasks: include_telemetry_config.yml
+
+- name: Stop omnia_telemetry.service
+  ansible.builtin.include_role:
+    name: omnia_telemetry
+    tasks_from: stop_omnia_telemetry.yml
+
+- name: Clean up existing K8s cluster
+  ansible.builtin.include_tasks: cleanup_k8s.yml
+
+- name: Restore nerdctl
+  ansible.builtin.include_tasks: restore_nerdctl.yml
+
+- name: Configure container runtime
+  ansible.builtin.include_role:
+    name: "{{ playbook_dir }}/../telemetry/roles/orchestrator" # noqa:role-name[path]
+    tasks_from: container_runtime.yml
+
+- name: Deploy K8s
+  ansible.builtin.include_role:
+    name: "{{ playbook_dir }}/../telemetry/roles/orchestrator" # noqa:role-name[path]
+    tasks_from: deploy_k8s.yml
+  vars:
+    software_config_file: "{{ installed_omnia_path }}/input/software_config.json"
+
+- name: Restore etcd
+  ansible.builtin.include_tasks: etcd.yml
+  when: check_etcd_backup.stat.exists
+
+- name: Check for pod presence after restoring etcd
+  ansible.builtin.include_tasks: pod_check.yml
+
+- name: Restore timescaledb and idrac telemetry pods
+  ansible.builtin.include_tasks: timescaledb.yml
+  when: timescale_pod_status_flag or idrac_telemetry_pod_status_flag
+
+- name: Restore and start omnia_telemetry.service
+  ansible.builtin.include_role:
+    name: omnia_telemetry
+    tasks_from: replace_and_start.yml
+    apply:
+      vars:
+        binary_files_path: "/opt/omnia/telemetry/dist/omnia_telemetry"
+
+- name: Restore omnia service
+  ansible.builtin.include_role:
+    name: "{{ installed_omnia_path }}/discovery/roles/monitor_thread" # noqa: role-name[path]
+    tasks_from: initiate_monitor_status.yml
+
+- name: Restore omnia_inventory files
+  ansible.builtin.copy:
+    src: "{{ backup_location }}/omnia_inventory/"
+    dest: "{{ omnia_inv_path }}"
+    mode: "{{ file_permission }}"
+  when: omnia_inv_stat.stat.exists
+
+- name: Set installed_version as 1.6.1
+  ansible.builtin.lineinfile:
+    path: "{{ meta_path }}"
+    regexp: '^installed_version:(.*)$'
+    state: present
+    line: 'installed_version: "1.6.1"'
diff --git a/upgrade/roles/restore_oim/tasks/pod_check.yml b/upgrade/roles/restore_oim/tasks/pod_check.yml
new file mode 100644
index 000000000..13b4cf29e
--- /dev/null
+++ b/upgrade/roles/restore_oim/tasks/pod_check.yml
@@ -0,0 +1,73 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
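For reference, after the final lineinfile task above the metadata file at /opt/omnia/.data/metadata.yml would carry the pinned version; roughly as below (other keys may exist alongside it):

```yaml
# /opt/omnia/.data/metadata.yml -- illustrative contents after a restore
installed_version: "1.6.1"
```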
+---
+
+- name: Check timescaledb pod running status
+  block:
+    - name: Get timescaledb pod name
+      ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" -l app="{{ timescaledb_k8s_name }}" -o jsonpath="{.items[0].metadata.name}"
+      register: timescaledb_pod_name
+      changed_when: false
+      failed_when: false
+
+    - name: Check if timescaledb pod exists
+      ansible.builtin.command: kubectl get pod "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" --no-headers
+      register: timescaledb_pod_check
+      changed_when: false
+      ignore_errors: true
+      no_log: true
+
+    - name: Get timescaledb pod status
+      ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" "{{ timescaledb_pod_name.stdout }}" -o jsonpath='{.status.phase}'
+      register: timescaledb_pod_status
+      when: timescaledb_pod_check.rc == 0
+      failed_when: false
+      changed_when: false
+
+- name: Check iDRAC telemetry pod running status
+  block:
+    - name: Get iDRAC telemetry pod name
+      ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" -l app="{{ idrac_k8s_name }}" -o jsonpath="{.items[0].metadata.name}"
+      register: idrac_telemetry_pod_name
+      changed_when: false
+      failed_when: false
+
+    - name: Check if iDRAC telemetry pod exists
+      ansible.builtin.command: kubectl get pod "{{ idrac_telemetry_pod_name.stdout }}" -n "{{ telemetry_namespace }}" --no-headers
+      register: idrac_telemetry_pod_check
+      changed_when: false
+      ignore_errors: true
+      no_log: true
+
+    - name: Get iDRAC telemetry pod status
+      ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" "{{ idrac_telemetry_pod_name.stdout }}" -o jsonpath='{.status.phase}'
+      register: idrac_telemetry_pod_status
+      when: idrac_telemetry_pod_check.rc == 0
+      failed_when: false
+      changed_when: false
+
+- name: Set pod status flag defaults
+  ansible.builtin.set_fact:
+    timescale_pod_status_flag: false
+    idrac_telemetry_pod_status_flag: false
+
+- name: Set fact timescale_pod_status_flag
+  ansible.builtin.set_fact:
+    timescale_pod_status_flag: true
+  when: timescaledb_pod_status.stdout is defined and timescaledb_pod_status.stdout == 'Running'
+
+- name: Set fact idrac_telemetry_pod_status_flag
+  ansible.builtin.set_fact:
+    idrac_telemetry_pod_status_flag: true
+  when: idrac_telemetry_pod_status.stdout is defined and idrac_telemetry_pod_status.stdout == 'Running'
diff --git a/upgrade/roles/restore_oim/tasks/prerequiste.yml b/upgrade/roles/restore_oim/tasks/prerequiste.yml
new file mode 100644
index 000000000..85166dd66
--- /dev/null
+++ b/upgrade/roles/restore_oim/tasks/prerequiste.yml
@@ -0,0 +1,65 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
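Because timescale_pod_status_flag and idrac_telemetry_pod_status_flag only exist once pod_check.yml has run, callers can additionally default them when consuming. A defensive sketch of the gating condition used in main.yml (a belt-and-braces variant, not the role's exact code):

```yaml
# Defensive variant: never trips on an undefined flag.
- name: Restore timescaledb and idrac telemetry pods
  ansible.builtin.include_tasks: timescaledb.yml
  when: (timescale_pod_status_flag | default(false)) or (idrac_telemetry_pod_status_flag | default(false))
```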
+--- + +- name: Read software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_file }}" + name: software_config + +- name: Get cluster_os_type from software_config.json + ansible.builtin.set_fact: + cluster_os_type: "{{ software_config['cluster_os_type'] }}" + +- name: Get cluster_os_version from software_config.json + ansible.builtin.set_fact: + cluster_os_version: "{{ software_config['cluster_os_version'] }}" + +- name: Load telemetry.json + ansible.builtin.set_fact: + telemetry_json: "{{ lookup('file', telemetry_json_file) | from_json }}" + +- name: Load the vars from telemetry file + ansible.builtin.set_fact: + telemetry_package_map: "{{ telemetry_package_map | default({}) | combine({((item.package).rsplit('-', 1)[0]): item.package}) }}" + loop: "{{ telemetry_json['telemetry']['cluster'] }}" + when: (item.type == "tarball" or item.type == "git") + +- name: Extract k8s version + ansible.builtin.set_fact: + k8s_version: "{{ (telemetry_package_map['kubectl']).rsplit('-', 1)[1] | default('1.26.12') }}" + +- name: Gather all IP addresses + ansible.builtin.command: ip -4 addr show + register: ip_output + changed_when: false + +- name: Read Omnia Infrastructure Manager hostname + ansible.builtin.command: hostname + changed_when: false + register: hostname_output + +- name: Read Omnia Infrastructure Manager domain name + ansible.builtin.command: hostname -d + changed_when: false + register: domain_name_output + +- name: Set oim details + ansible.builtin.set_fact: + oim_hostname: "{{ hostname_output.stdout }}" + oim_domain_name: "{{ domain_name_output.stdout }}" + oim_ip_addresses: "{{ ip_output.stdout | regex_findall('inet\\s([0-9.]+)') }}" + +- name: Include local_repo_access.yml + ansible.builtin.include_vars: "{{ local_repo_access_path }}" diff --git a/upgrade/roles/restore_oim/tasks/restore_nerdctl.yml b/upgrade/roles/restore_oim/tasks/restore_nerdctl.yml new file mode 100644 index 000000000..c3d1e97ce --- /dev/null +++ b/upgrade/roles/restore_oim/tasks/restore_nerdctl.yml @@ -0,0 +1,78 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
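To make the rsplit parsing above concrete: for a telemetry.json tarball entry whose package field were, say, kubectl-1.26.12 (a hypothetical value), the map key and k8s_version would split out as shown below.

```yaml
# Hypothetical illustration of the package-name split.
- name: Illustrate rsplit on a package name
  ansible.builtin.debug:
    msg:
      - "{{ 'kubectl-1.26.12'.rsplit('-', 1)[0] }}"  # -> kubectl   (telemetry_package_map key)
      - "{{ 'kubectl-1.26.12'.rsplit('-', 1)[1] }}"  # -> 1.26.12   (k8s_version)
```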
+--- + +- name: Get nerdctl version + ansible.builtin.command: nerdctl --version + register: nerdctl_version + changed_when: false + +- name: Restore nerdctl + when: nerdctl_version is defined and '1.5.0' not in nerdctl_version.stdout + block: + - name: Stop all running containers + ansible.builtin.shell: | + for container_id in $(nerdctl ps -q); do + nerdctl stop $container_id + done + args: + executable: /bin/bash + changed_when: false + + - name: Remove old nerdctl binary + ansible.builtin.file: + path: "{{ item }}" + state: absent + changed_when: false + with_items: "{{ nerdctl.binary_files_path }}" + + - name: Remove nerdctl binary from tmp directory + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_fileglob: + - "{{ temp_download_dir }}/*nerdctl*" + + - name: Create nerdctl temp directory + ansible.builtin.file: + path: "{{ nerdctl_directory }}" + state: directory + mode: "{{ directory_permissions }}" + + - name: Download nerdctl archive + ansible.builtin.get_url: + url: "{{ nerdctl.url }}" + dest: "{{ nerdctl.archive_dest }}" + mode: "{{ file_permission }}" + register: download_nerdctl + until: download_nerdctl is not failed + retries: "{{ max_retries }}" + + - name: Extract nerdctl archive + ansible.builtin.unarchive: + src: "{{ nerdctl.archive_dest }}" + dest: "{{ nerdctl_directory }}" + mode: "{{ file_permission }}" + + - name: Make nerdctl executable + ansible.builtin.file: + path: "{{ nerdctl.folder_dest }}" + mode: "{{ nerdctl.folder_permission }}" + + - name: Move nerdctl to system bin directory + ansible.builtin.copy: + src: "{{ nerdctl.folder_dest }}" + dest: "{{ item }}" + mode: preserve + with_items: "{{ nerdctl.executable_dest }}" diff --git a/upgrade/roles/restore_oim/tasks/restore_precheck.yml b/upgrade/roles/restore_oim/tasks/restore_precheck.yml new file mode 100644 index 000000000..f1138a875 --- /dev/null +++ b/upgrade/roles/restore_oim/tasks/restore_precheck.yml @@ -0,0 +1,55 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
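Once the binary has been copied into place, re-running the version probe from the top of this file should report the pinned release. A small verification sketch (not part of the role):

```yaml
# Illustrative check that the replacement took effect.
- name: Confirm the replaced nerdctl binary reports 1.5.0
  ansible.builtin.command: nerdctl --version
  register: new_nerdctl_version
  changed_when: false
  failed_when: "'1.5.0' not in new_nerdctl_version.stdout"
```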
+---
+
+- name: Check if the etcd backup file exists
+  ansible.builtin.stat:
+    path: "{{ snapshot_db_name }}"
+  register: check_etcd_backup
+
+- name: Check if the k8s backup tarball exists
+  ansible.builtin.stat:
+    path: "{{ k8s_backup_location_tarball }}"
+  register: check_k8s_tarball
+
+- name: Check if the mysqldb backup file exists
+  ansible.builtin.stat:
+    path: "{{ k8s_backup_location }}/{{ mysqldb_local_backup_file }}"
+  register: check_mysqldb_backup_file
+
+- name: Check if the timescaledb backup file exists
+  ansible.builtin.stat:
+    path: "{{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }}"
+  register: check_timescaledb_backup_file
+
+- name: Check for k8s tarball existence
+  when: not check_etcd_backup.stat.exists
+  block:
+    - name: Fail if the etcd snapshot does not exist but the k8s tarball does
+      ansible.builtin.fail:
+        msg: "{{ k8s_tarball_file_fail_msg }}"
+      when:
+        - check_k8s_tarball.stat.exists
+
+- name: Fail if etcd backup file does not exist
+  ansible.builtin.fail:
+    msg: "{{ etcd_file_fail_msg }}"
+  when:
+    - not check_etcd_backup.stat.exists
+    - not check_k8s_tarball.stat.exists
+
+- name: Check if omnia_inventory is present
+  ansible.builtin.stat:
+    path: "{{ backup_location }}/omnia_inventory"
+  register: omnia_inv_stat
diff --git a/upgrade/roles/restore_oim/tasks/timescaledb.yml b/upgrade/roles/restore_oim/tasks/timescaledb.yml
new file mode 100644
index 000000000..21749e824
--- /dev/null
+++ b/upgrade/roles/restore_oim/tasks/timescaledb.yml
@@ -0,0 +1,130 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
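Putting the prechecks together, the backup location is expected to look roughly like the sketch below (paths assembled from the vars this role defines later; the omnia_inventory directory is optional):

```yaml
# Expected layout under backup_location (illustrative):
#   omnia_inventory/            # restored only if present
#   k8s/
#     snapshot.db               # etcd snapshot; required unless only the tarball exists
#     mysqldb_dump.sql
#     telemetry_tsdb_dump.sql
#   k8s.tar.gz                  # must be untarred before re-running restore_oim.yml
```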
+--- + +- name: Check if the telemetry_metrics backup file exists + ansible.builtin.stat: + path: "{{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }}" + register: telemetry_backup_file + +- name: Print telemetry_backup_file exist + ansible.builtin.debug: + msg: "backup exist : {{ telemetry_backup_file.stat.exists }}" + +- name: Install psycopg2 package + ansible.builtin.pip: + name: psycopg2-binary + state: present + +- name: Get external IP of timescaledb service + ansible.builtin.command: kubectl get svc "{{ timescaledb_k8s_name }}" -n "{{ telemetry_namespace }}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}' + register: timescaledb_service_external_ip + failed_when: false + changed_when: false + when: timescale_pod_status_flag + +- name: Check and create the database schemas if backup file exist + when: + - timescale_pod_status_flag + - telemetry_backup_file.stat.exists + block: + - name: Check if telemetry_metrics database exists before restore + ansible.builtin.command: > + kubectl exec "{{ timescaledb_pod_name.stdout }}" + -n "{{ telemetry_namespace }}" + -- psql -U {{ timescaledb_user }} -tc "SELECT 1 FROM pg_database WHERE datname='{{ database_name }}';" + register: check_database + changed_when: false + + - name: Create telemetry_metrics database for restore + ansible.builtin.command: > + kubectl exec "{{ timescaledb_pod_name.stdout }}" + -n "{{ telemetry_namespace }}" + -- psql -U {{ timescaledb_user }} -c "CREATE DATABASE {{ database_name }};" + when: check_database.stdout.find('1') == -1 # Create DB only if it doesn't exist + register: create_database + changed_when: create_database.rc == 0 + + - name: Restore telemetry_metrics database from dump + ansible.builtin.shell: > + set -o pipefail && \ + kubectl exec -i "{{ timescaledb_pod_name.stdout }}" + -n "{{ telemetry_namespace }}" + -- psql -U {{ timescaledb_user }} {{ database_name }} -f "{{ timescale_telemetry_backup_file }}" + register: restore_database + changed_when: restore_database.rc == 0 + + - name: Pause for 60 seconds to complete the database restore + ansible.builtin.pause: + seconds: 60 + + - name: Verify metrics in telemetry_metrics database + when: + - timescale_pod_status_flag is true + - idrac_telemetry_pod_status_flag is true + block: + - name: Count records in timeseries_metrics table + ansible.builtin.command: kubectl exec "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" -- psql -d "postgres://{{ timescaledb_user }}:{{ timescaledb_password }}@{{ timescaledb_service_external_ip.stdout }}:5432/{{ database_name }}" -c "SELECT COUNT(*) FROM public.timeseries_metrics;" # noqa: yaml[line-length] + register: count_metrics + changed_when: false + + - name: Display timescaledb metrics count after restore + ansible.builtin.debug: + msg: "metrics count {{ count_metrics.stdout | regex_search('\\d+') }}" + when: count_metrics.stdout is defined + + - name: Display restore timescaledb status + ansible.builtin.debug: + msg: "{{ timescaledb_restore_success_msg }}" + when: count_metrics.stdout is defined and count_metrics.stdout | regex_search('\\d+') + + - name: Verify metrics in telemetry_metrics database + when: + - timescale_pod_status_flag is true + - idrac_telemetry_pod_status_flag is false + block: + - name: Count records in timeseries_metrics table + ansible.builtin.command: kubectl exec "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" -- psql -d "postgres://{{ timescaledb_user }}:{{ timescaledb_password }}@{{ timescaledb_service_external_ip.stdout }}:5432/{{ 
database_name }}" -c "SELECT COUNT(*) FROM omnia_telemetry.metrics;" # noqa: yaml[line-length]
+        register: count_metrics
+        changed_when: false
+
+      - name: Display timescaledb metrics count after restore
+        ansible.builtin.debug:
+          msg: "metrics count {{ count_metrics.stdout | regex_search('\\d+') }}"
+        when: count_metrics.stdout is defined
+
+      - name: Display restore timescaledb status
+        ansible.builtin.debug:
+          msg: "{{ timescaledb_restore_success_msg }}"
+        when: count_metrics.stdout is defined and count_metrics.stdout | regex_search('\\d+')
+
+- name: Create the database schemas if backup file does not exist
+  when:
+    - not telemetry_backup_file.stat.exists
+  block:
+    - name: Dump file does not exist; invoke shell script to create the public schema
+      ansible.builtin.command: kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" ./cmd/initialize_timescaledb.sh
+      changed_when: false
+      register: status
+      until: status is not failed
+      retries: "{{ max_retries }}"
+      delay: "{{ max_delay }}"
+
+    - name: Dump file does not exist; invoke python utility to create the omnia_telemetry schema and tables
+      ansible.builtin.command: |
+        {{ python_version }} {{ db_schema_utility }} {{ timescaledb_user }} {{ timescaledb_password }}
+        {{ timescaledb_service_external_ip.stdout }} {{ timescaledb_container_port }} {{ database_name }}
+      changed_when: false
+      no_log: true
+      when: omnia_telemetry_support
diff --git a/upgrade/roles/upgrade_omniadb/tasks/delete_and_recreate.yml b/upgrade/roles/restore_oim/tasks/validate_upgrade_config.yml
similarity index 50%
rename from upgrade/roles/upgrade_omniadb/tasks/delete_and_recreate.yml
rename to upgrade/roles/restore_oim/tasks/validate_upgrade_config.yml
index 831b6babe..a415d4cbe 100644
--- a/upgrade/roles/upgrade_omniadb/tasks/delete_and_recreate.yml
+++ b/upgrade/roles/restore_oim/tasks/validate_upgrade_config.yml
@@ -12,27 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
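For context, the dump consumed by the restore tasks above is a plain SQL file. A dump of that shape could be produced from the running pod with pg_dump; a sketch only, assuming the same pod-name and credential facts are available (this role does not itself perform the backup):

```yaml
# Illustrative backup-side counterpart of the restore above.
- name: Produce a plain-SQL dump of telemetry_metrics
  ansible.builtin.shell: >
    set -o pipefail &&
    kubectl exec -i "{{ timescaledb_pod_name.stdout }}"
    -n "{{ telemetry_namespace }}"
    -- pg_dump -U {{ timescaledb_user }} {{ database_name }}
    > {{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }}
  args:
    executable: /bin/bash
  changed_when: true
```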
--- + - name: Read file path parameters from upgrade_config.yml ansible.builtin.include_vars: file: upgrade_config.yml changed_when: false -- name: Ping old omnia database - community.postgresql.postgresql_ping: - db: omniadb - login_password: "{{ postgresdb_password }}" - register: db_ping_old +- name: Get stats of the installed omnia path + ansible.builtin.stat: + path: "{{ installed_omnia_path }}" + register: installed_path_stat + +- name: Assert installed omnia path is not empty + ansible.builtin.assert: + that: + - installed_omnia_path + - installed_path_stat.stat.isdir is defined and installed_path_stat.stat.isdir + fail_msg: "{{ valid_installed_path_msg }}" -- name: Delete old omnia database - community.postgresql.postgresql_db: - db: omniadb - login_password: "{{ postgresdb_password }}" - state: absent - when: db_ping_old.is_available +- name: Get stats of the backup location + ansible.builtin.stat: + path: "{{ backup_location }}" + register: backup_location_stat -- name: Create new omnia database - community.postgresql.postgresql_db: - db: omniadb - login_password: "{{ postgresdb_password }}" - state: present - when: db_ping_old.is_available +- name: Assert backup location is not empty + ansible.builtin.assert: + that: + - backup_location + - backup_location_stat.stat.isdir is defined and backup_location_stat.stat.isdir + fail_msg: "{{ valid_backup_location_msg }}" diff --git a/upgrade/roles/restore_oim/tasks/validate_venv.yml b/upgrade/roles/restore_oim/tasks/validate_venv.yml new file mode 100644 index 000000000..91ce350f1 --- /dev/null +++ b/upgrade/roles/restore_oim/tasks/validate_venv.yml @@ -0,0 +1,33 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check virtual ENV + ansible.builtin.set_fact: + venv_path: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}" + +- name: Determine if venv is active + ansible.builtin.set_fact: + is_venv_active: "{{ venv_path is defined and venv_path | length > 0 }}" + +- name: Fail if VIRTUAL_ENV is not set + ansible.builtin.fail: + msg: "{{ venv_active_fail_msg }}" + when: not is_venv_active + +- name: Check if venv_path is correct + ansible.builtin.assert: + that: + - venv_path == "/opt/omnia/omnia161_venv" + fail_msg: "{{ venv_fail_msg }}" diff --git a/upgrade/roles/restore_oim/vars/main.yml b/upgrade/roles/restore_oim/vars/main.yml new file mode 100644 index 000000000..bc5859ad2 --- /dev/null +++ b/upgrade/roles/restore_oim/vars/main.yml @@ -0,0 +1,144 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: include_telemetry_config.yml
+telemetry_config_file: "{{ installed_omnia_path }}/input/telemetry_config.yml"
+telemetry_vault_filename: "{{ installed_omnia_path }}/input/.telemetry_vault_key"
+telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix the errors and re-run the playbook."
+vault_file_perm: '0644'
+
+# Usage: etcd.yml
+snapshot_db_name: "{{ k8s_backup_location }}/snapshot.db"
+etcd_restore_data_dir: "/var/lib/etcd"
+k8s_backup_location: "{{ backup_location }}/k8s"
+etcd_env_file: "/etc/etcd.env"
+directory_mode: "755"
+etcd_wait_time: 180
+kube_proxy_app_name: "kube-proxy"
+kube_system_namespace: "kube-system"
+
+# Usage: prerequiste.yml
+telemetry_json_file: "{{ installed_omnia_path }}/input/config/{{ software_config.cluster_os_type }}/{{ software_config.cluster_os_version }}/telemetry.json"
+software_config_file: "{{ installed_omnia_path }}/input/software_config.json"
+local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml"
+
+# Usage: restore_precheck.yml
+k8s_backup_location_tarball: "{{ k8s_backup_location }}.tar.gz"
+k8s_tarball_file_fail_msg: 'Please untar the "{{ k8s_backup_location_tarball }}" file and re-run restore_oim.yml.'
+etcd_file_fail_msg: '"{{ k8s_backup_location }}/snapshot.db" does not exist, so the restore cannot be performed without a backup.'
+mysqldb_local_backup_file: "mysqldb_dump.sql"
+timescale_telemetry_backup_file: "telemetry_tsdb_dump.sql"
+
+# Usage: validate_venv.yml
+venv_fail_msg: "The venv_path is incorrect! Expected /opt/omnia/omnia161_venv but got {{ venv_path }}.
+Please deactivate the existing venv using the command: deactivate, activate the expected venv using: source /opt/omnia/omnia161_venv/bin/activate, and re-run the playbook."
+venv_active_fail_msg: "It seems the Python virtual environment for Omnia isn't active. Please activate it using the command: source /opt/omnia/omnia161_venv/bin/activate and re-run the playbook." # noqa: yaml[line-length]
+
+# Usage: validate_upgrade_config.yml
+valid_installed_path_msg: "Please provide a valid path for 'installed_omnia_path' in the upgrade_config.yml file
+and re-run the playbook."
+valid_backup_location_msg: "Please provide a valid path for 'backup_location' in the upgrade_config.yml file and re-run the playbook."
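The validate_upgrade_config.yml messages above imply the minimal shape of upgrade_config.yml: both keys must point at existing directories. An illustrative example (the paths are placeholders, not defaults):

```yaml
# upgrade_config.yml -- hypothetical example values
installed_omnia_path: "/root/omnia-1.6.1"   # placeholder path
backup_location: "/mnt/omnia_backup"        # placeholder path
```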
+ +# Usage:clean_k8s.yml +crio_socket: /var/run/crio/crio.sock +metallb_files: + - "/var/lib/ipaddresspool.yaml" + - "/var/lib/l2advertisement.yaml" +bin_dir: /usr/local/bin +grace_period: 0 +kube_folder_path: /root/.kube +k8s_bin_files: + - "{{ bin_dir }}/kubelet" + - "{{ bin_dir }}/kubectl" + - "{{ bin_dir }}/crictl" + - "{{ bin_dir }}/etcd" + - "{{ bin_dir }}/calicoctl" + - "{{ bin_dir }}/kubeadm" + - "{{ bin_dir }}/calicoctl.sh" + - "{{ bin_dir }}/etcdctl" + - "{{ bin_dir }}/etcdctl.sh" + - "{{ bin_dir }}/k8s-certs-renew.sh" + - "{{ bin_dir }}/helm" + - "{{ usr_bin_dir }}/kubelet" + - "{{ usr_bin_dir }}/kubectl" + - "{{ usr_bin_dir }}/crictl" + - "{{ usr_bin_dir }}/etcd" + - "{{ usr_bin_dir }}/calicoctl" + - "{{ usr_bin_dir }}/kubeadm" + - "{{ usr_bin_dir }}/calicoctl.sh" + - "{{ usr_bin_dir }}/etcdctl" + - "{{ usr_bin_dir }}/etcdctl.sh" + - "{{ usr_bin_dir }}/k8s-certs-renew.sh" + - "{{ usr_bin_dir }}/helm" +k8s_del_files: + - /usr/local/share/ca-certificates/etcd-ca.crt + - /usr/local/share/ca-certificates/kube-ca.crt + - /etc/ssl/certs/etcd-ca.pem + - /etc/ssl/certs/kube-ca.pem + - /etc/pki/ca-trust/source/anchors/etcd-ca.crt + - /etc/pki/ca-trust/source/anchors/kube-ca.crt + - /var/log/calico + - /etc/calico + - /var/lib/kubelet + - /var/lib/etcd + - /run/calico + - /etc/bash_completion.d/kubectl.sh + - /etc/modules-load.d/kubespray-br_netfilter.conf + - /usr/libexec/kubernetes + - /etc/NetworkManager/conf.d/calico.conf + - /etc/NetworkManager/conf.d/k8s.conf + - /root/.helm + - /root/.config/helm + - /root/.cache/helm + - /root/.local/share/helm + - /root/.kube +usr_bin_dir: /usr/bin + +# Usage: pod_check.yml +idrac_k8s_name: idrac-telemetry + +# Usage: timescaledb.yml +database_name: "telemetry_metrics" +dump_file: "telemetry_tsdb_dump.sql" +telemetry_namespace: "telemetry-and-visualizations" +timescaledb_k8s_name: timescaledb +timescaledb_container_port: 5432 +max_retries: 10 +max_delay: 10 +timescaledb_file_fail_msg: "Backup file {{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }} does not exist!" +timescaledb_restore_success_msg: "Timescaledb restored successfully" +python_version: "{{ ansible_python_interpreter }}" +db_schema_utility: "{{ installed_omnia_path }}/telemetry/roles/omnia_telemetry_prepare_cp/files/omnia_telemetry_schema_creation.py" + +# Usage: main.yml +omnia_inv_path: /opt/omnia/omnia_inventory/ +meta_path: "/opt/omnia/.data/metadata.yml" + +# Usage: restore_nerdctl.yml +temp_download_dir: "/tmp" +directory_permissions: "0755" +nerdctl_directory: "{{ temp_download_dir }}/nerdctl/upgrade" +file_permission: "0644" +nerdctl: + folder_dest: "{{ nerdctl_directory }}/nerdctl" + folder_permission: "+x" + url: "https://github.com/containerd/nerdctl/releases/download/v1.5.0/nerdctl-1.5.0-linux-amd64.tar.gz" + archive_dest: "{{ nerdctl_directory }}/nerdctl-1.5.0-linux-amd64.tar.gz" + binary_files_path: + - "/usr/local/bin/nerdctl" + - "/usr/bin/nerdctl" + executable_dest: + - "/usr/local/bin/" + - "/usr/bin/" diff --git a/upgrade/roles/telemetry_uninstall/tasks/telemetry_uninstall.yml b/upgrade/roles/telemetry_uninstall/tasks/telemetry_uninstall.yml deleted file mode 100644 index 88b4c1cbe..000000000 --- a/upgrade/roles/telemetry_uninstall/tasks/telemetry_uninstall.yml +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Reset kubeadm - ansible.builtin.command: kubeadm reset --cri-socket={{ crio_socket }} -f - changed_when: false - failed_when: false - -- name: Remove /etc/exports entries - ansible.builtin.lineinfile: - path: "{{ exports_path }}" - regexp: "{{ item }}" - state: absent - with_items: "{{ exports_regexp }}" - -- name: Exporting the shared directories - ansible.builtin.command: /usr/sbin/exportfs -r - changed_when: true - failed_when: false - -- name: Include telemetry_config file - ansible.builtin.include_tasks: include_telemetry_config.yml - -- name: Remove grafana github data - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ grafana_folders }}" - -- name: Remove telemetry github data - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ telemetry_folders }}" - -- name: Remove metallb data - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ metallb_files }}" diff --git a/upgrade/roles/telemetry_uninstall/vars/main.yml b/upgrade/roles/telemetry_uninstall/vars/main.yml deleted file mode 100644 index 0665873c2..000000000 --- a/upgrade/roles/telemetry_uninstall/vars/main.yml +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Usage:telemetry_uninstall.yml -exports_path: /etc/exports -exports_regexp: - - "/install" - - "/tftpboot" - - "/var/nfs_repo" -crio_socket: /var/run/crio/crio.sock -metallb_files: - - "/var/lib/ipaddresspool.yaml" - - "/var/lib/l2advertisement.yaml" -grafana_folders: - - "{{ mount_location }}github-grafana-plugins" -telemetry_folders: - - "{{ mount_location }}iDRAC-Telemetry-Reference-Tools/" - - "{{ mount_location }}iDRAC-Telemetry-Scripting/" - -# Usage:include_telemetry_config.yml -telemetry_config_file: "{{ role_path }}/../../../input/telemetry_config.yml" -telemetry_vault_filename: "{{ role_path }}/../../../input/.telemetry_vault_key" -telemetry_config_syntax_fail_msg: "Failed.Syntax errors present in telemetry_config.yml.Fix errors and re-run playbook again." -vault_file_perm: '0644' diff --git a/upgrade/roles/uninstall_k8s_cluster/tasks/uninstall_k8s.yml b/upgrade/roles/uninstall_k8s_cluster/tasks/uninstall_k8s.yml deleted file mode 100644 index e4d35919a..000000000 --- a/upgrade/roles/uninstall_k8s_cluster/tasks/uninstall_k8s.yml +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Reset kubeadm - ansible.builtin.command: "kubeadm reset -f" - changed_when: true - failed_when: false - -- name: Uninstall k8s packages - ansible.builtin.package: - name: "{{ k8s_packages }}" - state: absent - -# - name: Remove unused dependencies on ubuntu -# ansible.builtin.command: sudo apt autoremove -# become: true -# when: cluster_os|lower == "ubuntu" - -- name: Autoremove unneeded packages installed as dependencies - ansible.builtin.dnf: - autoremove: true - -- name: Remove K8s files - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ k8s_files }}" - -- name: Remove kubernetes repo - ansible.builtin.file: - path: "{{ k8s_repo_file }}" - state: absent - -- name: Clear iptables and rules - ansible.builtin.shell: - cmd: "{{ item }}" # noqa: no-changed-when command-instead-of-shell - become: true - with_items: "{{ clear_iptables_cmds }}" - -- name: Remove docker on K8s node - ansible.builtin.include_tasks: remove_docker_k8s.yml - tags: remove_docker - -- name: Clean the repos cache - ansible.builtin.command: dnf clean all - changed_when: true - -- name: Update the repos cache - ansible.builtin.command: dnf makecache - changed_when: true - -- name: Reload systemd manager - ansible.builtin.systemd: - daemon-reload: true diff --git a/upgrade/roles/uninstall_k8s_cluster/vars/main.yml b/upgrade/roles/uninstall_k8s_cluster/vars/main.yml deleted file mode 100644 index 647d41412..000000000 --- a/upgrade/roles/uninstall_k8s_cluster/vars/main.yml +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# Usage:validation.yml -kube_inv_fail_msg: "Required kubernetes groups [manager,compute] are not defined in inventory. 
Please check inventory passed is of Omnia 1.5 format" - -# Usage:uninstall_k8s.yml -crio_socket_path: "" - -k8s_repo_file: "/etc/yum.repos.d/kubernetes.repo" - -k8s_packages: - - "kubelet" - - "kubeadm" - - "kubectl" - - "kubernetes-cni" - - "kube*" - -k8s_files: - - "~/.kube" - - "/etc/cni" - - "/etc/kubernetes" - - "/etc/apparmor.d/docker" - - "/etc/systemd/system/etcd*" - - "/var/lib/dockershim" - - "/var/lib/etcd" - - "/var/lib/kubelet" - - "/var/lib/etcd2/" - - "/var/run/kubernetes" - - "/var/lib/cni/" # ref:https://github.com/dell/omnia/blob/main/utils/destroy.yml - - "/run/flannel/" # ref:https://github.com/dell/omnia/blob/main/utils/destroy.yml - - -clear_iptables_cmds: - - "iptables -F && iptables -X" # clear iptables - - "iptables -t nat -F && iptables -t nat -X" # flush and delete the NAT (Network Address Translation) table - - "iptables -t raw -F && iptables -t raw -X" # flush and remove the chains and rules in the raw table - - "iptables -t mangle -F && iptables -t mangle -X" # remove the chains and rules in the mangle table - -# Usage: remove_docker_k8s.yml -docker_packages: - - docker-ce-cli - - docker-ce - - containerd.io - -docker_repo_file: "/etc/yum.repos.d/docker-ce.repo" - -docker_del_files: - - "/var/lib/docker" - - "/var/lib/containerd" - -omnia_config_file_path: "{{ role_path }}/../../../input/omnia_config.yml" diff --git a/upgrade/roles/update_metadata/tasks/update_metadata.yml b/upgrade/roles/update_metadata/tasks/update_metadata.yml deleted file mode 100644 index fea81801e..000000000 --- a/upgrade/roles/update_metadata/tasks/update_metadata.yml +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Check metadata.yml file existence - ansible.builtin.stat: - path: "{{ metadata_yaml_file_path }}" - register: metadata_status - -- name: Create metadata.yml file if it doesn't exists - ansible.builtin.file: - path: "{{ metadata_yaml_file_path }}" - state: touch - mode: "{{ file_permissions }}" - group: root - owner: root - when: not metadata_status.stat.exists - -- name: Update or add installed_version in metadata.yml file - block: - - name: Update installed_version - ansible.builtin.lineinfile: - path: "{{ metadata_yaml_file_path }}" - regexp: '^installed_version:(.*)$' - insertafter: "EOF" - state: present - line: 'installed_version: 1.6.1' diff --git a/upgrade/roles/upgrade_discovery/tasks/main.yml b/upgrade/roles/upgrade_discovery/tasks/main.yml new file mode 100644 index 000000000..da70d8a6f --- /dev/null +++ b/upgrade/roles/upgrade_discovery/tasks/main.yml @@ -0,0 +1,85 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Include provision_config.yml
+  ansible.builtin.include_role:
+    name: discovery_validations/common
+    tasks_from: include_provision_config.yml # noqa: role-name[path]
+
+- name: Include network spec
+  ansible.builtin.include_role:
+    name: discovery_validations/common
+    tasks_from: include_network_spec.yml # noqa: role-name[path]
+
+- name: Fetch the network interfaces in UP state on the system
+  ansible.builtin.shell: |
+    set -o pipefail
+    /usr/sbin/ip a | awk '/state UP/{print $2}'
+  register: nic_addr_up
+  changed_when: false
+
+- name: Validate admin NIC
+  ansible.builtin.include_role:
+    name: discovery_validations/common
+    tasks_from: validate_admin_nic.yml # noqa: role-name[path]
+
+- name: Validate domain name
+  ansible.builtin.include_role:
+    name: discovery_validations/common
+    tasks_from: validate_domain_name.yml # noqa: role-name[path]
+
+- name: Set pxe_nic_forwarders
+  ansible.builtin.set_fact:
+    pxe_nic_forwarders: "{{ network_data.admin_network.DNS | default('', true) }}"
+
+- name: Configure forwarders in site table
+  ansible.builtin.shell: >
+    {{ xcat_path }}/chdef -t site forwarders="{{ pxe_nic_forwarders }}"
+  changed_when: true
+
+- name: Task for creating DNS configuration
+  block:
+    - name: Create DNS configuration
+      ansible.builtin.command: "{{ xcat_sbin_path }}/makedns -n"
+      changed_when: true
+      register: dns_config
+  rescue:
+    - name: Warn that DNS configuration was not successful
+      ansible.builtin.debug:
+        msg: "{{ dns_config_warning_msg }} {{ dns_config.stderr }}"
+      when: dns_config.stderr is defined
+
+- name: Update local_repo_access.yml
+  ansible.builtin.lineinfile:
+    path: "{{ local_repo_access_dest_path }}"
+    insertafter: EOF
+    line: "{{ item.line }}"
+  with_items:
+    - { line: 'oim_hostname: "{{ oim_hostname }}"' }
+    - { line: 'domain_name: "{{ oim_domain_name }}"' }
+    - { line: 'proxy_status: false' }
+    - { line: 'no_proxy_input_status: false' }
+    - { line: 'user_no_proxy: ""' }
+
+- name: Initiate monitor thread
+  ansible.builtin.include_role:
+    name: monitor_thread
+    tasks_from: initiate_monitor_status.yml
+
+- name: Restart omnia service
+  ansible.builtin.systemd:
+    name: omnia
+    state: restarted
+    enabled: true
diff --git a/upgrade/roles/upgrade_discovery/vars/main.yml b/upgrade/roles/upgrade_discovery/vars/main.yml
new file mode 100644
index 000000000..95111db6e
--- /dev/null
+++ b/upgrade/roles/upgrade_discovery/vars/main.yml
@@ -0,0 +1,20 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
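The chdef call above only records the forwarders in the xCAT site table; makedns -n then regenerates the DNS configuration from it. A minimal read-back sketch for confirming the site table actually took the value, assuming the same xcat_path defined in the vars file below and xCAT's lsdef utility; these two tasks are illustrative and not part of the role:

- name: Read back forwarders from the xCAT site table
  ansible.builtin.command: "{{ xcat_path }}/lsdef -t site -i forwarders"
  register: site_forwarders
  changed_when: false

- name: Show the configured forwarders
  ansible.builtin.debug:
    msg: "{{ site_forwarders.stdout_lines }}"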
+---
+
+# Usage: main.yml
+local_repo_access_dest_path: "/opt/omnia/offline/local_repo_access.yml"
+xcat_sbin_path: /opt/xcat/sbin
+xcat_path: /opt/xcat/bin
+dns_config_warning_msg: "[WARNING] makedns -n command is not successful. Error:"
diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/create_idrac_inventory.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/create_idrac_inventory.yml
new file mode 100644
index 000000000..97488f095
--- /dev/null
+++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/create_idrac_inventory.yml
@@ -0,0 +1,31 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Read iDRAC IPs from the mysql db
+  ansible.builtin.command: >-
+    kubectl exec -it "{{ mysqldb_pod }}-0" -n "{{ telemetry_namespace }}"
+    -- mysql -u "{{ mysqldb_user }}" -p"{{ mysqldb_password }}"
+    -e "SELECT ip FROM {{ mysqldb_table }};"
+  register: idracip_dump
+  changed_when: false
+  no_log: true
+
+- name: Create idrac inventory
+  ansible.builtin.add_host:
+    name: "{{ item }}"
+    groups: "idrac"
+  with_items: "{{ idracip_dump.stdout_lines }}"
+  when:
+    - "'[ip]' not in item"
+    - item | trim | length > 1
diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/create_telemetry_pods.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/create_telemetry_pods.yml
new file mode 100644
index 000000000..d5faef53c
--- /dev/null
+++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/create_telemetry_pods.yml
@@ -0,0 +1,37 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License +--- + +- name: Include timescale DB role + ansible.builtin.include_role: + name: "{{ timescaledb_role_path }}" + +# Task will execute only if timescaledb metrics data not present +- name: Initiate the Telemetry DB restore + ansible.builtin.include_tasks: timescale_db_restore.yml + when: not (omnia_telemetry_schema_flag or public_schema_flag) + +- name: Include idrac_telemetry vars + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../telemetry/roles/idrac_telemetry/vars/main.yml" + +- name: Include timescaleDB vars + ansible.builtin.include_vars: + file: "{{ playbook_dir }}/../telemetry/roles/timescaledb/vars/main.yml" + +- name: Validate telemetry parameters + ansible.builtin.import_tasks: "{{ idrac_deployment_file }}" + +- name: Initiate the telemetry + ansible.builtin.include_tasks: initiate_telemetry.yml diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/delete_idrac_pods.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/delete_idrac_pods.yml new file mode 100644 index 000000000..c1c4682cc --- /dev/null +++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/delete_idrac_pods.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +--- + +- name: Deleting the timescale DB pod + ansible.builtin.command: kubectl delete statefulset "{{ tsdb_pod }}" -n "{{ telemetry_namespace }}" + changed_when: false + failed_when: false + +- name: Deleting the iDRAC telemetry pods + ansible.builtin.command: kubectl delete deployment "{{ idrac_telemetry_pod }}" -n "{{ telemetry_namespace }}" + changed_when: false + failed_when: false + +- name: Delete iDRAC reference tools telemetry folder + ansible.builtin.file: + path: "{{ idrac_reference_tools_folder }}" + state: absent diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/filter_idrac.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/filter_idrac.yml new file mode 100644 index 000000000..42c768b6a --- /dev/null +++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/filter_idrac.yml @@ -0,0 +1 @@ +../../../../telemetry/roles/idrac_telemetry/tasks/filter_idrac.yml \ No newline at end of file diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/include_provision_config.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/include_provision_config.yml new file mode 100644 index 000000000..8239c863c --- /dev/null +++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/include_provision_config.yml @@ -0,0 +1,54 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Include provision_config.yml + +- name: Check that the provision_config.yml exists + ansible.builtin.stat: + path: "{{ provision_config_file }}" + register: stat_result + +- name: Fail if provision_config.yml file doesn't exist + ansible.builtin.fail: + msg: "{{ fail_msg_provision_config_file }}" + when: not stat_result.stat.exists + +- name: Include variable file provision_config.yml + ansible.builtin.include_vars: "{{ provision_config_file }}" + no_log: true + +- name: Install dos2unix package + ansible.builtin.package: + name: dos2unix + state: present + +- name: Convert timezone.txt to linux format + ansible.builtin.command: dos2unix {{ timezone_file_path }} + failed_when: false + changed_when: false + +- name: Searching for timezone + ansible.builtin.lineinfile: + path: "{{ timezone_file_path }}" + line: "{{ timezone }}" + state: present + check_mode: true + register: timezone_search + +- name: Assert timezone + ansible.builtin.assert: + that: timezone_search is not changed + fail_msg: "{{ fail_timezone_msg }}" + register: timezone_check diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/include_telemetry_vars.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/include_telemetry_vars.yml new file mode 100644 index 000000000..361b87b6e --- /dev/null +++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/include_telemetry_vars.yml @@ -0,0 +1,70 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Include telemetry_config_file.yml
+- name: Check that the telemetry_config.yml exists
+  ansible.builtin.stat:
+    path: "{{ telemetry_config_file }}"
+  register: stat_result
+
+- name: Fail if telemetry_config.yml file doesn't exist
+  ansible.builtin.fail:
+    msg: "{{ fail_msg_telemetry_config_file }}"
+  when: not stat_result.stat.exists
+
+- name: Check whether telemetry_config.yml is encrypted
+  ansible.builtin.command: cat {{ telemetry_config_file }}
+  changed_when: false
+  register: config_content
+  no_log: true
+
+- name: Decrypt telemetry_config.yml
+  ansible.builtin.command: >-
+    ansible-vault decrypt {{ telemetry_config_file }}
+    --vault-password-file {{ vault_filename }}
+  changed_when: false
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+
+- name: Include variable file telemetry_config.yml
+  block:
+    - name: Include variable file telemetry_config.yml
+      ansible.builtin.include_vars: "{{ telemetry_config_file }}"
+      register: include_telemetry_config
+      no_log: true
+  rescue:
+    - name: Failed to include telemetry_config.yml
+      ansible.builtin.fail:
+        msg: "{{ telemetry_config_syntax_fail_msg }} Possible Syntax Error Hints: {{ include_telemetry_config.message }}"
+
+# Encrypt telemetry_config.yml
+- name: Create ansible vault key
+  ansible.builtin.set_fact:
+    vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}"
+  when: "'$ANSIBLE_VAULT;' not in config_content.stdout"
+
+- name: Save vault key
+  ansible.builtin.lineinfile:
+    path: "{{ vault_filename }}"
+    line: "{{ vault_key }}"
+    mode: "{{ vault_file_perm }}"
+    owner: root
+    create: true
+  when: "'$ANSIBLE_VAULT;' not in config_content.stdout"
+
+- name: Encrypt telemetry_config.yml file
+  ansible.builtin.command: >-
+    ansible-vault encrypt {{ telemetry_config_file }}
+    --vault-password-file {{ vault_filename }}
+  changed_when: false
diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/initiate_telemetry.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/initiate_telemetry.yml
new file mode 100644
index 000000000..3504cbde1
--- /dev/null
+++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/initiate_telemetry.yml
@@ -0,0 +1,109 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
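The include_telemetry_vars.yml hunk above decrypts telemetry_config.yml only when it carries an ansible-vault header, includes it, and then re-encrypts it; because lookup('password', '/dev/null chars=ascii_letters') returns a fresh random string on every evaluation, the generated key is persisted to vault_filename before encryption. A minimal sketch for inspecting the re-encrypted file afterwards, assuming the same two paths; this task is illustrative and not part of the role:

- name: View the re-encrypted telemetry_config.yml
  ansible.builtin.command: >-
    ansible-vault view {{ telemetry_config_file }}
    --vault-password-file {{ vault_filename }}
  register: telemetry_config_view
  changed_when: false
  no_log: true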
+---
+
+# Include and initialize variables
+- name: Initiate telemetry process if idrac_support is enabled
+  when: idrac_telemetry_support is true and 'idrac' in groups
+  block:
+    - name: Initialize variables
+      ansible.builtin.set_fact:
+        idrac_telemetry_scripting_repo: "https://github.com/dell/iDRAC-Telemetry-Scripting.git"
+        idrac_telemetry_scripting_stable_commit: "1f4bb26"
+        idrac_telemetry_scripting_folder: iDRAC-Telemetry-Scripting
+        login_vars_file: "{{ playbook_dir }}/input_params/telemetry_login_vars.yml"
+        vault_filename: "{{ playbook_dir }}/input_params/.login_vault_key"
+        idrac_inventory_filename: "/opt/omnia/idrac_inventory"
+        idrac_file_existence: "iDRAC inventory file does not exist. Check /opt/omnia/idrac_inventory."
+        monitor_config_file: "{{ playbook_dir }}/../input/monitor_config.yml"
+        monitor_config_vault_filename: "{{ playbook_dir }}/../input/.monitor_vault_key"
+        min_firmware_version_reqd: 3
+        datacenter_license: false
+        firmware_version: false
+        file_perm: '0644'
+        telemetry_idrac: []
+        service_type: 3
+        auth_type: 1
+        idrac_awx_count: 0
+        filtered_idrac_count: 0
+        failed_idrac: []
+        awx_idrac: []
+
+# Filter iDRACs matching telemetry pre-requisites
+
+    - name: Add iDRAC nodes and initiate telemetry
+      ansible.builtin.include_tasks: filter_idrac.yml
+      with_items: "{{ groups['idrac'] }}"
+      loop_control:
+        index_var: idrac_index
+      # no_log: true
+
+# Add iDRAC Credentials in DB and enable telemetry fetching
+
+    - name: Enable telemetry collection on iDRAC
+      when: telemetry_idrac is defined and (telemetry_idrac | length > 0)
+      block:
+        - name: Git clone telemetry initialization repo
+          ansible.builtin.git:
+            repo: "{{ idrac_telemetry_scripting_repo }}"
+            dest: "{{ mount_location + idrac_telemetry_scripting_folder }}"
+            version: "{{ idrac_telemetry_scripting_stable_commit }}"
+          register: telemetry_collection
+
+        - name: Enable telemetry collection on iDRACs
+          ansible.builtin.command: >-
+            "{{ python_version }}" ./ConfigurationScripts/EnableOrDisableAllTelemetryReports.py -ip "{{ item }}"
+            -u "{{ idrac_username }}" -p "{{ idrac_password }}" -s Enabled
+          args:
+            chdir: "{{ mount_location + idrac_telemetry_scripting_folder }}"
+          with_items: "{{ telemetry_idrac }}"
+          changed_when: false
+          no_log: true
+
+      rescue:
+        - name: Show failure msg
+          ansible.builtin.debug:
+            msg: "Enabling telemetry on an iDRAC failed"
+
+
+# Initiate iDRAC collection
+    - name: Initiate telemetry collection
+      when: telemetry_idrac is defined and (telemetry_idrac | length > 0)
+      block:
+        - name: Wait for idrac-telemetry pod to come to ready state
+          ansible.builtin.command: kubectl wait --for=condition=ready --timeout=10m -n "{{ namespace }}" pod -l app="{{ idrac_telemetry_k8s_name }}"
+          changed_when: false
+
+        - name: Get idrac-telemetry pod name
+          ansible.builtin.command: kubectl get pods -n "{{ namespace }}" -l app="{{ idrac_telemetry_k8s_name }}" -o jsonpath="{.items[0].metadata.name}"
+          changed_when: false
+          register: idrac_telemetry_pod
+
+        - name: Wait for 15 sec for mysqldb to be ready with updated values
+          ansible.builtin.pause:
+            seconds: 15
+
+        - name: Initiate telemetry-collector
+          ansible.builtin.shell: >-
+            kubectl exec --stdin --tty "{{ idrac_telemetry_pod.stdout }}" -n "{{ namespace }}"
+            -c telemetry-receiver -- nohup go run cmd/redfishread/redfishread.go &
+          changed_when: false
+
+    - name: Telemetry report
+      ansible.builtin.debug:
+        msg:
+          - "Count of iDRAC IPs found: {{ idrac_awx_count }}"
+          - "List of iDRAC IPs found: {{ awx_idrac }}"
+          - "Count of iDRAC IPs where telemetry is initiated: {{ filtered_idrac_count }}"
+          - "List of iDRAC IPs where telemetry is initiated: {{ telemetry_idrac }}"
diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/main.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/main.yml
new file mode 100644
index 000000000..1034ca9ea
--- /dev/null
+++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/main.yml
@@ -0,0 +1,79 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+- name: When iDRAC pods are running, upgrade iDRAC and timescale DB pods
+  when:
+    - upgrade_precheck_pod_status_report.idrac_telemetry_pod_running_status is true
+    - upgrade_precheck_pod_status_report.timescaledb_pod_running_status is true
+    - upgrade_precheck_pod_status_report.mysqldb_pod_running_status is true
+  block:
+    - name: Pods of iDRAC are running, set telemetry entry present to true
+      ansible.builtin.set_fact:
+        telemetry_entry_present: true
+
+    - name: Include telemetry_config vars
+      ansible.builtin.include_tasks: include_telemetry_vars.yml
+
+    - name: Install required pip packages
+      ansible.builtin.pip:
+        name: "{{ item }}"
+      with_items: "{{ python_pip_packages }}"
+
+    - name: Prepare the iDRAC inventory from mysql db
+      ansible.builtin.include_tasks: create_idrac_inventory.yml
+
+    # Mounting timescaledb with correct mount path
+    - name: Backup timescaledb data files to host_path
+      ansible.builtin.include_tasks: reconfig_timescaledb_data.yml
+
+    - name: Checking the timescaledb metrics status
+      ansible.builtin.include_role:
+        name: upgrade_precheck
+        tasks_from: check_timescaldb_existence.yml
+
+    - name: Delete the TSDB and iDRAC telemetry pods
+      ansible.builtin.include_tasks: delete_idrac_pods.yml
+
+    - name: Include provision_config to refer timezone
+      ansible.builtin.include_tasks: include_provision_config.yml
+
+    - name: Spawn the iDRAC and timescale DB pods
+      ansible.builtin.include_tasks: create_telemetry_pods.yml
+
+- name: When only omnia telemetry is running, upgrade timescale DB
+  when:
+    - upgrade_precheck_pod_status_report.idrac_telemetry_pod_running_status is false
+    - upgrade_precheck_pod_status_report.timescaledb_pod_running_status is true
+    - upgrade_precheck_pod_status_report.mysqldb_pod_running_status is false
+  block:
+    - name: Pods of omnia telemetry are running, set telemetry entry present to true
+      ansible.builtin.set_fact:
+        telemetry_entry_present: true
+
+    - name: Include telemetry_config vars
+      ansible.builtin.include_tasks: include_telemetry_vars.yml
+
+    # Mounting timescaledb with correct mount path
+    - name: Backup timescaledb data files to host_path
+      ansible.builtin.include_tasks: reconfig_timescaledb_data.yml
+
+    - name: Checking the timescaledb metrics status
+      ansible.builtin.include_role:
+        name: upgrade_precheck
+        tasks_from: check_timescaldb_existence.yml
+
+    - name: Spin up the timescale DB pod
+      ansible.builtin.include_tasks: upgrade_tsdb.yml
diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/reconfig_timescaledb_data.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/reconfig_timescaledb_data.yml
new file mode 100644
index 000000000..f8709ad48
--- /dev/null
+++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/reconfig_timescaledb_data.yml
@@ -0,0 +1,86 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get timescaledb pod name
+  ansible.builtin.command: kubectl get pod -n "{{ telemetry_namespace }}" -l app="{{ tsdb_pod }}" -o jsonpath="{.items[0].metadata.name}"
+  register: timescaledb_pod_name
+  changed_when: false
+  failed_when: false
+
+- name: Check if Pod Exists
+  ansible.builtin.command: kubectl get pod "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_namespace }}" --no-headers
+  register: timescaledb_pod_check
+  changed_when: false
+  failed_when: false
+  no_log: true
+
+- name: Changes for telemetry timescaledb mount path
+  when: timescaledb_pod_check.rc == 0
+  block:
+    - name: Check if file exists
+      ansible.builtin.stat:
+        path: "{{ backup_tsdb_tar }}"
+      register: timescaledb_tar
+
+    - name: Edit timescaledb deployment with correct mountPath
+      ansible.builtin.shell: |
+        kubectl patch statefulset "{{ tsdb_pod }}" -n "{{ telemetry_namespace }}" --type json \
+          -p '[{"op": "replace", "path": "{{ timescaledb_container_volume_mount }}", "value": "{{ timescaledb_host_mount }}"}]'
+      args:
+        executable: /bin/bash
+      when: timescaledb_tar.stat.exists
+      changed_when: true
+
+    - name: Copy timescaledb_data tar file to timescaledb host_path
+      ansible.builtin.copy:
+        src: "{{ backup_tsdb_tar }}"
+        dest: "{{ timescaledb_mnt }}"
+        mode: "{{ file_mode }}"
+      when: timescaledb_tar.stat.exists
+
+    - name: Untar timescaledb_data tar file
+      ansible.builtin.unarchive:
+        src: "{{ timescaledb_mnt_tar }}"
+        dest: "{{ timescaledb_mnt }}"
+      when: timescaledb_tar.stat.exists
+
+    - name: Remove data directory from /opt/omnia/telemetry/timescaledb
+      ansible.builtin.file:
+        path: "{{ timescaledb_mnt }}/data"
+        state: absent
+      when: timescaledb_tar.stat.exists
+
+    - name: Wait for all pods to be Running before upgrade
+      ansible.builtin.shell: |
+        set -o pipefail
+        kubectl get pods --all-namespaces | grep -v "Running" | awk 'NR>1 {print $2}' | wc -l
+      register: pod_count
+      until: pod_count.stdout == "0"
+      retries: "{{ retry_count }}"
+      delay: "{{ delay_pod }}"
+      changed_when: false
+      when: timescaledb_tar.stat.exists
+
+    - name: Remove timescaledb_data tar file from timescaledb host_path
+      ansible.builtin.file:
+        path: "{{ timescaledb_mnt_tar }}"
+        state: absent
+      when: timescaledb_tar.stat.exists
+
+    - name: Remove timescaledb_data tar file from idrac host_path
+      ansible.builtin.file:
+        path: "{{ idrac_mnt_tar }}"
+        state: absent
+      when: timescaledb_tar.stat.exists
diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/timescale_db_restore.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/timescale_db_restore.yml
new file mode 100644
index 000000000..7604ed446
--- /dev/null
+++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/timescale_db_restore.yml
@@ -0,0 +1,112 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if telemetry_metrics database exists before restore
+  ansible.builtin.command: >
+    kubectl exec "{{ timescaledb_pod_name.stdout }}"
+    -n "{{ telemetry_namespace }}"
+    -- psql -U {{ timescaledb_user }} -tc "SELECT 1 FROM pg_database WHERE datname='{{ timescaledb_name }}';"
+  register: check_database
+  changed_when: false
+
+- name: Create telemetry_metrics database for restore
+  ansible.builtin.command: >
+    kubectl exec "{{ timescaledb_pod_name.stdout }}"
+    -n "{{ telemetry_namespace }}"
+    -- psql -U {{ timescaledb_user }} -c "CREATE DATABASE {{ timescaledb_name }};"
+  when: check_database.stdout.find('1') == -1 # Create DB only if it doesn't exist
+  register: create_database
+  changed_when: create_database.rc == 0
+
+- name: Get external IP of timescaledb service
+  ansible.builtin.command: kubectl get svc "{{ tsdb_pod }}" -n "{{ telemetry_namespace }}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+  register: timescaledb_service_external_ip
+  failed_when: false
+  changed_when: false
+
+- name: Check if the telemetry_metrics backup file exists for restore
+  ansible.builtin.stat:
+    path: "{{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }}"
+  register: telemetry_backup_file
+
+- name: Print debug and create omnia telemetry schema if the telemetry_metrics backup file is not present for restore
+  when: not telemetry_backup_file.stat.exists
+  block:
+    - name: Message user that tsdb backup file does not exist
+      ansible.builtin.debug:
+        msg: "Backup file {{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }} does not exist!"
+
+    - name: Dump file does not exist; invoke Python utility to create the omnia_telemetry schema and tables
+      ansible.builtin.command: |
+        {{ python_version }} {{ db_schema_utility }} {{ timescaledb_user }} {{ timescaledb_password }}
+        {{ timescaledb_service_external_ip.stdout }} {{ timescaledb_container_port }} {{ timescaledb_name }}
+      changed_when: false
+      no_log: true
+
+- name: Restore when tsdb backup file exists
+  when: telemetry_backup_file.stat.exists
+  block:
+    - name: Copy telemetry_tsdb_dump.sql dump file to the timescaledb pod
+      ansible.builtin.shell: >
+        set -o pipefail && \
+        kubectl cp "{{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }}"
+        "{{ telemetry_namespace }}/{{ timescaledb_pod_name.stdout }}:/tmp/telemetry_tsdb_dump.sql"
+      register: copy_backup
+      changed_when: copy_backup.rc == 0
+
+    - name: Restore telemetry_metrics database from backup file
+      ansible.builtin.shell: >
+        set -o pipefail && \
+        kubectl exec "{{ timescaledb_pod_name.stdout }}"
+        -n "{{ telemetry_namespace }}" -- psql -U {{ timescaledb_user }} {{ timescaledb_name }}
+        -f /tmp/telemetry_tsdb_dump.sql
+      register: restore_database
+      changed_when: restore_database.rc == 0
+
+    - name: Verify metrics in telemetry_metrics database after restore (omnia_telemetry schema)
+      when:
+        - upgrade_precheck_pod_status_report.idrac_telemetry_pod_running_status is false
+        - upgrade_precheck_pod_status_report.timescaledb_pod_running_status is true
+        - upgrade_precheck_pod_status_report.mysqldb_pod_running_status is false
+      block:
+        - name: Count records in omnia_telemetry.metrics table
+          ansible.builtin.command: >
+            kubectl exec "{{ timescaledb_pod_name.stdout }}"
+            -n "{{ telemetry_namespace }}"
+            -- psql -U {{ timescaledb_user }} -d telemetry_metrics -c "SELECT COUNT(*) FROM omnia_telemetry.metrics;"
+          register: count_metrics
+          changed_when: false
+
+        - name: Display count of metrics
+          ansible.builtin.debug:
+            msg: "Count of metrics: {{ count_metrics.stdout }}"
+
+    - name: Verify metrics in telemetry_metrics database after restore (public schema)
+      when:
+        - upgrade_precheck_pod_status_report.idrac_telemetry_pod_running_status is true
+        - upgrade_precheck_pod_status_report.timescaledb_pod_running_status is true
+        - upgrade_precheck_pod_status_report.mysqldb_pod_running_status is true
+      block:
+        - name: Count records in public.timeseries_metrics table
+          ansible.builtin.command: >
+            kubectl exec "{{ timescaledb_pod_name.stdout }}"
+            -n "{{ telemetry_namespace }}"
+            -- psql -U {{ timescaledb_user }} -d telemetry_metrics -c "SELECT COUNT(*) FROM public.timeseries_metrics;"
+          register: count_metrics
+          changed_when: false
+
+        - name: Display count of metrics
+          ansible.builtin.debug:
+            msg: "Count of metrics: {{ count_metrics.stdout }}"
diff --git a/upgrade/roles/upgrade_idrac_telemetry/tasks/upgrade_tsdb.yml b/upgrade/roles/upgrade_idrac_telemetry/tasks/upgrade_tsdb.yml
new file mode 100644
index 000000000..8b84e5485
--- /dev/null
+++ b/upgrade/roles/upgrade_idrac_telemetry/tasks/upgrade_tsdb.yml
@@ -0,0 +1,37 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License + +--- + +- name: Deleting the timescale DB pod + ansible.builtin.command: kubectl delete statefulset "{{ tsdb_pod }}" -n "{{ telemetry_namespace }}" + changed_when: false + failed_when: false + +- name: Delete iDRAC reference tools telemetry folder + ansible.builtin.file: + path: "{{ idrac_reference_tools_folder }}" + state: absent + +- name: Include provision_config to refer timezone + ansible.builtin.include_tasks: include_provision_config.yml + +- name: Include timescale DB role + ansible.builtin.include_role: + name: "{{ timescaledb_role_path }}" + +# Task will execute only if timescaledb metrics data not present +- name: Initiate the Telemetry DB restore + ansible.builtin.include_tasks: timescale_db_restore.yml + when: not (omnia_telemetry_schema_flag or public_schema_flag) diff --git a/upgrade/roles/upgrade_idrac_telemetry/vars/main.yml b/upgrade/roles/upgrade_idrac_telemetry/vars/main.yml new file mode 100644 index 000000000..98e74ba64 --- /dev/null +++ b/upgrade/roles/upgrade_idrac_telemetry/vars/main.yml @@ -0,0 +1,64 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + + +# Usage: include_provision_config.yml +provision_config_file: "{{ role_path }}/../../../input/provision_config.yml" +fail_msg_provision_config_file: "provision_config.yml file doesn't exist." +fail_timezone_msg: "Failed. Incorrect timezone provided. Please check the file timezone.txt in discovery/roles/discovery_validations/common/files/ folder." 
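timescale_db_restore.yml can only restore if {{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }} already exists. A minimal sketch of producing that dump from a running timescaledb pod, assuming timescaledb_pod_name is registered as in reconfig_timescaledb_data.yml and that pg_dump is available inside the timescaledb image; an illustrative task, not part of the role:

- name: Dump telemetry_metrics to the expected backup location
  ansible.builtin.shell: >-
    set -o pipefail &&
    kubectl exec "{{ timescaledb_pod_name.stdout }}"
    -n "{{ telemetry_namespace }}"
    -- pg_dump -U {{ timescaledb_user }} -d {{ timescaledb_name }}
    > "{{ k8s_backup_location }}/{{ timescale_telemetry_backup_file }}"
  args:
    executable: /bin/bash
  changed_when: true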
+ +telemetry_config_file: "{{ role_path }}/../../../input/telemetry_config.yml" +telemetry_namespace: "telemetry-and-visualizations" +vault_filename: "{{ role_path }}/../../../input/.telemetry_vault_key" +vault_file_perm: '0644' +idrac_telemetry_pod: "idrac-telemetry" +tsdb_pod: "timescaledb" +timescaledb_name: "telemetry_metrics" +mysqldb_pod: "mysqldb" +mysqldb_table: "idrac_telemetrysource_services_db.services" +python_pip_packages: + - omsdk==1.2.513 + - pysnmp==6.1.3 + +# Usage: k8s_secrets.yml +# namespace: telemetry-and-visualizations +secrets_name: credentials +mysqldb_secrets_name: mysqldb-credentials + +timescaledb_role_path: "{{ playbook_dir }}/../telemetry/roles/timescaledb" +timezone_file_path: "{{ playbook_dir }}/../telemetry/roles/telemetry_validation/files/timezone.txt" +idrac_deployment_file: "{{ role_path }}/../../../telemetry/roles/idrac_telemetry/tasks/idrac_telemetry_deployment.yml" +k8s_backup_location: "{{ backup_location }}/k8s" +timescale_telemetry_backup_file: "telemetry_tsdb_dump.sql" +idrac_reference_tools_folder: "{{ mount_location }}/iDRAC-Telemetry-Reference-Tools" + +python_version: "{{ ansible_python_interpreter }}" +db_schema_utility: "{{ role_path }}/../../../telemetry/roles/omnia_telemetry_prepare_oim/files/omnia_telemetry_schema_creation.py" +timescaledb_container_port: 5432 + +# Usage: reconfig_timescaledb_data.yml +idrac_telemetry_path: "/go/src/github.com/telemetry-reference-tools/omnia_timescaledb.tar.gz" +postgresql_pod_data: "/var/lib/postgresql/data" +timescaledb_mnt: "{{ mount_location }}/timescaledb" +timescaledb_mnt_tar: "{{ mount_location }}/timescaledb/omnia_timescaledb.tar.gz" +idrac_mnt_tar: "{{ mount_location }}/iDRAC-Telemetry-Reference-Tools/omnia_timescaledb.tar.gz" +timescaledb_container_volume_mount: "/spec/template/spec/containers/0/volumeMounts/1/mountPath" +timescaledb_host_mount: "/var/lib/postgresql/data" +backup_tsdb_tar: "{{ k8s_backup_location }}/omnia_timescaledb.tar.gz" +file_mode: "0644" +database_name: "telemetry_metrics" +timescaledb_k8s_name: timescaledb +retry_count: 5 +delay_pod: 30 diff --git a/upgrade/roles/upgrade_inventory/tasks/main.yml b/upgrade/roles/upgrade_inventory/tasks/main.yml deleted file mode 100644 index 8e309d5b5..000000000 --- a/upgrade/roles/upgrade_inventory/tasks/main.yml +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Check whether user supplied old inventory - ansible.builtin.set_fact: - managergrp_exists: "{{ 'manager' in groups }}" - computegrp_exists: "{{ 'compute' in groups }}" - -- name: Extract manager, compute ips from inventory - when: managergrp_exists and computegrp_exists - block: - - name: Extract host ips under 'manager' group - ansible.builtin.set_fact: - manager_ips: "{{ groups['manager'] | default([]) }}" - - - name: Extract host ips under 'compute' group - ansible.builtin.set_fact: - compute_ips: "{{ groups['compute'] | default([]) }}" - - - name: Extract host ips under 'login' group - ansible.builtin.set_fact: - login_ips: "{{ groups['login'] | default([]) }}" - - - name: Extract host ips under 'nfs' group - ansible.builtin.set_fact: - nfs_node_ips: "{{ groups['nfs'] | default([]) }}" - - - name: Check whether idrac group exists in old inventory - ansible.builtin.set_fact: - idracgrp_exists: "{{ 'idrac' in groups }}" - - - name: Extract idrac ips under 'idrac' group - ansible.builtin.set_fact: - idrac_ips: "{{ groups['idrac'] | default([]) }}" - when: idracgrp_exists - -- name: Fetch file contents from software_config.json - ansible.builtin.slurp: - src: "{{ software_config_file_path }}" - register: config_content - -- name: Decode file contents - ansible.builtin.set_fact: - config_data: "{{ config_content['content'] | b64decode | from_json }}" - -- name: Check if slurm exists in software_config.json - ansible.builtin.set_fact: - slurm_exists: "{{ 'slurm' in config_data['softwares'] | map(attribute='name') | default(false) }}" - -- name: Check if k8s exists in software_config.json - ansible.builtin.set_fact: - k8s_exists: "{{ 'k8s' in config_data['softwares'] | map(attribute='name') | default(false) }}" - -- name: Render new inventory template - ansible.builtin.template: - src: "{{ inventory_template_src }}" - dest: "{{ new_inventory_path }}" - mode: "{{ file_permission }}" - -- name: Create NFS inventory (only when NFS group exists) - ansible.builtin.template: - src: "{{ nfs_inventory_template_src }}" - dest: "{{ new_inventory_path }}" - mode: "{{ file_permission }}" - when: nfs_node_ips | length > 0 diff --git a/upgrade/roles/upgrade_inventory/templates/nfs_inventory_template.j2 b/upgrade/roles/upgrade_inventory/templates/nfs_inventory_template.j2 deleted file mode 100644 index 445eab341..000000000 --- a/upgrade/roles/upgrade_inventory/templates/nfs_inventory_template.j2 +++ /dev/null @@ -1,7 +0,0 @@ -# General Cluster Storage -# NFS node - -[nfs] -{% for ip in nfs_node_ips %} -{{ ip }} -{% endfor %} diff --git a/upgrade/roles/upgrade_inventory/templates/upgrade_host_inventory_template.j2 b/upgrade/roles/upgrade_inventory/templates/upgrade_host_inventory_template.j2 deleted file mode 100644 index c3b121d2f..000000000 --- a/upgrade/roles/upgrade_inventory/templates/upgrade_host_inventory_template.j2 +++ /dev/null @@ -1,58 +0,0 @@ -# Batch Scheduler: Slurm - -[slurm_control_node] -{% if slurm_exists %} -{% for ip in manager_ips %} -{{ ip }} -{% endfor %} -{% endif %} - -[slurm_node] -{% if slurm_exists %} -{% for ip in compute_ips %} -{{ ip }} -{% endfor %} -{% endif %} - -[login] -{% if slurm_exists %} -{% for ip in login_ips %} -{{ ip }} -{% endfor %} -{% endif %} - - -# AI Scheduler: Kubernetes - -[kube_control_plane] -{% if k8s_exists %} -{% for ip in manager_ips %} -{{ ip }} -{% endfor %} -{% endif %} - -[etcd] -{% if k8s_exists %} -{% for ip in manager_ips %} -{{ ip }} -{% endfor %} -{% endif %} - -[kube_node] -{% if k8s_exists %} -{% for ip in compute_ips %} -{{ 
ip }} -{% endfor %} -{% endif %} - -[auth_server] -{% for ip in manager_ips %} -{{ ip }} -{% endfor %} - -{% if idracgrp_exists %} -[idrac] -{% for ip in idrac_ips %} -{{ ip }} -{% endfor %} -{% endif %} diff --git a/upgrade/roles/upgrade_inventory/vars/main.yml b/upgrade/roles/upgrade_inventory/vars/main.yml deleted file mode 100644 index 67c532488..000000000 --- a/upgrade/roles/upgrade_inventory/vars/main.yml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -inventory_template_src: "../templates/upgrade_host_inventory_template.j2" -nfs_inventory_template_src: "../templates/nfs_inventory_template.j2" -new_inventory_path: "{{ role_path }}/../../inventory" -software_config_file_path: "{{ role_path }}/../../../input/software_config.json" -file_permission: "0755" diff --git a/upgrade/roles/upgrade_k8s_oim/files/rhel/telemetry_v1.29.5.json b/upgrade/roles/upgrade_k8s_oim/files/rhel/telemetry_v1.29.5.json new file mode 100644 index 000000000..3bc51a327 --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/files/rhel/telemetry_v1.29.5.json @@ -0,0 +1,292 @@ +{ + "telemetry": { + "cluster": [ + { + "package": "buildkit", + "type": "git", + "url": "https://github.com/moby/buildkit.git", + "version": "v0.13.1" + }, + { "package": "smartmontools", + "type": "rpm", + "repo_name": "baseos" + }, + { + "package": "containerd.io-1.6.16-3.1.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-cli-1:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-3:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-rootless-extras", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "kubespray-v2.25.0", + "type": "git", + "url": "https://github.com/kubernetes-sigs/kubespray.git", + "version": "v2.25.0" + }, + { + "package": "kubectl-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" + }, + { + "package": "kubelet-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" + }, + { + "package": "kubeadm-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.29.5", + 
"type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" + }, + { + "package": "calicoctl-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" + }, + { + "package": "calicocrds-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" + }, + { + "package": "cri-tools-v1.27.1", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.27.1/crictl-v1.27.1-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.28.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.28.0/crictl-v1.28.0-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.29.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" + }, + { + "package": "etcd-v3.5.12", + "type": "tarball", + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" + }, + { + "package": "cni-plugins-v1.3.0", + "type": "tarball", + "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" + }, + { + "package": "nerdctl-v1.7.4", + "type": "tarball", + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + }, + { + "package": "containerd-1.7.16", + "type": "tarball", + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" + }, + { + "package": "helm-v3.14.2", + "type": "tarball", + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, + { + "package": "docker.io/library/nginx", + "tag": "1.25.2-alpine", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.10.1", + "type": "image" + }, + { + "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", + "tag": "v1.8.8", + "type": "image" + }, + { + "package": "registry.k8s.io/dns/k8s-dns-node-cache", + "tag": "1.22.28", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.28.0", + "type": "image" + 
}, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/pause", + "tag": "3.9", + "type": "image" + }, + { + "package": "quay.io/coreos/etcd", + "tag": "v3.5.12", + "type": "image" + }, + { + "package": "quay.io/calico/cni", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/kube-controllers", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/pod2daemon-flexvol", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/node", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel-cni-plugin", + "tag": "v1.1.2", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel", + "tag": "v0.22.0", + "type": "image" + }, + { + "package": "quay.io/metallb/speaker", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "quay.io/metallb/controller", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/dashboard", + "tag": "v2.7.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/metrics-scraper", + "tag": "v1.0.8", + "type": "image" + }, + { + "package": "docker.io/grafana/grafana-enterprise", + "tag": "8.3.2", + "type": "image" + } + ] + } + } diff --git a/upgrade/roles/upgrade_k8s_oim/files/rocky/telemetry_v1.29.5.json b/upgrade/roles/upgrade_k8s_oim/files/rocky/telemetry_v1.29.5.json new file mode 100644 index 000000000..f5837562a --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/files/rocky/telemetry_v1.29.5.json @@ -0,0 +1,293 @@ +{ + "telemetry": { + "cluster": [ + { + "package": "buildkit", + "type": "git", + "url": "https://github.com/moby/buildkit.git", + "version": "v0.13.1" + }, + { "package": "smartmontools", + "type": "rpm", + "repo_name": "baseos" + }, + { + "package": "containerd.io-1.6.16-3.1.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-cli-1:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-3:20.10.20-3.el8", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-rootless-extras", + "type": "rpm", + "repo_name": "docker-ce-repo" + }, + { + "package": "kubespray-v2.25.0", + "type": "git", + "url": "https://github.com/kubernetes-sigs/kubespray.git", + "version": "v2.25.0" + }, + { + "package": "kubectl-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" + }, + { + "package": "kubelet-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" + }, + { + "package": "kubeadm-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" + }, + 
{ + "package": "calicoctl-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" + }, + { + "package": "calicocrds-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" + }, + { + "package": "cri-tools-v1.27.1", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.27.1/crictl-v1.27.1-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.28.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.28.0/crictl-v1.28.0-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.29.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" + }, + { + "package": "etcd-v3.5.12", + "type": "tarball", + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" + }, + { + "package": "cni-plugins-v1.3.0", + "type": "tarball", + "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" + }, + { + "package": "nerdctl-v1.7.4", + "type": "tarball", + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + }, + { + "package": "containerd-1.7.16", + "type": "tarball", + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" + }, + { + "package": "helm-v3.14.2", + "type": "tarball", + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, + { + "package": "docker.io/library/nginx", + "tag": "1.25.2-alpine", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.10.1", + "type": "image" + }, + { + "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", + "tag": "v1.8.8", + "type": "image" + }, + { + "package": "registry.k8s.io/dns/k8s-dns-node-cache", + "tag": "1.22.28", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.29.5", + "type": "image" + 
}, + { + "package": "registry.k8s.io/pause", + "tag": "3.9", + "type": "image" + }, + { + "package": "quay.io/coreos/etcd", + "tag": "v3.5.12", + "type": "image" + }, + { + "package": "quay.io/calico/cni", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/kube-controllers", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/pod2daemon-flexvol", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/node", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel-cni-plugin", + "tag": "v1.1.2", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel", + "tag": "v0.22.0", + "type": "image" + }, + { + "package": "quay.io/metallb/speaker", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "quay.io/metallb/controller", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/dashboard", + "tag": "v2.7.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/metrics-scraper", + "tag": "v1.0.8", + "type": "image" + }, + { + "package": "docker.io/grafana/grafana-enterprise", + "tag": "8.3.2", + "type": "image" + } + ] + } + } + diff --git a/upgrade/roles/upgrade_k8s_oim/files/ubuntu/20.04/telemetry_v1.29.5.json b/upgrade/roles/upgrade_k8s_oim/files/ubuntu/20.04/telemetry_v1.29.5.json new file mode 100644 index 000000000..7ad889edc --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/files/ubuntu/20.04/telemetry_v1.29.5.json @@ -0,0 +1,293 @@ +{ + "telemetry": { + "cluster": [ + { + "package": "buildkit", + "type": "git", + "url": "https://github.com/moby/buildkit.git", + "version": "v0.13.1" + }, + { "package": "smartmontools", + "type": "deb", + "repo_name": "focal" + }, + { + "package": "containerd.io=1.6.20-1", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-cli=5:20.10.20~3-0~ubuntu-focal", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce=5:20.10.20~3-0~ubuntu-focal", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-rootless-extras=5:20.10.20~3-0~ubuntu-focal", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "kubespray-v2.25.0", + "type": "git", + "url": "https://github.com/kubernetes-sigs/kubespray.git", + "version": "v2.25.0" + }, + { + "package": "kubectl-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" + }, + { + "package": "kubelet-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" + }, + { + "package": "kubeadm-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" + }, + { + "package": 
"calicoctl-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" + }, + { + "package": "calicocrds-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" + }, + { + "package": "cri-tools-v1.27.1", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.27.1/crictl-v1.27.1-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.28.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.28.0/crictl-v1.28.0-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.29.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" + }, + { + "package": "etcd-v3.5.12", + "type": "tarball", + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" + }, + { + "package": "cni-plugins-v1.3.0", + "type": "tarball", + "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" + }, + { + "package": "nerdctl-v1.7.4", + "type": "tarball", + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + }, + { + "package": "containerd-1.7.16", + "type": "tarball", + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" + }, + { + "package": "helm-v3.14.2", + "type": "tarball", + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, + { + "package": "docker.io/library/nginx", + "tag": "1.25.2-alpine", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.10.1", + "type": "image" + }, + { + "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", + "tag": "v1.8.8", + "type": "image" + }, + { + "package": "registry.k8s.io/dns/k8s-dns-node-cache", + "tag": "1.22.28", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.29.5", + "type": "image" + }, + { + 
"package": "registry.k8s.io/pause", + "tag": "3.9", + "type": "image" + }, + { + "package": "quay.io/coreos/etcd", + "tag": "v3.5.12", + "type": "image" + }, + { + "package": "quay.io/calico/cni", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/kube-controllers", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/pod2daemon-flexvol", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/node", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel-cni-plugin", + "tag": "v1.1.2", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel", + "tag": "v0.22.0", + "type": "image" + }, + { + "package": "quay.io/metallb/speaker", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "quay.io/metallb/controller", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/dashboard", + "tag": "v2.7.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/metrics-scraper", + "tag": "v1.0.8", + "type": "image" + }, + { + "package": "docker.io/grafana/grafana-enterprise", + "tag": "8.3.2", + "type": "image" + } + ] + } + } + diff --git a/upgrade/roles/upgrade_k8s_oim/files/ubuntu/22.04/telemetry_v1.29.5.json b/upgrade/roles/upgrade_k8s_oim/files/ubuntu/22.04/telemetry_v1.29.5.json new file mode 100644 index 000000000..c81809749 --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/files/ubuntu/22.04/telemetry_v1.29.5.json @@ -0,0 +1,293 @@ +{ + "telemetry": { + "cluster": [ + { + "package": "buildkit", + "type": "git", + "url": "https://github.com/moby/buildkit.git", + "version": "v0.13.1" + }, + { "package": "smartmontools", + "type": "deb", + "repo_name": "jammy" + }, + { + "package": "containerd.io=1.6.20-1", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-cli=5:20.10.20~3-0~ubuntu-jammy", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce=5:20.10.20~3-0~ubuntu-jammy", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "docker-ce-rootless-extras=5:20.10.20~3-0~ubuntu-jammy", + "type": "deb", + "repo_name": "docker-ce-repo" + }, + { + "package": "kubespray-v2.25.0", + "type": "git", + "url": "https://github.com/kubernetes-sigs/kubespray.git", + "version": "v2.25.0" + }, + { + "package": "kubectl-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubectl" + }, + { + "package": "kubectl-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubectl" + }, + { + "package": "kubelet-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubelet" + }, + { + "package": "kubelet-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubelet" + }, + { + "package": "kubeadm-v1.27.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.27.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.28.0", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.28.0/bin/linux/amd64/kubeadm" + }, + { + "package": "kubeadm-v1.29.5", + "type": "tarball", + "url": "https://dl.k8s.io/release/v1.29.5/bin/linux/amd64/kubeadm" + }, + { + "package": "calicoctl-v3.27.3", + 
"type": "tarball", + "url": "https://github.com/projectcalico/calico/releases/download/v3.27.3/calicoctl-linux-amd64" + }, + { + "package": "calicocrds-v3.27.3", + "type": "tarball", + "url": "https://github.com/projectcalico/calico/archive/v3.27.3.tar.gz" + }, + { + "package": "cri-tools-v1.27.1", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.27.1/crictl-v1.27.1-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.28.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.28.0/crictl-v1.28.0-linux-amd64.tar.gz" + }, + { + "package": "cri-tools-v1.29.0", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.29.0/crictl-v1.29.0-linux-amd64.tar.gz" + }, + { + "package": "etcd-v3.5.12", + "type": "tarball", + "url": "https://github.com/etcd-io/etcd/releases/download/v3.5.12/etcd-v3.5.12-linux-amd64.tar.gz" + }, + { + "package": "cni-plugins-v1.3.0", + "type": "tarball", + "url": "https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz" + }, + { + "package": "nerdctl-v1.7.4", + "type": "tarball", + "url": "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + }, + { + "package": "containerd-1.7.16", + "type": "tarball", + "url": "https://github.com/containerd/containerd/releases/download/v1.7.16/containerd-1.7.16-linux-amd64.tar.gz" + }, + { + "package": "helm-v3.14.2", + "type": "tarball", + "url": "https://get.helm.sh/helm-v3.14.2-linux-amd64.tar.gz" + }, + { + "package": "nfs-subdir-external-provisioner-4.0.18", + "type": "tarball", + "url": "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz" + }, + { + "package": "docker.io/library/nginx", + "tag": "1.25.2-alpine", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.10.1", + "type": "image" + }, + { + "package": "registry.k8s.io/cpa/cluster-proportional-autoscaler", + "tag": "v1.8.8", + "type": "image" + }, + { + "package": "registry.k8s.io/dns/k8s-dns-node-cache", + "tag": "1.22.28", + "type": "image" + }, + { + "package": "registry.k8s.io/coredns/coredns", + "tag": "v1.11.1", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-apiserver", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-controller-manager", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-proxy", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.27.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.28.0", + "type": "image" + }, + { + "package": "registry.k8s.io/kube-scheduler", + "tag": "v1.29.5", + "type": "image" + }, + { + "package": 
"registry.k8s.io/pause", + "tag": "3.9", + "type": "image" + }, + { + "package": "quay.io/coreos/etcd", + "tag": "v3.5.12", + "type": "image" + }, + { + "package": "quay.io/calico/cni", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/kube-controllers", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/pod2daemon-flexvol", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "quay.io/calico/node", + "tag": "v3.27.3", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel-cni-plugin", + "tag": "v1.1.2", + "type": "image" + }, + { + "package": "docker.io/flannel/flannel", + "tag": "v0.22.0", + "type": "image" + }, + { + "package": "quay.io/metallb/speaker", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "quay.io/metallb/controller", + "tag": "v0.13.9", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/dashboard", + "tag": "v2.7.0", + "type": "image" + }, + { + "package": "docker.io/kubernetesui/metrics-scraper", + "tag": "v1.0.8", + "type": "image" + }, + { + "package": "docker.io/grafana/grafana-enterprise", + "tag": "8.3.2", + "type": "image" + } + ] + } + } + diff --git a/upgrade/roles/upgrade_k8s_oim/tasks/download_k8s_pkgs.yml b/upgrade/roles/upgrade_k8s_oim/tasks/download_k8s_pkgs.yml new file mode 100644 index 000000000..8b5901ed8 --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/tasks/download_k8s_pkgs.yml @@ -0,0 +1,82 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +- name: Take backup of original telemetry.json file - Ubuntu/Rocky + ansible.builtin.copy: + src: "{{ telemetry_json_path_ubuntu_rocky }}" + dest: "{{ telemetry_backup_path }}" + mode: "{{ file_permission }}" + when: + - oim_os in oim_os_ubuntu or + oim_os in oim_os_rocky + +- name: Take backup of original telemetry.json file - RHEL + ansible.builtin.copy: + src: "{{ telemetry_json_path_rhel }}" + dest: "{{ telemetry_backup_path }}" + mode: "{{ file_permission }}" + when: oim_os in oim_os_as_rhel + +- name: Place modified telemetry.json in input folder - ubuntu + ansible.builtin.copy: + src: "{{ upgrade_telemetry_json_path_ubuntu }}" + dest: "{{ telemetry_json_path_ubuntu_rocky }}" + mode: "{{ file_permission }}" + when: + - oim_os in oim_os_ubuntu + +- name: Place modified telemetry.json in input folder - rocky + ansible.builtin.copy: + src: "{{ upgrade_telemetry_json_path_rocky }}" + dest: "{{ telemetry_json_path_ubuntu_rocky }}" + mode: "{{ file_permission }}" + when: + - oim_os in oim_os_rocky + +- name: Place modified telemetry.json in input folder - rhel + ansible.builtin.copy: + src: "{{ upgrade_telemetry_json_path_rhel }}" + dest: "{{ telemetry_json_path_rhel }}" + mode: "{{ file_permission }}" + when: oim_os in oim_os_as_rhel + +- name: Download k8s packages for upgrade (This task may take 10-15 mins to complete) + ansible.builtin.command: "{{ run_local_repo }}" + register: result + no_log: true + until: result.rc == 0 + retries: 3 + delay: 5 + changed_when: false + +- name: Place back original telemetry.json file - Ubuntu/Rocky + ansible.builtin.copy: + src: "{{ original_telemetry_bk }}" + dest: "{{ telemetry_json_path_ubuntu_rocky }}" + mode: "{{ file_permission }}" + when: + - oim_os in oim_os_ubuntu or + oim_os in oim_os_rocky + +- name: Place back original telemetry.json file - RHEL + ansible.builtin.copy: + src: "{{ original_telemetry_bk }}" + dest: "{{ telemetry_json_path_rhel }}" + mode: "{{ file_permission }}" + when: oim_os in oim_os_as_rhel + +- name: Delete backup + ansible.builtin.file: + path: "{{ original_telemetry_bk }}" + state: absent diff --git a/upgrade/roles/upgrade_k8s_oim/tasks/k8s_check.yml b/upgrade/roles/upgrade_k8s_oim/tasks/k8s_check.yml new file mode 100644 index 000000000..5c3e95880 --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/tasks/k8s_check.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
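+#
+# The detection below can be reproduced by hand (illustrative commands; the
+# v1.26.12 baseline is the Kubernetes version shipped with the previous OIM
+# stack):
+#
+#   kubectl get pod -A                                  # is k8s responding?
+#   kubectl get nodes -o wide | awk 'NR==2 {print $5}'  # e.g. "v1.26.12"
+#   cat files/track_upgrade.txt                         # hops already applied
+#
+# A track file that exists but does not yet contain "v1.29.5" marks an
+# interrupted upgrade, so k8s_oim_installed is set and the sequence resumes.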
+---
+- name: Set k8s_oim_installed to false by default
+  ansible.builtin.set_fact:
+    k8s_oim_installed: false
+
+- name: Check if Kubernetes is installed on the Omnia Infrastructure Manager
+  ansible.builtin.command: kubectl get pod -A
+  changed_when: false
+  register: kubectl_command_status
+  failed_when: false
+
+- name: Check Kubernetes version installed on Omnia Infrastructure Manager
+  ansible.builtin.shell: |
+    set -o pipefail;
+    kubectl get nodes -o wide | awk 'NR==2 {print $5}'
+  register: kubernetes_version
+  changed_when: false
+  when: kubectl_command_status.rc == 0
+
+- name: Check if track_upgrade.txt exists
+  ansible.builtin.stat:
+    path: "{{ track_file_path }}"
+  register: file_status
+
+- name: Set k8s_oim_installed to true if k8s v1.26.12 is installed or a previous upgrade is incomplete
+  ansible.builtin.set_fact:
+    k8s_oim_installed: true
+  when: (kubectl_command_status.rc == 0 and "v1.26.12" in kubernetes_version.stdout) or (file_status.stat.exists and "v1.29.5" not in kubernetes_version.stdout)
diff --git a/upgrade/roles/upgrade_k8s_oim/tasks/main.yml b/upgrade/roles/upgrade_k8s_oim/tasks/main.yml
new file mode 100644
index 000000000..e9ff00850
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s_oim/tasks/main.yml
@@ -0,0 +1,93 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
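+#
+# Vault round-trip (illustrative): telemetry_config.yml counts as encrypted
+# when its content carries an ansible-vault header, e.g.
+#
+#   $ANSIBLE_VAULT;1.1;AES256
+#
+# If present, the file is decrypted with the key in input/.telemetry_vault_key
+# before its vars are read; otherwise a fresh random key is generated and
+# saved. Either way the file is re-encrypted before the upgrade proceeds.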
+---
+- name: Include vars from upgrade_config.yml
+  ansible.builtin.include_vars: "{{ upgrade_config_path }}"
+  register: include_upgrade_config
+  no_log: true
+
+- name: Include vars from current installed software_config.json
+  ansible.builtin.include_vars: "{{ sw_json_path }}"
+  register: include_sw_json
+  no_log: true
+
+- name: Include vars from local_repo_access.yml
+  ansible.builtin.include_vars:
+    file: "{{ access_file_path }}"
+  no_log: true
+
+- name: Check if telemetry_config.yml is encrypted
+  ansible.builtin.command: cat {{ telemetry_config_file }}
+  changed_when: false
+  register: config_content
+  no_log: true
+
+- name: Decrypt telemetry_config.yml
+  ansible.builtin.command: >-
+    ansible-vault decrypt {{ telemetry_config_file }}
+    --vault-password-file {{ telemetry_vault_filename }}
+  changed_when: false
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+
+- name: Include variable file telemetry_config.yml
+  block:
+    - name: Include variable file telemetry_config.yml
+      ansible.builtin.include_vars: "{{ telemetry_config_file }}"
+      register: include_telemetry_config
+      no_log: true
+  rescue:
+    - name: Failed to include telemetry_config.yml
+      ansible.builtin.fail:
+        msg: "{{ telemetry_config_syntax_fail_msg }} Error: {{ include_telemetry_config.message }}"
+
+- name: Create ansible vault key
+  ansible.builtin.set_fact:
+    vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}"
+  when: "'$ANSIBLE_VAULT;' not in config_content.stdout"
+
+- name: Save vault key
+  ansible.builtin.lineinfile:
+    path: "{{ telemetry_vault_filename }}"
+    line: "{{ vault_key }}"
+    mode: "{{ vault_file_perm }}"
+    owner: root
+    create: true
+  when: "'$ANSIBLE_VAULT;' not in config_content.stdout"
+
+- name: Encrypt telemetry_config.yml file
+  ansible.builtin.command: >-
+    ansible-vault encrypt {{ telemetry_config_file }}
+    --vault-password-file {{ telemetry_vault_filename }}
+  changed_when: false
+
+- name: Check Kubernetes installation status on the Omnia Infrastructure Manager
+  ansible.builtin.include_tasks: k8s_check.yml
+
+- name: Upgrade K8s on the Omnia Infrastructure Manager
+  when: k8s_oim_installed
+  block:
+    - name: Modify software_config.json for package download
+      ansible.builtin.include_tasks: modify_sw_json.yml
+
+    - name: Download packages for K8s upgrade
+      ansible.builtin.include_tasks: download_k8s_pkgs.yml
+
+    - name: Clean up metallb
+      ansible.builtin.include_tasks: metallb_cleanup.yml
+
+    - name: Upgrade Kubernetes on the Omnia Infrastructure Manager
+      ansible.builtin.include_tasks: pre_upgrade.yml
+
+    - name: Restore metallb
+      ansible.builtin.include_tasks: restore_metallb.yml
diff --git a/upgrade/roles/upgrade_k8s_oim/tasks/metallb_cleanup.yml b/upgrade/roles/upgrade_k8s_oim/tasks/metallb_cleanup.yml
new file mode 100644
index 000000000..aa79d3a2f
--- /dev/null
+++ b/upgrade/roles/upgrade_k8s_oim/tasks/metallb_cleanup.yml
@@ -0,0 +1,48 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
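+#
+# The service-type flip below is equivalent to (illustrative; grafana shown,
+# timescaledb is handled the same way):
+#
+#   kubectl patch svc grafana -n <namespace> -p '{"spec":{"type":"ClusterIP"}}'
+#
+# Dropping the LoadBalancer type first lets the metallb resources be deleted
+# cleanly before kubespray upgrades the cluster.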
+--- +- name: Get info for all Kubernetes services + kubernetes.core.k8s_info: + kind: Service + register: svc_info + +- name: Change svc type of LoadBalancer svcs to ClusterIP + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Service + metadata: + name: "{{ item.metadata.name }}" + namespace: "{{ item.metadata.namespace }}" + spec: + type: ClusterIP + loop: "{{ svc_info.resources }}" + when: item.spec.type == 'LoadBalancer' and item.metadata.name in ['grafana', 'timescaledb'] + no_log: true + +- name: Delete metallb pools.yaml + kubernetes.core.k8s: + state: absent + src: "{{ metallb_pools_path }}" + +- name: Delete metallb layer2.yaml + kubernetes.core.k8s: + state: absent + src: "{{ metallb_layer2_path }}" + +- name: Delete metallb CRD + kubernetes.core.k8s: + state: absent + src: "{{ metallb_crd_path }}" diff --git a/upgrade/roles/validate_omnia_version/tasks/main.yml b/upgrade/roles/upgrade_k8s_oim/tasks/modify_sw_json.yml similarity index 50% rename from upgrade/roles/validate_omnia_version/tasks/main.yml rename to upgrade/roles/upgrade_k8s_oim/tasks/modify_sw_json.yml index 8cae3f59e..eacc9743a 100644 --- a/upgrade/roles/validate_omnia_version/tasks/main.yml +++ b/upgrade/roles/upgrade_k8s_oim/tasks/modify_sw_json.yml @@ -12,21 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. --- - -- name: Read file path parameters from upgrade_config.yml - ansible.builtin.include_vars: - file: upgrade_config.yml - changed_when: false - -- name: Set fact for upgrade_status +- name: Get OS distribution and version ansible.builtin.set_fact: - upgrade_status: false + oim_os: "{{ ansible_distribution | lower }}" + oim_os_version: "{{ ansible_distribution_version | lower }}" -- name: Extract omnia_version +- name: Set oim_os in case of rhel ansible.builtin.set_fact: - old_omnia_version: "{{ lookup('pipe', 'cat {{ old_input_location }}/../.metadata/omnia_version') | regex_search('omnia_version: (.*)', '\\1') | join('') | trim }}" # noqa:yaml[line-length] + oim_os: "rhel" + when: oim_os in oim_os_rhel -- name: Set upgrade_status based on old_omnia_version +- name: Set target k8s version ansible.builtin.set_fact: - upgrade_status: true - when: "old_omnia_version is version('1.6', '<')" + k8s_version: "v1.29.5" + +- name: Check if metadata file exists + ansible.builtin.stat: + path: "{{ metadata_file_path }}" + register: metadata_file + +- name: Include vars from metadata file + ansible.builtin.include_vars: + file: "{{ metadata_file_path }}" + when: metadata_file.stat.exists + +- name: Place upgrade_software_config.json in input folder + ansible.builtin.template: + src: "{{ sw_config_template_path }}" + dest: "{{ input_dir_path }}" + mode: "{{ file_permission }}" diff --git a/upgrade/roles/upgrade_k8s_oim/tasks/pre_upgrade.yml b/upgrade/roles/upgrade_k8s_oim/tasks/pre_upgrade.yml new file mode 100644 index 000000000..f83e7e2cc --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/tasks/pre_upgrade.yml @@ -0,0 +1,110 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Create kubespray directory "{{ kubespray_path }}" + ansible.builtin.file: + path: "{{ kubespray_path }}" + state: directory + mode: "{{ file_permission }}" + +- name: Get kubespray git repo + block: + - name: Get kubespray git repo tarball + ansible.builtin.get_url: + url: "{{ offline_git_path }}/kubespray-v2.25.0.tar.gz" + dest: "{{ kubespray_path }}" + mode: "{{ file_permission }}" + failed_when: false + rescue: + - name: Fail if unable to get kubespray tar file from local repo + ansible.builtin.fail: + msg: "{{ fail_msg_kubespray_not_found }}" + +- name: Untar kubespray git repo + ansible.builtin.unarchive: + src: "{{ kubespray_path }}/kubespray-v2.25.0.tar.gz" + dest: "{{ kubespray_path }}" + +- name: Add collections path in kubespray ansible.cfg + community.general.ini_file: + path: "{{ kubespray_path }}/kubespray-v2.25.0/ansible.cfg" + section: defaults + option: collections_path + value: "$VIRTUAL_ENV" + mode: "{{ file_permission }}" + backup: true + +- name: Set kube_version_var dynamically + ansible.builtin.set_fact: + kube_version_var: "kube_version_list_{{ k8s_version | replace('.', '_') }}" + +- name: Set the version list variable dynamically + ansible.builtin.set_fact: + version_list_to_use: "{{ lookup('vars', kube_version_var) }}" + +- name: Gather all IP addresses + ansible.builtin.command: ip -4 addr show + register: ip_output + changed_when: false + +- name: Extract IP addresses + ansible.builtin.set_fact: + oim_ip_addresses: "{{ ip_output.stdout | regex_findall('inet\\s([0-9.]+)') }}" + +- name: Check if /usr/bin/crictl exists + ansible.builtin.stat: + path: "{{ crictl_dest }}/crictl" + register: crictl_status + +- name: If crictl binary exists copy to /usr/local/bin + ansible.builtin.copy: + src: "{{ crictl_dest }}/{{ item }}" + dest: "{{ ctr_pkg_dest }}/{{ item }}" + mode: "{{ copy_permission }}" + loop: "{{ binary_package_list }}" + when: crictl_status.stat.exists + loop_control: + loop_var: item + +- name: Copy crictl package to /usr/bin + ansible.builtin.copy: + src: "{{ crictl_src }}" + dest: "{{ crictl_dest }}" + mode: "{{ copy_permission }}" + +- name: Create track_upgrade.txt file + ansible.builtin.file: + path: "{{ track_file_path }}" + mode: "{{ file_permission }}" + state: touch + failed_when: false + +- name: Render vars file - k8s_var.yml for kube version + ansible.builtin.template: + src: "{{ k8s_var_src }}" + dest: "{{ k8s_var_dest }}" + mode: "{{ file_permission }}" + with_items: "{{ version_list_to_use }}" + loop_control: + loop_var: kube_version + +- name: SEQUENTIAL KUBERNETES UPGRADE ON Omnia Infrastructure Manager + ansible.builtin.debug: + msg: "{{ user_msg_upgrade.split('\n') }}" + +- name: K8s upgrade tasks + ansible.builtin.include_tasks: upgrade_k8s.yml + loop: "{{ version_list_to_use }}" + loop_control: + loop_var: kubernetes_version diff --git a/upgrade/roles/upgrade_k8s_oim/tasks/restore_metallb.yml b/upgrade/roles/upgrade_k8s_oim/tasks/restore_metallb.yml new file mode 100644 index 000000000..e3ead92d8 --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/tasks/restore_metallb.yml @@ -0,0 +1,80 @@ +# Copyright 2024 Dell 
Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +- name: Re-apply metallb CRD + kubernetes.core.k8s: + state: present + src: "{{ metallb_crd_path }}" + +- name: Get info for all Kubernetes services + kubernetes.core.k8s_info: + kind: Service + register: svc_info + +- name: Change svc type back to LoadBalancer + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Service + metadata: + name: "{{ svc_item.metadata.name }}" + namespace: "{{ svc_item.metadata.namespace }}" + spec: + type: LoadBalancer + loop: "{{ svc_info.resources }}" + loop_control: + loop_var: svc_item + when: svc_item.spec.type == 'ClusterIP' and svc_item.metadata.name in ['grafana', 'timescaledb'] + +- name: Re-apply metallb pools.yaml + kubernetes.core.k8s: + state: present + src: "{{ metallb_pools_path }}" + register: metallb_result + retries: 5 + delay: 10 + until: metallb_result is not failed + +- name: Re-apply metallb layer2.yaml + kubernetes.core.k8s: + state: present + src: "{{ metallb_layer2_path }}" + +- name: Delete track_upgrade.txt file after successful upgrade + ansible.builtin.file: + path: "{{ track_file_path }}" + state: absent + +- name: Copy binaries to /usr/local/bin + ansible.builtin.copy: + src: "{{ bin_path }}/{{ item }}" + dest: "{{ local_bin_path }}/{{ item }}" + mode: "{{ copy_permission }}" + loop: "{{ binary_list }}" + +- name: Omnia Infrastructure Manager UPGRADE COMPLETE !! + ansible.builtin.debug: + msg: "{{ upgrade_success_msg }}" + +- name: Get pod status + ansible.builtin.command: "kubectl get pods -A" + register: pods_op + changed_when: false + +- name: Print pod status + ansible.builtin.debug: + msg: | + The status of all pods is as below: + {{ pods_op.stdout }} diff --git a/upgrade/roles/upgrade_k8s_oim/tasks/upgrade_k8s.yml b/upgrade/roles/upgrade_k8s_oim/tasks/upgrade_k8s.yml new file mode 100644 index 000000000..5d02bde15 --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/tasks/upgrade_k8s.yml @@ -0,0 +1,122 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
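+#
+# This file runs once per hop of the sequential upgrade (v1.27.0 -> v1.28.0 ->
+# v1.29.5). Each hop consumes its rendered vars file, e.g.
+# files/k8s_var_v1.27.0.yml, so the effective kubespray invocation is roughly:
+#
+#   ansible-playbook upgrade-cluster.yml -i <k8s_inv.ini> \
+#     --extra-vars "@files/k8s_var_v1.27.0.yml"
+#
+# Completed hops are appended to track_upgrade.txt so a re-run skips them.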
+--- +- name: Wait for all pods to be Running before upgrade + ansible.builtin.shell: | + set -o pipefail; + kubectl get pods --all-namespaces | grep -v "Running" | awk 'NR>1 {print $2}' | wc -l + register: pod_count + until: pod_count.stdout == "0" + retries: 30 + delay: 10 + changed_when: false + +- name: Check if track_upgrade.txt contains kubernetes_version + ansible.builtin.command: "grep '{{ kubernetes_version }}' {{ track_file_path }}" + register: track_file_check + no_log: true + ignore_errors: true + changed_when: false + +- name: Upgrade K8s version on Omnia Infrastructure Manager, if not already upgraded, to {{ kubernetes_version }} + block: + - name: K8s upgrade in progress (This process may take 10-15 mins to complete) + ansible.builtin.command: > + ansible-playbook {{ kubespray_path }}/kubespray-v2.25.0/upgrade-cluster.yml \ + -i "{{ inv_path }}" \ + --extra-vars "@{{ k8s_var_dest }}" + register: kubespray_results + args: + chdir: "{{ kubespray_path }}/kubespray-v2.25.0" + vars: + kube_version: "{{ kubernetes_version }}" + when: track_file_check.rc != 0 + changed_when: false + + rescue: + - name: Remove track_upgrade.txt if the upgrade failed + ansible.builtin.file: + path: "{{ track_file_path }}" + state: absent + + - name: Fail the playbook due to upgrade failure + ansible.builtin.fail: + msg: "{{ k8s_upgrade_failed_msg }}" + +- name: Set kube_version variable after upgrade + ansible.builtin.set_fact: + upgraded_version: "{{ kubernetes_version }}" + when: (kubespray_results is defined and kubespray_results.rc | default(-1) == 0) + +- name: Add kubernetes_version to track_upgrade.txt file + ansible.builtin.lineinfile: + path: "{{ track_file_path }}" + line: "{{ kubernetes_version }}" + mode: "{{ file_permission }}" + create: true + +- name: Wait for older dns-autoscaler pod to be pending + when: upgraded_version is defined and upgraded_version == 'v1.27.0' + block: + - name: Get old dns-autoscaler pod + ansible.builtin.command: > + kubectl get pods -n kube-system -l k8s-app=dns-autoscaler + --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[0].metadata.name}' + register: oldest_dns_autoscaler_pod + changed_when: false + + - name: Get new dns-autoscaler pod + ansible.builtin.command: > + kubectl get pods -n kube-system -l k8s-app=dns-autoscaler + --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1].metadata.name}' + register: newest_dns_autoscaler_pod + changed_when: false + + - name: Check new dns-autoscaler pod status + ansible.builtin.command: > + kubectl get pod {{ newest_dns_autoscaler_pod.stdout }} -n kube-system -o jsonpath='{.status.phase}' + register: new_dns_autoscaler_status + until: new_dns_autoscaler_status.stdout == "Pending" + retries: 10 + delay: 10 + changed_when: false + + - name: Check old dns-autoscaler pod status + ansible.builtin.command: > + kubectl get pod {{ oldest_dns_autoscaler_pod.stdout }} -n kube-system -o jsonpath='{.status.phase}' + register: old_dns_autoscaler_status + until: old_dns_autoscaler_status.stdout == "Running" + retries: 10 + delay: 10 + changed_when: false + + - name: Wait for all other pods to be running + ansible.builtin.shell: | + set -o pipefail; + kubectl get pods --all-namespaces --field-selector=status.phase!=Running | \ + grep -v "{{ newest_dns_autoscaler_pod.stdout }}" | \ + awk 'NR>1 {print $2}' | wc -l + register: remaining_pod_count + retries: 30 + delay: 10 + until: remaining_pod_count.stdout == "0" + changed_when: false + + - name: Delete the older dns-autoscaler pod + ansible.builtin.command: > + 
kubectl delete pod {{ oldest_dns_autoscaler_pod.stdout }} -n kube-system + when: + - old_dns_autoscaler_status.stdout == "Running" + - remaining_pod_count.stdout | int == 0 + changed_when: false diff --git a/upgrade/roles/upgrade_k8s_oim/templates/k8s_var.j2 b/upgrade/roles/upgrade_k8s_oim/templates/k8s_var.j2 new file mode 100644 index 000000000..83f02943a --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/templates/k8s_var.j2 @@ -0,0 +1,38 @@ +kube_version: "{{ kube_version }}" +deploy_container_engine: false +dashboard_enabled: false +helm_enabled: true +kube_network_plugin: "{{ hostvars['localhost']['k8s_cni'] }}" +kube_service_addresses: "{{ hostvars['localhost']['k8s_service_addresses'] }}" +kube_pods_subnet: "{{ hostvars['localhost']['k8s_pod_network_cidr'] }}" +metallb_enabled: false +metallb_speaker_enabled: false +metallb_namespace: "metallb-system" +kube_proxy_strict_arp: true +kube_proxy_mode: 'iptables' +metallb_config: + address_pools: + primary: + ip_range: + - "{{ hostvars['localhost']['pod_external_ip_range'] }}" + auto_assign: true + layer2: + - primary +override_system_hostname: false +populate_inventory_to_hosts_file: false +enable_nodelocaldns: false +unsafe_show_logs: true +kube_image_repo: "registry.k8s.io" +docker_image_repo: "docker.io" +quay_image_repo: "quay.io" +kubeadm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubeadm-{{ kube_version }}.tar.gz" +kubectl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubectl-{{ kube_version }}.tar.gz" +kubelet_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/kubelet-{{ kube_version }}.tar.gz" +calicoctl_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicoctl-v3.27.3.tar.gz" +calico_crds_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/calicocrds-v3.27.3.tar.gz" +cni_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/cni-plugins-v1.3.0.tar.gz" +docker_rh_repo_base_url: "" +docker_rh_repo_gpgkey: "" +etcd_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/etcd-v3.5.12.tar.gz" +helm_download_url: "{{ hostvars['localhost']['offline_tarball_path'] }}/helm-v3.14.2.tar.gz" +bin_dir: /usr/bin \ No newline at end of file diff --git a/upgrade/roles/upgrade_k8s_oim/templates/upgrade_software_config.j2 b/upgrade/roles/upgrade_k8s_oim/templates/upgrade_software_config.j2 new file mode 100644 index 000000000..b19490557 --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/templates/upgrade_software_config.j2 @@ -0,0 +1,8 @@ +{ + "cluster_os_type": "{{ oim_os }}", + "cluster_os_version": "{{ oim_os_version }}", + "repo_config": "{{ md_repo_config }}", + "softwares": [ + {"name": "telemetry"} + ] +} \ No newline at end of file diff --git a/upgrade/roles/upgrade_k8s_oim/vars/main.yml b/upgrade/roles/upgrade_k8s_oim/vars/main.yml new file mode 100644 index 000000000..3a0c1619a --- /dev/null +++ b/upgrade/roles/upgrade_k8s_oim/vars/main.yml @@ -0,0 +1,92 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# main.yml
+telemetry_config_file: "{{ installed_omnia_path }}/input/telemetry_config.yml"
+telemetry_vault_filename: "{{ installed_omnia_path }}/input/.telemetry_vault_key"
+telemetry_config_syntax_fail_msg: "Failed. Syntax errors are present in telemetry_config.yml. Fix the errors and re-run the playbook."
+vault_file_perm: '0644'
+
+# modify_sw_json.yml
+upgrade_config_path: "{{ role_path }}/../../upgrade_config.yml"
+metadata_file_path: "/opt/omnia/offline/.data/metadata.yml"
+sw_json_path: "{{ installed_omnia_path }}/input/software_config.json"
+sw_config_template_path: "../templates/upgrade_software_config.j2"
+input_dir_path: "{{ role_path }}/../../../input/upgrade_software_config.json"
+file_permission: "0644"
+
+# download_k8s_pkgs.yml
+upgrade_telemetry_json_path_ubuntu: "../files/{{ oim_os }}/{{ oim_os_version }}/telemetry_{{ k8s_version }}.json"
+upgrade_telemetry_json_path_rocky: "../files/{{ oim_os }}/telemetry_{{ k8s_version }}.json"
+upgrade_telemetry_json_path_rhel: "../files/rhel/telemetry_{{ k8s_version }}.json"
+telemetry_json_path_ubuntu_rocky: "{{ role_path }}/../../../input/config/{{ oim_os }}/{{ oim_os_version }}/telemetry.json"
+telemetry_json_path_rhel: "{{ role_path }}/../../../input/config/rhel/{{ oim_os_version }}/telemetry.json"
+oim_os_ubuntu: "ubuntu"
+oim_os_rocky: "rocky"
+oim_os_rhel: "redhat"
+telemetry_backup_path: "../files/"
+original_telemetry_bk: "../files/telemetry.json"
+oim_os_as_rhel: "rhel"
+run_local_repo: "ansible-playbook {{ role_path }}/../../../local_repo/local_repo.yml -e sw_config_json_path={{ input_dir_path }}"
+
+# metallb_cleanup.yml
+metallb_pools_path: "/etc/kubernetes/pools.yaml"
+metallb_layer2_path: "/etc/kubernetes/layer2.yaml"
+metallb_crd_path: "/etc/kubernetes/metallb.yaml"
+ctr_pkg_src: "/usr/bin/ctr"
+ctr_pkg_dest: "/usr/local/bin"
+copy_permission: "0755"
+
+# upgrade_k8s.yml
+access_file_path: "/opt/omnia/offline/local_repo_access.yml"
+telemetry_config_path: "{{ installed_omnia_path }}/input/telemetry_config.yml"
+kubespray_path: "/opt/omnia/kubespray"
+crictl_src: "/usr/local/bin/crictl"
+crictl_dest: "/usr/bin"
+fail_msg_kubespray_not_found: "Kubespray git tar file not found in local repo."
+inventory_grps:
+  - 'kube_control_plane'
+  - 'kube_node'
+  - 'etcd'
+  - 'k8s_cluster'
+kube_version_list_v1_29_5: # noqa: var-naming[pattern]
+  - v1.27.0
+  - v1.28.0
+  - v1.29.5
+k8s_var_src: "../templates/k8s_var.j2"
+k8s_var_dest: "{{ role_path }}/files/k8s_var_{{ kube_version }}.yml"
+bin_path: "/usr/bin"
+local_bin_path: "/usr/local/bin"
+binary_list: ['kubectl', 'kubeadm', 'kubelet']
+inv_path: "{{ installed_omnia_path }}/telemetry/roles/orchestrator/files/k8s_inv.ini"
+pods_state_fail_msg: "Not all pods are in running state. Ensure all pods are running and re-run upgrade_oim.yml"
+nodes_output_fail_msg: "Node is not in ready state. Ensure the node is healthy and re-run upgrade_oim.yml"
+track_file_path: "{{ role_path }}/files/track_upgrade.txt"
+k8s_upgrade_failed_msg: "Failed to upgrade Kubernetes on Omnia Infrastructure Manager.
Please execute upgrade/restore_oim.yml to restore Kubernetes version to v1.26.12 on Omnia Infrastructure Manager and re-run upgrade_oim.yml" # noqa: yaml[line-length] +binary_package_list: ['crictl'] + +user_msg_upgrade: | + + *************************************************************************************************************************************************** + Kubernetes Upgrade on Omnia Infrastructure Manager is sequential in nature. Upgrade will happen from v1.26.12 -> v1.27.0 -> v1.28.0 -> v1.29.5 + + The upgrade process will take approx. 15-20 minutes to complete. + **************************************************************************************************************************************************** + +upgrade_success_msg: | + + *************************************************************************************************************************************************** + Successfully upgraded Kubernetes version to v1.29.5 on the Omnia Infrastructure Manager. + **************************************************************************************************************************************************** diff --git a/upgrade/roles/update_metadata/vars/main.yml b/upgrade/roles/upgrade_local_repo/tasks/main.yml similarity index 80% rename from upgrade/roles/update_metadata/vars/main.yml rename to upgrade/roles/upgrade_local_repo/tasks/main.yml index 40920b485..362ecd469 100644 --- a/upgrade/roles/update_metadata/vars/main.yml +++ b/upgrade/roles/upgrade_local_repo/tasks/main.yml @@ -13,9 +13,5 @@ # limitations under the License. --- -# Usage: update.yml -dir_mode: "0755" - -# Usage: update_metadata.yml -metadata_yaml_file_path: "/opt/omnia/.data/metadata.yml" -file_permissions: "0644" \ No newline at end of file +- name: Include upgrade_nerdctl.yml + ansible.builtin.include_tasks: upgrade_nerdctl.yml diff --git a/upgrade/roles/upgrade_local_repo/tasks/upgrade_nerdctl.yml b/upgrade/roles/upgrade_local_repo/tasks/upgrade_nerdctl.yml new file mode 100644 index 000000000..e31f38ac8 --- /dev/null +++ b/upgrade/roles/upgrade_local_repo/tasks/upgrade_nerdctl.yml @@ -0,0 +1,69 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
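+#
+# Upgrade sketch: running containers are stopped, the old nerdctl binaries in
+# /usr/bin and /usr/local/bin are removed, and the v1.7.4 release tarball is
+# fetched and unpacked into a temp directory. A quick post-check (illustrative):
+#
+#   nerdctl --version   # expected to report 1.7.4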
+--- +- name: Stop all running containers + ansible.builtin.shell: | + for container_id in $(nerdctl ps -q); do + nerdctl stop $container_id + done + args: + executable: /bin/bash + changed_when: false + +- name: Remove old nerdctl binary + ansible.builtin.file: + path: "{{ item }}" + state: absent + changed_when: false + with_items: "{{ nerdctl.binary_files_path }}" + +- name: Remove old nerdctl binary from tmp directory + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_fileglob: + - "{{ temp_download_dir }}/*nerdctl*" + +- name: Create nerdctl temp directory + ansible.builtin.file: + path: "{{ nerdctl_directory }}" + state: directory + mode: "{{ directory_permissions }}" + +- name: Download nerdctl archive + ansible.builtin.get_url: + url: "{{ nerdctl.url }}" + dest: "{{ nerdctl.archive_dest }}" + mode: "{{ file_permission }}" + register: download_nerdctl + until: download_nerdctl is not failed + retries: "{{ max_retries }}" + +- name: Extract nerdctl archive + ansible.builtin.unarchive: + src: "{{ nerdctl.archive_dest }}" + dest: "{{ nerdctl_directory }}" + mode: "{{ file_permission }}" + +- name: Make nerdctl executable + ansible.builtin.file: + path: "{{ nerdctl.folder_dest }}" + mode: "{{ nerdctl.folder_permission }}" + +- name: Move nerdctl to system bin directory + ansible.builtin.copy: + src: "{{ nerdctl.folder_dest }}" + dest: "{{ item }}" + mode: preserve + with_items: "{{ nerdctl.executable_dest }}" diff --git a/upgrade/roles/upgrade_local_repo/vars/main.yml b/upgrade/roles/upgrade_local_repo/vars/main.yml new file mode 100644 index 000000000..1405383d8 --- /dev/null +++ b/upgrade/roles/upgrade_local_repo/vars/main.yml @@ -0,0 +1,30 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +temp_download_dir: "/tmp" +max_retries: 10 +directory_permissions: "0755" +nerdctl_directory: "{{ temp_download_dir }}/nerdctl/upgrade" +file_permission: "0644" +nerdctl: + folder_dest: "{{ nerdctl_directory }}/nerdctl" + folder_permission: "+x" + url: "https://github.com/containerd/nerdctl/releases/download/v1.7.4/nerdctl-1.7.4-linux-amd64.tar.gz" + archive_dest: "{{ nerdctl_directory }}/nerdctl-1.7.4-linux-amd64.tar.gz" + binary_files_path: + - "/usr/local/bin/nerdctl" + - "/usr/bin/nerdctl" + executable_dest: + - "/usr/local/bin/" + - "/usr/bin/" diff --git a/upgrade/roles/upgrade_oim/files/requirements_venv161.yml b/upgrade/roles/upgrade_oim/files/requirements_venv161.yml new file mode 100644 index 000000000..272c7d981 --- /dev/null +++ b/upgrade/roles/upgrade_oim/files/requirements_venv161.yml @@ -0,0 +1,23 @@ +--- +collections: + - name: ansible.utils + version: 2.5.2 + - name: community.crypto + version: 2.14.0 + - name: community.docker + version: 3.4.8 + - name: community.general + version: 4.8.7 + - name: community.grafana + version: 1.3.0 + - name: community.mysql + version: 3.7.2 + - name: community.postgresql + version: 3.5.0 + - name: dellemc.os10 + version: 1.1.1 + - name: kubernetes.core + version: 2.2.3 + - name: https://github.com/kubernetes-sigs/kubespray.git + type: git + version: v2.23.2 diff --git a/upgrade/roles/upgrade_oim/tasks/create_older_venv.yml b/upgrade/roles/upgrade_oim/tasks/create_older_venv.yml new file mode 100644 index 000000000..67fe55a62 --- /dev/null +++ b/upgrade/roles/upgrade_oim/tasks/create_older_venv.yml @@ -0,0 +1,67 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
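+#
+# The resulting venv can be exercised manually (illustrative; ansible 7.7.0
+# bundles ansible-core 2.14.x):
+#
+#   source /opt/omnia/omnia161_venv/bin/activate
+#   ansible --version
+#
+# The empty .omnia file appears to mark the venv as Omnia-managed.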
+---
+
+- name: Set up venv for the older Omnia version
+  block:
+    - name: Print Ansible distribution
+      ansible.builtin.debug:
+        msg: "{{ ansible_distribution }}"
+
+    - name: Install Python 3.9 using dnf
+      ansible.builtin.dnf:
+        name:
+          - python39
+      when: (ansible_distribution | lower) in ['redhat', 'rhel', 'rocky']
+
+    - name: Install Python 3.9 using apt
+      ansible.builtin.apt:
+        name:
+          - python3.9
+          - python3.9-venv
+      when: (ansible_distribution | lower) in ['ubuntu']
+
+    - name: Create Python venv for Omnia 1.6.1
+      ansible.builtin.pip:
+        name: ansible==7.7.0
+        virtualenv: "{{ omnia161_venv_path }}"
+        virtualenv_command: "python3.9 -m venv"
+
+    - name: Create the .omnia file in the older venv
+      ansible.builtin.file:
+        path: "{{ omnia161_venv_path }}/.omnia"
+        state: touch
+        mode: "{{ file_permission }}"
+
+    - name: Install pip packages from the requirements file
+      ansible.builtin.pip:
+        requirements: "{{ installed_omnia_path }}/prepare_cp/roles/omnia_appliance_cp/files/requirements_pip.txt"
+        virtualenv: "{{ omnia161_venv_path }}"
+        virtualenv_python: "{{ omnia161_venv_path }}/bin/python"
+
+    - name: Install additional pip modules
+      ansible.builtin.pip:
+        name: "{{ item }}"
+        virtualenv: "{{ omnia161_venv_path }}"
+        virtualenv_python: "{{ omnia161_venv_path }}/bin/python"
+      with_items: "{{ python_pip_modules }}"
+
+    - name: Install the collections
+      ansible.builtin.command: "{{ omnia161_venv_path }}/bin/ansible-galaxy collection install -r {{ role_path }}/files/requirements_venv161.yml -p {{ omnia161_venv_path }}" # noqa: yaml[line-length]
+      changed_when: false
+
+  rescue:
+    - name: Venv creation failed
+      ansible.builtin.debug:
+        msg: "{{ venv_creation_failed }}"
diff --git a/upgrade/roles/upgrade_oim/tasks/main.yml b/upgrade/roles/upgrade_oim/tasks/main.yml
new file mode 100644
index 000000000..e11e891c4
--- /dev/null
+++ b/upgrade/roles/upgrade_oim/tasks/main.yml
@@ -0,0 +1,72 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+- name: Backup DB and Kubernetes cluster on Omnia Infrastructure Manager
+  ansible.builtin.include_role:
+    name: backup_k8s
+
+- name: Backup omnia_inventory files
+  ansible.builtin.copy:
+    src: "{{ omnia_inventory_dir }}"
+    dest: "{{ backup_location }}/omnia_inventory"
+    mode: "{{ file_permission }}"
+
+- name: Create venv for installed version
+  ansible.builtin.include_tasks: create_older_venv.yml
+  when: not older_os
+
+- name: Import input parameters from the current Omnia version to the upgrade version
+  ansible.builtin.include_role:
+    name: import_input_parameters
+
+- name: Upgrade software on Omnia Infrastructure Manager
+  block:
+    - name: Upgrade prepare_oim services and packages on the OIM
+      ansible.builtin.include_role:
+        name: upgrade_prepare_oim
+
+    - name: Upgrade discovery services and packages on the OIM
+      ansible.builtin.include_role:
+        name: upgrade_discovery
+
+    - name: Stop telemetry service
+      ansible.builtin.include_role:
+        name: omnia_telemetry
+        tasks_from: stop_omnia_telemetry.yml
+
+    - name: Upgrade packages required for local repo - nerdctl
+      ansible.builtin.include_role:
+        name: upgrade_local_repo
+
+    - name: Upgrade Kubernetes on Omnia Infrastructure Manager
+      ansible.builtin.include_role:
+        name: upgrade_k8s_oim
+      when: not older_os
+
+    - name: Upgrade omnia_telemetry binary
+      ansible.builtin.include_role:
+        name: omnia_telemetry
+
+    - name: Upgrade iDRAC telemetry role
+      ansible.builtin.include_role:
+        name: upgrade_idrac_telemetry
+
+    - name: Upgrade control_plane to oim in omniadb and xcat
+      ansible.builtin.include_role:
+        name: cp_to_oim
+
+  rescue:
+    - name: Failed to upgrade Omnia Infrastructure Manager
+      ansible.builtin.fail:
+        msg: "{{ upgrade_fail_msg }}"
diff --git a/upgrade/roles/upgrade_oim/vars/main.yml b/upgrade/roles/upgrade_oim/vars/main.yml
new file mode 100644
index 000000000..a8ecf1a17
--- /dev/null
+++ b/upgrade/roles/upgrade_oim/vars/main.yml
@@ -0,0 +1,39 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: create_older_venv.yml
+omnia161_venv_path: /opt/omnia/omnia161_venv
+venv_creation_failed: "Venv setup for Omnia 1.6.1 has failed"
+python_pip_modules:
+  - omsdk==1.2.513
+  - pysnmp==6.1.3
+  - psycopg2-binary
+  - requests
+  - pyarrow
+  - pandas
+  - passlib
+  - netaddr
+  - pexpect
+  - pyinstaller
+  - psutil
+  - commented-configparser
+  - iprange-python
+  - pyopenssl==21.0.0
+  - urllib3==1.26.5
+
+# Usage: main.yml
+omnia_inventory_dir: "/opt/omnia/omnia_inventory/"
+file_permission: "0644"
+upgrade_fail_msg: "Failed to upgrade. Execute restore_oim.yml to restore Omnia Infrastructure Manager state."
diff --git a/upgrade/roles/upgrade_omniadb/tasks/main.yml b/upgrade/roles/upgrade_omniadb/tasks/main.yml
deleted file mode 100644
index 3247f38b8..000000000
--- a/upgrade/roles/upgrade_omniadb/tasks/main.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Read parameters from config file - ansible.builtin.include_role: - name: "{{ path_to_discovery_commons }}" - tasks_from: include_provision_credentials_config.yml - -- name: Encrypt config file once validations are successful - block: - - name: Encrypt provision_config_credentials.yml - ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ provision_credentials_config_filename }} - --vault-password-file {{ provision_credentials_vault_path }} - changed_when: false - -- name: Delete and recreate OmniaDB - ansible.builtin.include_tasks: delete_and_recreate.yml - -- name: Restore OmniaDB data from backup and modify columns - ansible.builtin.include_tasks: restore_data.yml - -- name: Populate cpu data - ansible.builtin.include_tasks: populate_cpus.yml - -- name: Populate gpu data - ansible.builtin.include_tasks: populate_gpus.yml diff --git a/upgrade/roles/upgrade_omniadb/tasks/populate_cpus.yml b/upgrade/roles/upgrade_omniadb/tasks/populate_cpus.yml deleted file mode 100644 index f69b5d876..000000000 --- a/upgrade/roles/upgrade_omniadb/tasks/populate_cpus.yml +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Retrieve hosts from PostgreSQL - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: "SELECT admin_ip FROM cluster.nodeinfo" - register: admin_ips - when: db_ping_new.is_available - -- name: Store CPU data - ansible.builtin.shell: - cmd: >- - set -o pipefail && - sshpass -p '{{ provision_password }}' ssh {{ item }} - 'set -o pipefail && - intel_cpu_check=`lscpu | grep -i "^vendor ID" | grep -ic "Intel"` - amd_cpu_check=`lscpu | grep -i "^vendor ID" | grep -ic "AMD"` - cpu_count=`lscpu | grep -i "^socket" | grep -o -E "[0-9]+"` - - if [ $intel_cpu_check -gt 0 ]; then - echo "intel $cpu_count" - # Check for AMD CPU - elif [ $amd_cpu_check -gt 0 ]; then - echo "amd $cpu_count" - else - echo " $cpu_count" - fi - ' - register: cpus - loop: "{{ admin_ips.query_result | map(attribute='admin_ip') | list }}" - when: db_ping_new.is_available and item != ansible_default_ipv4.address - changed_when: true - failed_when: false - -- name: Initialize dictionary - ansible.builtin.set_fact: - cpu_dict: {} - when: db_ping_new.is_available - -- name: Build dictionary for each ip - ansible.builtin.set_fact: - cpu_dict: "{{ cpu_dict | default({}) | combine({item.item: item.stdout}) }}" - loop: "{{ cpus.results }}" - when: db_ping_new.is_available - no_log: true - -- name: Store cpu type in database - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: "UPDATE cluster.nodeinfo SET cpu = '{{ item.value.split(' ')[0] }}' WHERE admin_ip = '{{ item.key }}'" - loop: "{{ cpu_dict | dict2items }}" - when: db_ping_new.is_available - -- name: Store cpu count in database - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: "UPDATE cluster.nodeinfo SET cpu_count = '{{ item.value.split(' ')[1] }}' WHERE admin_ip = '{{ item.key }}'" - loop: "{{ cpu_dict | dict2items }}" - when: - - db_ping_new.is_available - - item.value is not none - - item.value.split(' ') | length == 2 diff --git a/upgrade/roles/upgrade_omniadb/tasks/populate_gpus.yml b/upgrade/roles/upgrade_omniadb/tasks/populate_gpus.yml deleted file mode 100644 index c28f3dbc2..000000000 --- a/upgrade/roles/upgrade_omniadb/tasks/populate_gpus.yml +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Retrieve hosts from PostgreSQL - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: "SELECT admin_ip FROM cluster.nodeinfo" - register: admin_ips - when: db_ping_new.is_available - -- name: Store GPU data - ansible.builtin.shell: - cmd: >- - set -o pipefail && - sshpass -p '{{ provision_password }}' ssh {{ item }} - 'set -o pipefail && - nvidia_gpu_count=`lshw -c display | grep -ic "vendor: NVIDIA Corporation"` - amd_display_controller_count=`lshw -c display | grep -ic "vendor: Advanced Micro Devices"` - amd_processing_accelerator_count=`lshw | grep --after-context=2 "description: Processing accelerators" | grep -c "vendor: Advanced Micro Devices"` - amd_gpu_count=$((amd_display_controller_count + amd_processing_accelerator_count)) - - if [ nvidia_gpu_count -eq 0 ]; then - echo "nvidia ${nvidia_gpu_count}"; - elif [ gpu -eq 0 ]; then - echo "amd ${amd_gpu_count}"; - else - echo " 0" - fi - ' - register: gpus - loop: "{{ admin_ips.query_result | map(attribute='admin_ip') | list }}" - when: db_ping_new.is_available and item != ansible_default_ipv4.address - changed_when: true - failed_when: false - -- name: Initialize dictionary - ansible.builtin.set_fact: - gpu_dict: {} - when: db_ping_new.is_available - -- name: Build dictionary for each ip - ansible.builtin.set_fact: - gpu_dict: "{{ gpu_dict | default({}) | combine({item.item: item.stdout}) }}" - loop: "{{ gpus.results }}" - when: db_ping_new.is_available - no_log: true - -- name: Store gpu type in database - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: "UPDATE cluster.nodeinfo SET gpu = '{{ item.value.split(' ')[0] }}' WHERE admin_ip = '{{ item.key }}'" - loop: "{{ gpu_dict | dict2items }}" - when: db_ping_new.is_available - -- name: Store gpu count in database - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: "UPDATE cluster.nodeinfo SET gpu_count = '{{ item.value.split(' ')[1] }}' WHERE admin_ip = '{{ item.key }}'" - loop: "{{ gpu_dict | dict2items }}" - when: - - db_ping_new.is_available - - item.value is not none - - item.value.split(' ') | length == 2 diff --git a/upgrade/roles/upgrade_omniadb/tasks/restore_data.yml b/upgrade/roles/upgrade_omniadb/tasks/restore_data.yml deleted file mode 100644 index e1ddee4d2..000000000 --- a/upgrade/roles/upgrade_omniadb/tasks/restore_data.yml +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -- name: Ping new omnia database - community.postgresql.postgresql_ping: - db: omniadb - login_password: "{{ postgresdb_password }}" - register: db_ping_new - -- name: Read in old omnia data - community.postgresql.postgresql_db: - db: omniadb - login_password: "{{ postgresdb_password }}" - state: restore - target: "{{ backup_location }}/backup.sql" - when: db_ping_new.is_available - -- name: Add and remove nodeinfo table schema columns - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: | - ALTER TABLE cluster.nodeinfo - DROP COLUMN IF EXISTS ib_ip, - ADD COLUMN IF NOT EXISTS discovery_mechanism VARCHAR(65), - ADD COLUMN IF NOT EXISTS cpu VARCHAR(10), - ADD COLUMN IF NOT EXISTS gpu VARCHAR(10), - ADD COLUMN IF NOT EXISTS cpu_count INTEGER, - ADD COLUMN IF NOT EXISTS gpu_count INTEGER - when: db_ping_new.is_available - -- name: Rename nodeinfo serial column - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: | - ALTER TABLE cluster.nodeinfo - RENAME COLUMN serial TO service_tag - when: db_ping_new.is_available - failed_when: false - -- name: Add nicinfo table schema - community.postgresql.postgresql_query: - db: omniadb - login_password: "{{ postgresdb_password }}" - query: | - CREATE TABLE IF NOT EXISTS cluster.nicinfo ( - ID SERIAL NOT NULL PRIMARY KEY UNIQUE, - category VARCHAR(60), - FOREIGN KEY (id) REFERENCES cluster.nodeinfo(id) ON DELETE CASCADE - ) - when: db_ping_new.is_available diff --git a/upgrade/roles/upgrade_precheck/tasks/check_mysql_data_existence.yml b/upgrade/roles/upgrade_precheck/tasks/check_mysql_data_existence.yml new file mode 100644 index 000000000..7dcfd1f3f --- /dev/null +++ b/upgrade/roles/upgrade_precheck/tasks/check_mysql_data_existence.yml @@ -0,0 +1,51 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Get MySQL Pod Name + ansible.builtin.command: kubectl get pod -n "{{ telemetry_visualizations_ns }}" -l app="mysqldb" -o jsonpath='{.items[0].metadata.name}' + register: mysql_pod_name + changed_when: false + +- name: Check whether to take a MYSQL db backup or not + block: + - name: Check if database exists + ansible.builtin.command: kubectl exec -n "{{ telemetry_visualizations_ns }}" "{{ mysql_pod_name.stdout }}" -- mysql -u "{{ mysqldb_user }}" -p"{{ mysqldb_password }}" -e "SHOW DATABASES LIKE '{{ mysqldb_name }}';" # noqa: yaml[line-length] + register: db_check_result + changed_when: false + when: mysql_pod_name is defined + + - name: Display database existence result + ansible.builtin.debug: + msg: "Database '{{ mysqldb_name }}' exists." 
+      when:
+        - db_check_result is defined
+        - db_check_result.stdout | length > 0
+
+    - name: Check if tables exist in the DB
+      ansible.builtin.command: kubectl exec -n "{{ telemetry_visualizations_ns }}" "{{ mysql_pod_name.stdout }}" -- mysql -u "{{ mysqldb_user }}" -p"{{ mysqldb_password }}" -D "{{ mysqldb_name }}" -e "SHOW TABLES;" # noqa: yaml[line-length]
+      register: show_tables_result
+      changed_when: false
+      when:
+        - db_check_result is defined
+        - db_check_result.stdout | length > 0
+
+    - name: Set facts for MySQL DB
+      when:
+        - show_tables_result.stdout_lines is defined
+        - show_tables_result.stdout_lines | length > 0
+      block:
+        - name: Set mysqldb_backup_flag
+          ansible.builtin.set_fact:
+            mysqldb_backup_flag: true
diff --git a/upgrade/roles/upgrade_precheck/tasks/check_timescaldb_existence.yml b/upgrade/roles/upgrade_precheck/tasks/check_timescaldb_existence.yml
new file mode 100644
index 000000000..9733056d6
--- /dev/null
+++ b/upgrade/roles/upgrade_precheck/tasks/check_timescaldb_existence.yml
@@ -0,0 +1,93 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Get timescaledb pod name
+  ansible.builtin.command: 'kubectl get pod -n "{{ telemetry_visualizations_ns }}" -l app="{{ timescaledb_k8s_name }}" -o jsonpath="{.items[0].metadata.name}"'
+  register: timescaledb_pod_name
+  changed_when: false
+  failed_when: false
+
+- name: Connect to TimescaleDB pod and database
+  block:
+    - name: Start a bash session in the TimescaleDB pod
+      ansible.builtin.command: kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_visualizations_ns }}" -- /bin/bash
+      register: timescaledb_bash_session
+      changed_when: false
+      ignore_errors: true
+
+    - name: Check if bash session was started
+      ansible.builtin.debug:
+        msg: "Bash session started successfully"
+      when: timescaledb_bash_session.rc == 0
+
+    - name: Get external IP of timescaledb service
+      ansible.builtin.command: kubectl get svc "{{ timescaledb_k8s_name }}" -n "{{ telemetry_visualizations_ns }}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}' # noqa: yaml[line-length]
+      register: timescaledb_service_external_ip
+      when: timescaledb_bash_session.rc == 0
+      failed_when: false
+      changed_when: false
+
+    - name: Connect to database
+      ansible.builtin.command: 'kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_visualizations_ns }}" -- psql -d "postgres://{{ timescaledb_user }}:{{ timescaledb_password }}@{{ timescaledb_service_external_ip.stdout }}:5432/{{ database_name }}"' # noqa: yaml[line-length]
+      register: timescaledb_connection_status
+      changed_when: false
+      failed_when: false
+      when:
+        - timescaledb_bash_session.rc == 0
+        - "'running' in timescaledb_pod_status.stdout | lower"
+
+    - name: Check if schemas exist
+      ansible.builtin.command: 'kubectl exec -it "{{ timescaledb_pod_name.stdout }}" -n "{{ telemetry_visualizations_ns }}" -- psql -d "postgres://{{ timescaledb_user }}:{{ timescaledb_password }}@{{ timescaledb_service_external_ip.stdout }}:5432/{{ database_name }}" -c "\dn"' # noqa: yaml[line-length]
+      register: schema_exist
+      changed_when: false
+      failed_when: false
+
+    - name: Extract schema names
+      ansible.builtin.set_fact:
+        schema_names: "{{ schema_exist.stdout_lines | map('split', '|') | map('first') | list }}"
+
+    - name: Initialize TimescaleDB schema flags
+      ansible.builtin.set_fact:
+        public_schema_flag: false
+        omnia_telemetry_schema_flag: false
+
+    - name: Set support value omnia_telemetry_support
+      ansible.builtin.set_fact:
+        omnia_telemetry_support: "{{ omnia_telemetry_support | lower }}"
+
+    - name: Check omnia telemetry schema exists
+      when: omnia_telemetry_support | bool
+      block:
+        - name: Set fact omnia_telemetry_schema_flag
+          ansible.builtin.set_fact:
+            omnia_telemetry_schema_flag: true
+          when: "'omnia_telemetry' in schema_names | map('trim')"
+
+    - name: Check public schema exists
+      when: idrac_telemetry_pod_status.rc == 0
+      block:
+        - name: Set fact public_schema_flag
+          ansible.builtin.set_fact:
+            public_schema_flag: true
+          when: "'public' in schema_names | map('trim')"
+
+    - name: Set timescaledb_backup_flag if data exists
+      when:
+        - (timescaledb_connection_status is defined) and (timescaledb_connection_status.rc == 0)
+        - public_schema_flag or omnia_telemetry_schema_flag
+      block:
+        - name: Set timescaledb_backup_flag
+          ansible.builtin.set_fact:
+            timescaledb_backup_flag: true
diff --git a/utils/roles/control_plane_cleanup/tasks/include_telemetry_config.yml b/upgrade/roles/upgrade_precheck/tasks/include_telemetry_config.yml
similarity index 97%
rename from utils/roles/control_plane_cleanup/tasks/include_telemetry_config.yml
rename to upgrade/roles/upgrade_precheck/tasks/include_telemetry_config.yml
index 4e42a6105..e1b809bb9 100644
--- a/utils/roles/control_plane_cleanup/tasks/include_telemetry_config.yml
+++ b/upgrade/roles/upgrade_precheck/tasks/include_telemetry_config.yml
@@ -1,4 +1,4 @@
-# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
+
 - name: Check telemetry_config.yml file is encrypted
   ansible.builtin.command: cat {{ telemetry_config_file }}
   changed_when: false
diff --git a/upgrade/roles/upgrade_precheck/tasks/main.yml b/upgrade/roles/upgrade_precheck/tasks/main.yml
new file mode 100644
index 000000000..2fbf9517c
--- /dev/null
+++ b/upgrade/roles/upgrade_precheck/tasks/main.yml
@@ -0,0 +1,26 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
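+# A minimal sketch of how this precheck role could be wired into a play
+# (hosts and role name assumed from the directory layout):
+#
+# - name: Run upgrade prechecks on the OIM
+#   hosts: localhost
+#   connection: local
+#   roles:
+#     - upgrade_precheck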
+---
+
+- name: Check installed omnia version and version to be upgraded
+  ansible.builtin.include_tasks: upgrade_version_check.yml
+
+- name: Check supported OS on Omnia Infrastructure Manager
+  ansible.builtin.include_tasks: upgrade_os_check.yml
+
+- name: Check local repo metadata file
+  ansible.builtin.include_tasks: precheck_local_repo_access.yml
+
+- name: Check if Kubernetes is installed and all pods are running
+  ansible.builtin.include_tasks: upgrade_precheck_k8s.yml
diff --git a/upgrade/roles/upgrade_precheck/tasks/precheck_local_repo_access.yml b/upgrade/roles/upgrade_precheck/tasks/precheck_local_repo_access.yml
new file mode 100644
index 000000000..cd9303c74
--- /dev/null
+++ b/upgrade/roles/upgrade_precheck/tasks/precheck_local_repo_access.yml
@@ -0,0 +1,42 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check local_repo_access.yml file
+  ansible.builtin.stat:
+    path: "{{ local_repo_access_dest_path }}"
+  register: check_local_repo_access
+
+- name: Fail if local_repo_access.yml does not exist
+  ansible.builtin.fail:
+    msg: "{{ local_repo_access_fail_msg }}"
+  when: not check_local_repo_access.stat.exists
+
+- name: Check offline metadata file exists
+  ansible.builtin.stat:
+    path: "{{ repo_config_metadata_file }}"
+  register: check_offline_metadata
+
+- name: Fail if offline metadata file does not exist
+  ansible.builtin.fail:
+    msg: "{{ repo_config_metadata_file_msg }}"
+  when: not check_offline_metadata.stat.exists
+
+- name: Read repo config value from metadata file
+  ansible.builtin.include_vars: "{{ repo_config_metadata_file }}"
+
+- name: Fail if repo config value is invalid
+  ansible.builtin.fail:
+    msg: "{{ incorrect_repo_config_value_msg }}"
+  when: md_repo_config is defined and md_repo_config not in ['partial', 'always', 'never']
diff --git a/upgrade/roles/upgrade_precheck/tasks/upgrade_os_check.yml b/upgrade/roles/upgrade_precheck/tasks/upgrade_os_check.yml
new file mode 100644
index 000000000..444e97b01
--- /dev/null
+++ b/upgrade/roles/upgrade_precheck/tasks/upgrade_os_check.yml
@@ -0,0 +1,59 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
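+# The checks below consult os_check_dict and older_os_dict, which are defined
+# outside this excerpt. An assumed sketch of their shape, inferred from the
+# unsupported_os and msg_supported_older_os messages (keys and values are
+# illustrative, not confirmed by this diff):
+#
+#   os_check_dict:
+#     redhat: ["8.6", "8.7", "8.8"]
+#     rocky: ["8.6", "8.7", "8.8"]
+#     ubuntu: ["20.04", "22.04"]
+#   older_os_dict:
+#     redhat: ["8.6", "8.7"]
+#     rocky: ["8.6", "8.7"]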
+---
+
+- name: Show the OS version
+  ansible.builtin.debug:
+    msg: "{{ ansible_distribution }} version {{ ansible_distribution_version }}"
+
+- name: Initialize variables
+  ansible.builtin.set_fact:
+    supported_os: false
+    older_os: false
+
+- name: Set supported OS
+  ansible.builtin.set_fact:
+    supported_os: true
+  when:
+    - (ansible_distribution | lower) in os_check_dict
+    - ansible_distribution_version in os_check_dict[(ansible_distribution | lower)] or
+      ansible_distribution_version in older_os_dict[(ansible_distribution | lower)]
+
+- name: Fail if OS is not supported
+  ansible.builtin.fail:
+    msg: "{{ unsupported_os }}"
+  when: not supported_os
+
+- name: Set fact whether it is older OS
+  ansible.builtin.set_fact:
+    older_os: true
+  when:
+    - (ansible_distribution | lower) in older_os_dict
+    - ansible_distribution_version in older_os_dict[(ansible_distribution | lower)]
+
+- name: Show support notice for RHEL/Rocky 8.6/8.7 OS
+  ansible.builtin.debug:
+    msg: "{{ msg_supported_older_os }}"
+  when: older_os
+
+- name: Get active virtual environment path
+  ansible.builtin.set_fact:
+    venv_path: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}"
+
+- name: Check activated venv for RHEL/Rocky 8.8 and Ubuntu 22.04/20.04
+  ansible.builtin.fail:
+    msg: "{{ activate_supported_venv }}"
+  when:
+    - not older_os
+    - "'omnia17_venv' not in venv_path"
diff --git a/upgrade/roles/upgrade_precheck/tasks/upgrade_precheck_k8s.yml b/upgrade/roles/upgrade_precheck/tasks/upgrade_precheck_k8s.yml
new file mode 100644
index 000000000..f2d167eb9
--- /dev/null
+++ b/upgrade/roles/upgrade_precheck/tasks/upgrade_precheck_k8s.yml
@@ -0,0 +1,244 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
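+# The pod checks below parse plain `kubectl get pods` output. An illustrative
+# (hypothetical) line that the non-running-pod grep would surface:
+#
+#   $ kubectl get pods --all-namespaces --no-headers | grep -v "Running"
+#   grafana   grafana-7b9f6d5c4-x2m8p   0/1   CrashLoopBackOff   12   40m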
+---
+
+- name: Set Kubernetes upgrade precheck flag to false
+  ansible.builtin.set_fact:
+    k8s_upgrade_precheck_flag: false
+
+- name: Set k8s_backup_status to false by default
+  ansible.builtin.set_fact:
+    k8s_backup_status: false
+
+- name: Initialize individual pod running statuses
+  ansible.builtin.set_fact:
+    upgrade_precheck_pod_status_report:
+      grafana_pod_running_status: false
+      loki_pod_running_status: false
+      mysqldb_pod_running_status: false
+      timescaledb_pod_running_status: false
+      idrac_telemetry_pod_running_status: false
+
+- name: Gather service facts
+  ansible.builtin.service_facts:
+
+- name: Check if etcd service is present in service facts
+  when: "'etcd.service' in ansible_facts.services and ansible_facts.services['etcd.service'].status == 'enabled'"
+  block:
+    - name: Fail if etcd service is not running
+      ansible.builtin.fail:
+        msg: "{{ etcd_not_running }}"
+      when: "ansible_facts.services['etcd.service'].state != 'running'"
+
+    - name: Check if Kubernetes is deployed
+      ansible.builtin.command: kubectl
+      register: k8s_return_code
+      changed_when: false
+      failed_when: false
+
+    - name: Check for k8s cluster
+      when: k8s_return_code.rc == 0
+      block:
+        - name: Check basic sanity on Kubernetes cluster
+          ansible.builtin.command: kubectl get node
+          register: k8s_sanity_return_code
+          changed_when: false
+          failed_when: false
+
+        # Fail if kubectl works but 'kubectl get node' does not: the k8s cluster has an issue.
+        - name: Upgrade precheck - Kubernetes cluster basic sanity failed
+          ansible.builtin.fail:
+            msg: "{{ error_k8s_sanity }}"
+          when: >
+            k8s_sanity_return_code.rc != 0
+
+        - name: Check for non Running pods
+          ansible.builtin.shell: >
+            set -o pipefail && \
+            kubectl get pods --all-namespaces --no-headers | grep -v "Running"
+          register: non_running_pod_list
+          failed_when: false
+          changed_when: false
+          until: non_running_pod_list.stdout_lines | length == 0
+          retries: "{{ max_attempts }}"
+          delay: "{{ wait_time }}"
+
+        - name: Upgrade precheck - Check for non running pods
+          ansible.builtin.fail:
+            msg: "{{ error_non_running_pods }} {{ non_running_pod_list.stdout_lines }}"
+          when: non_running_pod_list.stdout_lines | length > 0
+
+        # Check running status for the individual grafana, loki, iDRAC telemetry, timescaledb and mysqldb pods
+        - name: Validate in namespace {{ grafana_ns }}
+          block:
+            - name: Get grafana pod running status
+              ansible.builtin.shell: >
+                set -o pipefail && \
+                kubectl get pods -n {{ grafana_ns }} --no-headers | grep "Running" | grep {{ grafana_pod_selector }}
+              register: grafana_pod_status
+              failed_when: false
+              changed_when: false
+
+            - name: Set running status for grafana pod
+              ansible.builtin.set_fact:
+                upgrade_precheck_pod_status_report:
+                  "{{ upgrade_precheck_pod_status_report | combine({'grafana_pod_running_status': grafana_pod_status.stdout | length > 0}) }}"
+              when: grafana_pod_status.rc == 0
+
+            - name: Get loki pod running status
+              ansible.builtin.shell: >
+                set -o pipefail && \
+                kubectl get pods -n {{ grafana_ns }} --no-headers | grep "Running" | grep {{ loki_pod_selector }}
+              register: loki_pod_status
+              failed_when: false
+              changed_when: false
+
+            - name: Set running status for loki pod
+              ansible.builtin.set_fact:
+                upgrade_precheck_pod_status_report:
+                  "{{ upgrade_precheck_pod_status_report | combine({'loki_pod_running_status': loki_pod_status.stdout | length > 0}) }}"
+              when: loki_pod_status.rc == 0
+
+        - name: Validate in namespace {{ telemetry_visualizations_ns }}
+          block:
+            - name: Get mysqldb pod running status
+              ansible.builtin.shell: >
+                set -o pipefail && \
+                kubectl get pods -n {{ telemetry_visualizations_ns }} --no-headers | grep "Running" | grep {{ mysqldb_pod_selector }}
+              register: mysqldb_pod_status
+              failed_when: false
+              changed_when: false
+
+            - name: Set running status for mysqldb pod
+              ansible.builtin.set_fact:
+                upgrade_precheck_pod_status_report:
+                  "{{ upgrade_precheck_pod_status_report | combine({'mysqldb_pod_running_status': mysqldb_pod_status.stdout | length > 0}) }}"
+              when: mysqldb_pod_status.rc == 0
+
+            - name: Get timescaledb pod running status
+              ansible.builtin.shell: >
+                set -o pipefail && \
+                kubectl get pods -n {{ telemetry_visualizations_ns }} --no-headers | grep "Running" | grep {{ timescaledb_pod_selector }}
+              register: timescaledb_pod_status
+              failed_when: false
+              changed_when: false
+
+            - name: Set running status for timescaledb pod
+              ansible.builtin.set_fact:
+                upgrade_precheck_pod_status_report:
+                  "{{ upgrade_precheck_pod_status_report | combine({'timescaledb_pod_running_status': timescaledb_pod_status.stdout | length > 0}) }}"
+              when: timescaledb_pod_status.rc == 0
+
+            - name: Get iDRAC telemetry pods running status
+              ansible.builtin.shell: >
+                set -o pipefail && \
+                kubectl get pods -n {{ telemetry_visualizations_ns }} --no-headers | grep "Running" | grep {{ idrac_telemetry_pod_selector }}
+              register: idrac_telemetry_pod_status
+              failed_when: false
+              changed_when: false
+
+            - name: Set running status for iDRAC telemetry pod
+              ansible.builtin.set_fact:
+                upgrade_precheck_pod_status_report:
+                  "{{ upgrade_precheck_pod_status_report | combine({'idrac_telemetry_pod_running_status': idrac_telemetry_pod_status.stdout | length > 0}) }}"
+              when: idrac_telemetry_pod_status.rc == 0
+
+        - name: Check if Kubernetes is installed on the Omnia Infrastructure Manager
+          ansible.builtin.command: kubectl get pod -A
+          changed_when: false
+          register: kubectl_command_status
+          failed_when: false
+
+        - name: Check Kubernetes version installed on Omnia Infrastructure Manager
+          ansible.builtin.shell: |
+            set -o pipefail;
+            kubectl get nodes -o wide | awk 'NR==2 {print $5}'
+          register: kubernetes_version
+          changed_when: false
+          when: kubectl_command_status.rc == 0
+
+        - name: Set k8s_backup_status to true if k8s is installed and kube version is v1.26.12
+          ansible.builtin.set_fact:
+            k8s_backup_status: true
+          when: (kubectl_command_status.rc == 0) and (kubernetes_version is defined) and ("v1.26.12" in kubernetes_version.stdout)
+
+        - name: Include telemetry config vars
+          ansible.builtin.include_tasks: include_telemetry_config.yml
+
+        - name: Verify existence of DB data in mysql and timescaledb pod
+          when: k8s_backup_status
+          block:
+            - name: Set fact for mysqldb_backup_flag
+              ansible.builtin.set_fact:
+                mysqldb_backup_flag: false
+
+            - name: Set fact timescaledb_backup_flag
+              ansible.builtin.set_fact:
+                timescaledb_backup_flag: false
+
+            - name: Enable the backup flag based on the existence of the data in MySQL DB
+              ansible.builtin.import_tasks: check_mysql_data_existence.yml
+              when: mysqldb_pod_status.rc == 0
+
+            - name: Enable the backup flag based on the existence of the timescale DB
+              ansible.builtin.import_tasks: check_timescaldb_existence.yml
+              when: timescaledb_pod_status.rc == 0
+
+            - name: Warning - telemetry DBs have no data
+              ansible.builtin.pause:
+                prompt: "{{ backup_warning_msg }}"
+                seconds: "{{ warning_wait_time }}"
+              when: timescaledb_backup_flag is false and mysqldb_backup_flag is false
+
+        # Check for services
+        - name: Get all LoadBalancer services details
+          ansible.builtin.command: >
+            kubectl get svc --all-namespaces -o \
+            jsonpath="{range .items[?(@.spec.type=='LoadBalancer')]}{.metadata.namespace}:{.metadata.name}:{.status.loadBalancer.ingress[*].ip}{'\n'}{end}"
+          register: lb_services
+          failed_when: false
+          changed_when: false
+
+        - name: Check for LoadBalancer services without external IPs assigned
+          ansible.builtin.set_fact:
+            improper_services: "{{ lb_services.stdout_lines | select('search', ':$') | list }}"
+
+        - name: Upgrade precheck - Check for LoadBalancer services without IPs assigned
+          ansible.builtin.fail:
+            msg: "{{ error_improper_service }} {{ improper_services }}"
+          when: >
+            improper_services | length > 0
+
+        # Check for PVCs
+        - name: Get all PVCs
+          ansible.builtin.command: >
+            kubectl get pvc --all-namespaces --no-headers -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,STATUS:.status.phase"
+          register: all_pvcs
+          failed_when: false
+          changed_when: false
+
+        - name: Check for PVC not in bound state
+          ansible.builtin.set_fact:
+            non_bound_pvc: "{{ all_pvcs.stdout_lines | select('search', 'Pending|Lost') | list }}"
+
+        - name: Upgrade precheck - Check for unbound PVC issue
+          ansible.builtin.fail:
+            msg: "{{ error_unbound_pvc }} {{ non_bound_pvc }}"
+          when: >
+            non_bound_pvc | length > 0
+
+        # Reaching this step indicates the precheck passed, so the Kubernetes upgrade precheck flag can be set to true.
+        - name: Set Kubernetes upgrade precheck flag to true based on report
+          ansible.builtin.set_fact:
+            k8s_upgrade_precheck_flag: true
diff --git a/upgrade/roles/upgrade_precheck/tasks/upgrade_version_check.yml b/upgrade/roles/upgrade_precheck/tasks/upgrade_version_check.yml
new file mode 100644
index 000000000..e0d44871e
--- /dev/null
+++ b/upgrade/roles/upgrade_precheck/tasks/upgrade_version_check.yml
@@ -0,0 +1,77 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
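+# The version gating below relies on Jinja2's `version` test rather than
+# lexicographic string comparison; a minimal sketch of its semantics:
+#
+#   {{ '1.7' is version('1.6.1', '>=') }}    -> True
+#   {{ '1.6' is version('1.6.1', '>=') }}    -> False
+#   {{ '1.6.1' is version('1.6.1', '>=') }}  -> True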
+---
+
+- name: Check if omnia version file is present
+  ansible.builtin.stat:
+    path: "{{ installed_omnia_version_path }}"
+  register: file_status
+
+- name: Report omnia_version file absence
+  ansible.builtin.fail:
+    msg: "{{ omnia_version_file_absent }}"
+  when: not file_status.stat.exists
+
+- name: Get the omnia version
+  ansible.builtin.include_vars:
+    file: "{{ installed_omnia_version_path }}"
+
+- name: Set installed omnia version fact
+  ansible.builtin.set_fact:
+    installed_omnia_version: "{{ omnia_version | string }}"
+
+- name: Print the omnia version fetched
+  ansible.builtin.debug:
+    msg: Version at {{ installed_omnia_path }} is {{ installed_omnia_version }}
+
+- name: Get the omnia version to be upgraded
+  ansible.builtin.include_vars:
+    file: "{{ omnia_upgrade_version_path }}"
+
+- name: Set upgrade_omnia_version
+  ansible.builtin.set_fact:
+    upgrade_omnia_version: "{{ omnia_version | string }}"
+
+- name: Fail if installed version and upgrade version are the same
+  ansible.builtin.fail:
+    msg: "{{ already_upgraded }}"
+  when: installed_omnia_version == upgrade_omnia_version
+
+- name: Assert installed version is >= v1.6.1
+  ansible.builtin.assert:
+    that:
+      - installed_omnia_version is version('1.6.1', '>=')
+    fail_msg: "{{ unsupported_omnia_version }}"
+
+- name: Check discovery metadata file status
+  ansible.builtin.stat:
+    path: "{{ meta_path }}"
+  register: metadata_status
+
+- name: Fail if discovery_and_provision.yml was not executed
+  ansible.builtin.fail:
+    msg: "{{ discovery_execution_req }}"
+  when: not metadata_status.stat.exists
+
+- name: Include the metadata file
+  when: metadata_status.stat.exists
+  block:
+    - name: Include metadata.yml file
+      ansible.builtin.include_vars: "{{ meta_path }}"
+      no_log: true
+
+    - name: Fail if metadata installed version matches upgrade version
+      ansible.builtin.fail:
+        msg: "{{ already_upgraded }}"
+      when: installed_version is defined and installed_version in upgrade_omnia_version
diff --git a/upgrade/roles/upgrade_precheck/vars/main.yml b/upgrade/roles/upgrade_precheck/vars/main.yml
new file mode 100644
index 000000000..5873f359e
--- /dev/null
+++ b/upgrade/roles/upgrade_precheck/vars/main.yml
@@ -0,0 +1,75 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: upgrade_version_check
+unsupported_omnia_version: "The currently installed version is below v1.6.1; sequential upgrade is supported only from Omnia v1.6.1.
+  Please use the upgrade functionality from the Omnia 1.6.1 source code to upgrade to v1.6.1 first."
+omnia_version_file_absent: "Omnia version file is not present in the provided installed Omnia path.
+  Please provide the correct installed Omnia source code path and re-run upgrade_oim.yml."
+discovery_execution_req: "discovery_and_provision.yml has not been executed on the Omnia Infrastructure Manager. Please follow the documentation for a fresh installation."
+already_upgraded: "The currently installed Omnia version is the same as the version to be upgraded."
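+
+# The omnia_version files pointed to by the paths below are expected to carry
+# a single version key, e.g. (illustrative):
+#   omnia_version: 1.7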
+
+meta_path: "/opt/omnia/.data/metadata.yml"
+installed_omnia_version_path: "{{ installed_omnia_path }}/.metadata/omnia_version"
+omnia_upgrade_version_path: "../../../../.metadata/omnia_version"
+
+
+# Usage: upgrade_os_check
+unsupported_os: "Current OS {{ ansible_distribution }} {{ ansible_distribution_version }} installed on the Omnia Infrastructure Manager
+  is not supported for the upgrade feature in Omnia v1.7. Upgrade is supported on RHEL/Rocky 8.6/8.7/8.8, Ubuntu 20.04/22.04."
+msg_supported_older_os: "After upgrade, new features are not supported on RHEL/Rocky 8.6/8.7"
+activate_supported_venv: "Failed. Upgrade is supported using omnia17_venv only on RHEL/Rocky 8.8 and Ubuntu 22.04/20.04."
+
+# Usage: precheck_local_repo_access.yml
+local_repo_access_dest_path: "/opt/omnia/offline/local_repo_access.yml"
+local_repo_access_fail_msg: "Failed. {{ local_repo_access_dest_path }} does not exist."
+repo_config_metadata_file: "/opt/omnia/offline/.data/metadata.yml"
+repo_config_metadata_file_msg: "Failed. The local repo playbook has not been executed. Please follow the documentation for a fresh installation."
+incorrect_repo_config_value_msg: "Failed. During Omnia installation, the local repo playbook was not executed with a valid repo_config value."
+
+# Usage: upgrade_precheck_k8s.yml
+etcd_not_running: "Upgrade precheck failed: the etcd service is not running. Please fix the etcd service before initiating the upgrade."
+grafana_ns: "grafana"
+telemetry_visualizations_ns: "telemetry-and-visualizations"
+mysqldb_name: "idrac_telemetrysource_services_db"
+database_name: "telemetry_metrics"
+timescaledb_k8s_name: timescaledb
+backup_warning_msg: "The database currently has no data. The database will be populated with fresh data after the upgrade."
+warning_wait_time: 30
+
+# Usage: include_telemetry_config.yml
+telemetry_config_file: "{{ installed_omnia_path }}/input/telemetry_config.yml"
+telemetry_vault_filename: "{{ installed_omnia_path }}/input/.telemetry_vault_key"
+telemetry_config_syntax_fail_msg: "Failed. Syntax errors present in telemetry_config.yml. Fix the errors and re-run the playbook."
+vault_file_perm: '0644'
+
+grafana_pod_selector: "grafana-"
+loki_pod_selector: "loki-"
+timescaledb_pod_selector: "timescaledb-"
+idrac_telemetry_pod_selector: "idrac-telemetry-"
+mysqldb_pod_selector: "mysqldb-"
+
+error_non_running_pods: "Upgrade precheck failed. There are non-running pods present in the cluster.
+  All pods should be in the Running state before initiating the upgrade task.
+  Please fix these pods before initiating the upgrade task. List of non-running pods: "
+error_unbound_pvc: "Upgrade precheck failed: There should not be any unbound PVCs before initiating the upgrade task.
+  Unbound PVC found: "
+error_improper_service: "Upgrade precheck failed: Some LoadBalancer services do not have external IPs assigned.
+  Please fix the mentioned services before initiating the upgrade task: "
+error_k8s_sanity: "Upgrade precheck failed in the Kubernetes sanity check. The 'kubectl get node' command failed. Review the Kubernetes cluster
+  and fix it before initiating the upgrade task."
+
+max_attempts: 3
+wait_time: 5
diff --git a/upgrade/roles/upgrade_prepare_oim/tasks/main.yml b/upgrade/roles/upgrade_prepare_oim/tasks/main.yml
new file mode 100644
index 000000000..500578866
--- /dev/null
+++ b/upgrade/roles/upgrade_prepare_oim/tasks/main.yml
@@ -0,0 +1,83 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Fetch Omnia Infrastructure Manager OS
+  ansible.builtin.set_fact:
+    oim_os: "{{ ansible_distribution | lower }}"
+
+- name: Prepare OIM for proxy defaults
+  ansible.builtin.include_role:
+    name: configure_proxy
+
+- name: Create telemetry binary with upgraded version
+  ansible.builtin.include_role:
+    name: omnia_telemetry
+    tasks_from: create_binary.yml
+
+- name: Update omnia_appliance required pip packages on older OS
+  ansible.builtin.pip:
+    requirements: "{{ installed_omnia_path }}/prepare_cp/roles/omnia_appliance_cp/files/requirements_pip.txt"
+  when: older_os
+
+- name: Update omnia_appliance required pip packages on supported OS
+  ansible.builtin.include_role:
+    name: omnia_appliance_oim
+  when: not older_os
+
+- name: Prepare venv with required pip packages
+  ansible.builtin.command: "{{ ansible_python_interpreter }} -m pip install {{ item }}"
+  with_items: "{{ pip_packages }}"
+  changed_when: false
+
+- name: Regenerate inventory files
+  ansible.builtin.include_role:
+    name: inventory_tagging
+
+- name: Wait for compute_hostname file to generate
+  ansible.builtin.wait_for:
+    path: "{{ hostname_inv_path }}"
+    state: present
+    timeout: "{{ timeout_val }}"
+
+- name: Refresh inventory
+  ansible.builtin.meta: refresh_inventory
+
+- name: Remove compute_servicetag_ip inventory file
+  ansible.builtin.file:
+    path: "{{ compute_servicetag_ip_path }}"
+    state: absent
+
+- name: Check cryptography version
+  ansible.builtin.command: "pip show cryptography"
+  register: cryptography_info
+  changed_when: false
+  ignore_errors: true
+  when: older_os
+
+- name: Set cryptography version as fact
+  ansible.builtin.set_fact:
+    cryptography_version: "{{ (cryptography_info.stdout | regex_search('Version: (\\d+\\.\\d+\\.\\d+)', '\\1') | first) | string }}"
+  when:
+    - older_os
+    - cryptography_info.stdout is defined
+    - cryptography_info.stdout | regex_search('Version: (\\d+\\.\\d+\\.\\d+)')
+
+- name: Install cryptography version 44.0.0 if needed
+  ansible.builtin.pip:
+    name: cryptography==44.0.0
+  when:
+    - older_os
+    - cryptography_version is not defined or (cryptography_version is version('44.0.0', '<'))
diff --git a/upgrade/roles/upgrade_prepare_oim/vars/main.yml b/upgrade/roles/upgrade_prepare_oim/vars/main.yml
new file mode 100644
index 000000000..662803556
--- /dev/null
+++ b/upgrade/roles/upgrade_prepare_oim/vars/main.yml
@@ -0,0 +1,34 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +# Usage: main.yml +compute_servicetag_ip_path: "/opt/omnia/omnia_inventory/compute_servicetag_ip" +hostname_inv_path: "/opt/omnia/omnia_inventory/compute_hostname_ip" +timeout_val: 10 + +pip_packages: + - pyopenssl==21.0.0 + - urllib3==1.26.5 + - psycopg2-binary + - requests + - pyarrow + - pandas + - passlib + - netaddr + - pexpect + - pyinstaller + - psutil + - commented-configparser + - iprange-python diff --git a/upgrade/roles/upgrade_xcat/tasks/main.yml b/upgrade/roles/upgrade_xcat/tasks/main.yml deleted file mode 100644 index 3a6ae2ed0..000000000 --- a/upgrade/roles/upgrade_xcat/tasks/main.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Upgrade xcat - environment: - XCATROOT: "{{ xcat_root_env }}" - PATH: "{{ ansible_env.PATH }}:{{ xcat_path_env }}" - MANPATH: "{{ xcat_manpath_env }}" - PERL_BADLANG: "{{ perl_badlang_env }}" - block: - - name: Update xcat package - ansible.builtin.include_tasks: update_xcat_package.yml - - - name: Update xcat tables - ansible.builtin.include_tasks: update_xcat_tables.yml diff --git a/upgrade/roles/upgrade_xcat/tasks/update_xcat_package.yml b/upgrade/roles/upgrade_xcat/tasks/update_xcat_package.yml deleted file mode 100644 index e521da2e3..000000000 --- a/upgrade/roles/upgrade_xcat/tasks/update_xcat_package.yml +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Delete xcat files - ansible.builtin.file: - path: "{{ item }}" - state: absent - with_items: "{{ xcat_delete_files }}" - -- name: Create xcat installation directory - ansible.builtin.file: - path: "{{ xcat_directory }}" - state: directory - mode: "{{ file_permission }}" - -- name: Download xcat-core package - ansible.builtin.get_url: - url: "{{ xcat_core_url }}" - dest: "{{ xcat_core_dest }}" - mode: "{{ file_permission }}" - register: download_xcat_core - until: download_xcat_core is not failed - retries: "{{ max_retries }}" - -- name: Untar xcat-core package - ansible.builtin.unarchive: - src: "{{ xcat_core_dest }}" - dest: "{{ xcat_directory }}" - changed_when: true - register: untar_xcat_core - until: untar_xcat_core is not failed - retries: "{{ max_retries }}" - -- name: Create xcat-core localrepo - ansible.builtin.command: "{{ xcat_directory }}/xcat-core/mklocalrepo.sh" - changed_when: true - -- name: Download xcat-dep package (This task may take 10 mins) - ansible.builtin.get_url: - url: "{{ xcat_dep_url }}" - dest: "{{ xcat_dep_dest }}" - mode: "{{ file_permission }}" - register: download_xcat_dep - until: download_xcat_dep is not failed - retries: "{{ max_retries }}" - -- name: Untar xcat-dep package - ansible.builtin.unarchive: - src: "{{ xcat_dep_dest }}" - dest: "{{ xcat_directory }}" - changed_when: true - register: untar_xcat_dep - until: untar_xcat_dep is not failed - retries: "{{ max_retries }}" - -- name: Create xcat-dep localrepo - ansible.builtin.command: "{{ xcat_directory }}/xcat-dep/rh8/x86_64/mklocalrepo.sh" - changed_when: true - -- name: Install xCAT and postgres packages (This task may take 10 mins) - ansible.builtin.package: - name: "{{ xcat_packages }}" - state: latest # noqa: package-latest - retries: "{{ package_retry }}" - -- name: Save xcat enviornment variables - ansible.builtin.shell: source {{ xcat_env_path }} && echo "{{ xcat_env_msg }}" - changed_when: true diff --git a/upgrade/roles/upgrade_xcat/vars/main.yml b/upgrade/roles/upgrade_xcat/vars/main.yml deleted file mode 100644 index 393a6e90c..000000000 --- a/upgrade/roles/upgrade_xcat/vars/main.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -# Usage: main.yml -xcat_root_env: "/opt/xcat" -xcat_path_env: "/opt/xcat/bin:/opt/xcat/sbin:/opt/xcat/share/xcat/tools" -xcat_manpath_env: "/opt/xcat/share/man:$MANPATH" -perl_badlang_env: 0 - -# Usage: update_xcat_package.yml -xcat_delete_files: - - /root/xcat/xcat-dep - - /root/xcat/xcat-core - - /root/xcat/xcat-core-2.16.4-linux.tar.bz2 - - /root/xcat/xcat-dep-2.16.4-linux.tar.bz2 - - /etc/yum.repos.d/xcat-core.repo - - /etc/yum.repos.d/xcat-dep.repo -xcat_directory: /root/xcat -xcat_core_url: http://xcat.org/files/xcat/xcat-core/2.16.x_Linux/xcat-core/xcat-core-2.16.5-linux.tar.bz2 -xcat_core_dest: "{{ xcat_directory }}/xcat-core-2.16.5-linux.tar.bz2" -xcat_dep_url: http://xcat.org/files/xcat/xcat-dep/2.x_Linux/xcat-dep-2.16.5-linux.tar.bz2 -xcat_dep_dest: "{{ xcat_directory }}/xcat-dep-2.16.5-linux.tar.bz2" -xcat_packages: - - xCAT - - xCAT-buildkit -package_retry: 3 -file_permission: "0755" -xcat_env_path: "/etc/profile.d/xcat.sh" -xcat_env_msg: "Saving xcat enviornment variables" -max_retries: 10 - -# Usage: update_xcat_tables.yml -metadata_path: /opt/omnia/.data/metadata.yml diff --git a/upgrade/roles/user_confirmation/tasks/main.yml b/upgrade/roles/user_confirmation/tasks/main.yml new file mode 100644 index 000000000..c1861e403 --- /dev/null +++ b/upgrade/roles/user_confirmation/tasks/main.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Upgrade Confirmation + ansible.builtin.pause: + prompt: "{{ user_confirmation_msg }}" + register: upgrade_confirmation_prompt + when: + - not (skip_confirmation | default(false) | bool) + - upgrade_confirmation is not defined + +- name: Check confirmation + ansible.builtin.fail: + msg: "Upgrade confirmation failed" + when: + - not upgrade_confirmation | default(false) | bool + - not upgrade_confirmation_prompt.user_input | default("") == "yes" diff --git a/upgrade/roles/user_confirmation/vars/main.yml b/upgrade/roles/user_confirmation/vars/main.yml new file mode 100644 index 000000000..0e2f45ff6 --- /dev/null +++ b/upgrade/roles/user_confirmation/vars/main.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
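+# To run unattended, the confirmation pause in tasks/main.yml can be bypassed
+# by pre-seeding the confirmation variable, e.g. (sketch; variable names as
+# consumed by this role):
+#
+#   ansible-playbook upgrade_oim.yml -e upgrade_confirmation=true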
+---
+
+user_confirmation_msg: "{{ unsupported_os_user_confirmation_msg if ansible_distribution_version in ['8.6', '8.7'] else supported_os_user_confirmation_msg }}"

+supported_os_user_confirmation_msg: |
+  "* Telemetry service will be unavailable during upgrade execution.
+  * The below software versions will be upgraded on the Omnia Infrastructure Manager:
+      nerdctl - 1.5.0 to 1.7.4
+      kubernetes - 1.26.12 to 1.29.5
+  Are you sure you want to upgrade Omnia Infrastructure Manager? Type 'yes' to upgrade OIM."
+
+unsupported_os_user_confirmation_msg: |
+  "* Telemetry service will be unavailable during upgrade execution.
+  * Post upgrade, Omnia 1.7 new features are not supported, but fixes will be available with the Omnia 1.7 source on RHEL/Rocky 8.6 and 8.7 OS.
+  * Only Omnia 1.6.1 features are supported on RHEL/Rocky 8.6 and 8.7.
+  Are you sure you want to upgrade Omnia Infrastructure Manager? Type 'yes' to upgrade OIM."
diff --git a/upgrade/roles/user_messages/vars/main.yml b/upgrade/roles/user_messages/vars/main.yml
deleted file mode 100644
index 7d62f97b2..000000000
--- a/upgrade/roles/user_messages/vars/main.yml
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-user_msg_prepare_config: |
-
-   ***************************************************************************************************************************************************
-   STEP 1 OF UPGRADE: PREPARE_CONFIG COMPLETED SUCCESSFULLY.
-
-   Omnia 1.5.1 input configurations have been imported and populated into the Omnia 1.6.1 input files at {{ role_path }}/../../../input
-   Backup of Omnia 1.5.1 database has been taken at the 'backup_location' as mentioned in upgrade_config.yml
-   IMPORTANT: Request user to cross-verify the auto-generated configuration parameters, before proceeding further.
-
-   Once the configurations are verified, user can trigger prepare_upgrade.yml as below:
-
-   ansible-playbook prepare_upgrade.yml -i
-
-   Where is the absolute file path of Omnia 1.5.1 inventory
-
-   prepare_upgrade.yml cleans up the various software packages on Omnia 1.5.1 cluster and the control plane. This includes Kubernetes, Telemetry, etc.
-   It also creates the Omnia 1.6.1 local repo, upgrades xCAT on control plane, and restores the Omnia Database from the backup.
-   NOTE: Omnia NFS share of Omnia 1.5.1 is deleted while running prepare_upgrade.yml,hence we recommend users to take backup of the share before proceeding .
-
-   ****************************************************************************************************************************************************
-
-
-user_msg_prepare_upgrade: |
-
-   ***************************************************************************************************************************************************
-   STEP 2 OF UPGRADE: PREPARE_UPGRADE COMPLETED SUCCESSFULLY.
-
-   Omnia 1.5.1 control plane and cluster cleanup have completed successfully.
- On the control plane, Omnia 1.6.1 local_repo is configured, xCAT is upgraded, and Omnia Database is restored from the backup. - - User can now trigger upgrade.yml as below: - - ansible-playbook upgrade.yml -i - - Where is the Omnia 1.6.1 inventory file, auto-generated in Step 1, and is available in 'upgrade' folder - After executing upgrade.yml, the cluster will be fully upgraded to Omnia 1.6.1 - - **************************************************************************************************************************************************** - - -user_msg_prepare_config2: | - - *************************************************************************************************************************************************** - STEP 1 OF UPGRADE: PREPARE_CONFIG COMPLETED SUCCESSFULLY. - - Omnia 1.6 input configurations have been imported and populated into the Omnia 1.6.1 input files at {{ role_path }}/../../../input - - IMPORTANT: Request user to cross-verify the configuration parameters, before proceeding further. - - Once the configurations are verified, user can trigger prepare_upgrade.yml as below: - - ansible-playbook prepare_upgrade.yml -i - - Where is the absolute file path of Omnia 1.6 inventory - - **************************************************************************************************************************************************** - - -user_msg_prepare_upgrade2: | - - *************************************************************************************************************************************************** - STEP 2 OF UPGRADE: PREPARE_UPGRADE COMPLETED SUCCESSFULLY. - - User can now trigger upgrade.yml as below: - - ansible-playbook upgrade.yml -i - - Where is the absolute file path of Omnia 1.6 inventory - - **************************************************************************************************************************************************** - - -user_msg_upgrade: | - - *************************************************************************************************************************************************** - STEP 3 OF UPGRADE: UPGRADE.YML COMPLETED SUCCESSFULLY. - - The control plane and cluster has been upgraded to Omnia 1.6.1 - - **************************************************************************************************************************************************** diff --git a/upgrade/roles/validate_input_configs/tasks/main.yml b/upgrade/roles/validate_input_configs/tasks/main.yml deleted file mode 100644 index 8eb8d69af..000000000 --- a/upgrade/roles/validate_input_configs/tasks/main.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Validate input configurations for upgrade - ansible.builtin.include_role: - name: pre_requisite - -- name: Validate local repo configurations - ansible.builtin.include_tasks: validate_local_repo.yml - -- name: Validate input configurations for upgrade - ansible.builtin.include_role: - name: discovery_validations/common # noqa:role-name[path] - tasks_from: upgrade_validations.yml diff --git a/upgrade/roles/validate_input_configs/tasks/validate_local_repo.yml b/upgrade/roles/validate_input_configs/tasks/validate_local_repo.yml deleted file mode 100644 index 71969160e..000000000 --- a/upgrade/roles/validate_input_configs/tasks/validate_local_repo.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -- name: Validate control plane OS - ansible.builtin.include_role: - name: "{{ role_path }}/../../../local_repo/roles/validation" # noqa:role-name[path] - tasks_from: validate_cp_os.yml - -- name: Validate software_config.json - ansible.builtin.include_role: - name: "{{ role_path }}/../../../local_repo/roles/validation" # noqa:role-name[path] - tasks_from: validate_software_config_json.yml - -- name: Validate local_repo_config.yml - ansible.builtin.include_role: - name: "{{ role_path }}/../../../local_repo/roles/validation" # noqa:role-name[path] - tasks_from: validate_local_repo_config.yml diff --git a/upgrade/roles/validate_upgrade_config/tasks/main.yml b/upgrade/roles/validate_upgrade_config/tasks/main.yml index 720449e52..6b125f12c 100644 --- a/upgrade/roles/validate_upgrade_config/tasks/main.yml +++ b/upgrade/roles/validate_upgrade_config/tasks/main.yml @@ -17,22 +17,25 @@ file: upgrade_config.yml changed_when: false -- name: Validate old_input_location +- name: Get stats of the install path ansible.builtin.stat: - path: "{{ old_input_location }}" - register: stat_input_result + path: "{{ installed_omnia_path }}" + register: installed_path_stat -- name: Validate backup_location - ansible.builtin.stat: - path: "{{ backup_location }}" - register: stat_backup_result +- name: Assert installed path is not empty + ansible.builtin.assert: + that: + - installed_omnia_path + - installed_path_stat.stat.isdir is defined and installed_path_stat.stat.isdir + fail_msg: "{{ valid_installed_path_msg }}" -- name: Fail if old input directory does not exist - ansible.builtin.fail: - msg: "{{ validate_upgrade_config_input_fail_msg }}" - when: not stat_input_result.stat.exists +- name: Set backup_location if empty + ansible.builtin.set_fact: + backup_location: "{{ default_backup_location }}" + when: not backup_location -- name: Fail if old backup directory does not exist - ansible.builtin.fail: - msg: "{{ validate_upgrade_config_backup_fail_msg }}" - when: not stat_backup_result.stat.exists +- name: Ensure backup_location + ansible.builtin.file: + path: "{{ backup_location }}" + state: directory + mode: "{{ directory_mode }}" diff --git a/upgrade/roles/validate_upgrade_config/vars/main.yml 
b/upgrade/roles/validate_upgrade_config/vars/main.yml index ce05fd7e3..72a664fe6 100644 --- a/upgrade/roles/validate_upgrade_config/vars/main.yml +++ b/upgrade/roles/validate_upgrade_config/vars/main.yml @@ -13,5 +13,7 @@ # limitations under the License. --- -validate_upgrade_config_input_fail_msg: "Input directory invalid, check upgrade_config.yml" -validate_upgrade_config_backup_fail_msg: "Backup directory invalid, check upgrade_config.yml" +default_backup_location: "/opt/omnia/backup_before_upgrade" +valid_installed_path_msg: "Please provide a valid path for 'installed_omnia_path' in the upgrade_config.yml file + and re-run the upgrade_oim.yml playbook." +directory_mode: "0755" diff --git a/upgrade/upgrade.yml b/upgrade/upgrade.yml deleted file mode 100644 index fbbf984e2..000000000 --- a/upgrade/upgrade.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -# This playbook should be invoked with omnia 1.6 format inventory - -- name: Set upgrade status - hosts: all,localhost - connection: local - tasks: - - name: Validate omnia version - ansible.builtin.include_role: - name: "{{ playbook_dir }}/roles/validate_omnia_version" # noqa:role-name[path] - -- name: Invoke omnia.yml - ansible.builtin.import_playbook: "{{ playbook_dir}}/../omnia.yml" - when: upgrade_status | bool - -- name: Encrypt input files - hosts: localhost - connection: local - tasks: - - name: Conditionally encrypt inputs - when: upgrade_status - block: - - name: Invoke encrypt inputs - ansible.builtin.include_role: - name: encrypt_inputs - -- name: Display User Message - hosts: localhost - connection: local - tasks: - - name: Include user message - ansible.builtin.include_role: - name: "{{ playbook_dir }}/roles/user_messages" # noqa:role-name[path] - - - name: Print user message - ansible.builtin.debug: - msg: "{{ user_msg_upgrade.split('\n') }}" diff --git a/upgrade/upgrade_config.yml b/upgrade/upgrade_config.yml index aa52e2ad9..d59f7fec0 100644 --- a/upgrade/upgrade_config.yml +++ b/upgrade/upgrade_config.yml @@ -13,19 +13,15 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. -# SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** - # Mandatory -# This variable points to the input directory of the old Omnia 1.5 installation. +# Full path to the currently installed Omnia source code directory # Example: -# old_input_location: /root/omnia151/omnia/input -old_input_location: "" +# installed_omnia_path: /root/omnia_16/omnia +installed_omnia_path: "" -# Mandatory -# This variable points to the directory where OmniaDB backups should be stored. This directory MUST exist before the upgrade. +# Full path to the directory where temporary backups will be taken by the upgrade playbook. 
+# The upgrade playbook will create the directory if it doesn't exist.
+# This path can be any local path or a mounted remote path.
 # Example:
-# old_input_location: /root/omnia-backups
+# backup_location: /opt/omnia/backup_before_upgrade
 backup_location: ""
diff --git a/upgrade/upgrade_oim.yml b/upgrade/upgrade_oim.yml
new file mode 100644
index 000000000..2619c5528
--- /dev/null
+++ b/upgrade/upgrade_oim.yml
@@ -0,0 +1,49 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if virtual environment is active
+  ansible.builtin.import_playbook: ../utils/check_venv.yml
+  when: not ( check_venv_executed | default(false) | bool )
+
+- name: Steps to upgrade omnia version on Omnia Infrastructure Manager
+  hosts: localhost
+  vars:
+    older_os_dict:
+      redhat: ["8.6", "8.7"]
+      rocky: ["8.6", "8.7"]
+    os_check_dict:
+      redhat: ["8.8"]
+      rocky: ["8.8"]
+      ubuntu: ["20.04", "22.04"]
+  tasks:
+    - name: Validate input values from upgrade_config.yml
+      ansible.builtin.include_role:
+        name: validate_upgrade_config
+
+    - name: Pre-upgrade check before initiating upgrade
+      ansible.builtin.include_role:
+        name: upgrade_precheck
+
+    - name: User confirmation before initiating upgrade
+      ansible.builtin.include_role:
+        name: user_confirmation
+
+    - name: Initiating upgrade on Omnia Infrastructure Manager
+      ansible.builtin.include_role:
+        name: upgrade_oim
+
+    - name: Post upgrade - display user messages
+      ansible.builtin.include_role:
+        name: post_upgrade
diff --git a/utils/ansible.cfg b/utils/ansible.cfg
index a7a96b245..cbce5745a 100644
--- a/utils/ansible.cfg
+++ b/utils/ansible.cfg
@@ -3,6 +3,8 @@ log_path = /var/log/omnia/utils.log
 host_key_checking = false
 forks = 5
 timeout = 180
+executable = /bin/bash
+collections_path = $VIRTUAL_ENV

 [persistent_connection]
 command_timeout = 180
diff --git a/utils/check_package_lock.yml b/utils/check_package_lock.yml
new file mode 100644
index 000000000..251d1fe6a
--- /dev/null
+++ b/utils/check_package_lock.yml
@@ -0,0 +1,21 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +--- + +- name: Check package lock status + hosts: localhost + connection: local + roles: + - check_package_lock diff --git a/utils/control_plane_cleanup.yml b/utils/check_venv.yml similarity index 83% rename from utils/control_plane_cleanup.yml rename to utils/check_venv.yml index d0853478b..8a370159c 100644 --- a/utils/control_plane_cleanup.yml +++ b/utils/check_venv.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + --- -- name: Cleanup Control plane +- name: Check virtual ENV hosts: localhost connection: local roles: - - control_plane_cleanup + - check_venv diff --git a/utils/cluster/ansible.cfg b/utils/cluster/ansible.cfg index 8600866c4..2a3d9d35d 100644 --- a/utils/cluster/ansible.cfg +++ b/utils/cluster/ansible.cfg @@ -3,6 +3,7 @@ log_path = /var/log/omnia/utils_cluster.log host_key_checking = false forks = 5 timeout = 180 +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -10,4 +11,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/utils/cluster/change_personality b/utils/cluster/change_personality index 997f8a45e..6bc59bcb3 100644 --- a/utils/cluster/change_personality +++ b/utils/cluster/change_personality @@ -1,4 +1,4 @@ -# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/utils/cluster/copy_files_container.yml b/utils/cluster/copy_files_container.yml index df24d7b92..4413fbf44 100644 --- a/utils/cluster/copy_files_container.yml +++ b/utils/cluster/copy_files_container.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,9 +15,10 @@ - name: Fetching container files hosts: localhost tasks: - - name: Include vars file - include_vars: "pod_details.yml" + - name: Include vars file + ansible.builtin.include_vars: "pod_details.yml" - - name: Copying files from container env - command: "kubectl cp {{ pod_name }}:{{ container_file_location | regex_replace('^\\/', '') }} {{ local_folder_location }} -n {{ namespace }} -c {{ container_name }}" - changed_when: true \ No newline at end of file + - name: Copying files from container env + ansible.builtin.command: "kubectl cp {{ pod_name }}:{{ container_file_location | regex_replace('^\\/', '') }} + {{ local_folder_location }} -n {{ namespace }} -c {{ container_name }}" + changed_when: true diff --git a/utils/cluster/gather_facts_resolution.yml b/utils/cluster/gather_facts_resolution.yml index 5bb0965b6..1fc094a84 100644 --- a/utils/cluster/gather_facts_resolution.yml +++ b/utils/cluster/gather_facts_resolution.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,9 +16,9 @@ - name: Fetch gather_facts playbook hosts: localhost connection: local - gather_facts: no + gather_facts: false vars: - cp_path: "/root/.ansible/cp/" + oim_path: "/root/.ansible/oim/" tasks: - name: Include gather_facts playbook - include_tasks: "{{ playbook_dir }}/../roles/cluster_validation/tasks/gather_fact_resolution.yml" \ No newline at end of file + ansible.builtin.include_tasks: "{{ playbook_dir }}/../../scheduler/roles/cluster_validation/tasks/gather_fact_resolution.yml" diff --git a/utils/cluster/install_ipa_client.yml b/utils/cluster/install_ipa_client.yml index e014d0948..79fc3b483 100644 --- a/utils/cluster/install_ipa_client.yml +++ b/utils/cluster/install_ipa_client.yml @@ -17,21 +17,32 @@ # Use the below command for execution, by providing appropriate values for input fields. # ansible-playbook install_ipa_client.yml -i inventory -e kerberos_admin_password="" -e ipa_server_hostname="" -e domain_name="" -e ipa_server_ipadress="" +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + - name: Initialize variables hosts: localhost tasks: + - name: Fail if no inventory is provided + ansible.builtin.fail: + msg: "Inventory is not provided or is empty" + when: + - groups['all'] is not defined or (groups['all'] | length == 0) + - name: Initialize security variables ansible.builtin.set_fact: authentication_system: "freeipa" - name: Install IPA client - hosts: all + hosts: all:!auth_server:!slurm_control_node:!slurm_node:!kube_control_plane:!kube_node:!login:!etcd gather_facts: true vars: - nfs_node_group_fail_msg: "nfs_node group should be present in inventory and contain exactly 1 node" - nfs_node_group_success_msg: "nfs_node group check passed" + inventory_empty_msg: "Inventory is not provided or empty" + ipa_server_unreachable_msg: "The FreeIPA server {{ ipa_server_hostname }} is not reachable or the UI is not accessible." success_validation_msg: 'Validated all required input parameters' - fail_valid_msg: 'Missing input parameters. Provide all required input parameters. Command to execute: + fail_valid_msg: 'Missing input parameters. 
Provide all required input parameters correctly; ipa_server_hostname must contain domain_name.
+        Command to execute:
         ansible-playbook install_ipa_client.yml -i inventory -e kerberos_admin_password="" -e ipa_server_hostname="" -e domain_name="" -e ipa_server_ipadress=""'
   tasks:
     - name: Verify all paramaters are provided
@@ -44,28 +55,30 @@
           - kerberos_admin_password | length > 1
           - ipa_server_hostname | length > 1
           - domain_name | length > 1
+          - domain_name in ipa_server_hostname
           - ipa_server_ipadress | length > 1
         success_msg: "{{ success_validation_msg }}"
         fail_msg: "{{ fail_valid_msg }}"
+
     - name: Execute hostname_validation role
       ansible.builtin.include_role:
-        name: "{{ playbook_dir }}/../../security/roles/hostname_validation"
+        name: "{{ playbook_dir }}/../../security/roles/hostname_validation" # noqa role-name[path]
         tasks_from: validate_hostname

     - name: Add ports of manager and login node to firewall
       ansible.builtin.include_role:
-        name: "{{ playbook_dir }}/../../security/roles/login_common"
+        name: "{{ playbook_dir }}/../../security/roles/login_common" # noqa role-name[path]
        tasks_from: firewall_settings

     - name: Enable module idm in Rocky or Centos >= 8.0
       ansible.builtin.include_role:
-        name: "{{ playbook_dir }}/../../security/roles/login_common"
+        name: "{{ playbook_dir }}/../../security/roles/login_common" # noqa role-name[path]
         tasks_from: enable_dnf_module

     - name: Update Packages
       ansible.builtin.include_role:
-        name: "{{ playbook_dir }}/../../security/roles/login_common"
+        name: "{{ playbook_dir }}/../../security/roles/login_common" # noqa role-name[path]
         tasks_from: update_package

     - name: Include login_node vars
@@ -76,15 +89,13 @@

     - name: Execute firewall_settings from login_node role
       ansible.builtin.include_role:
-        name: "{{ playbook_dir }}/../../security/roles/login_node"
+        name: "{{ playbook_dir }}/../../security/roles/login_node" # noqa role-name[path]
         tasks_from: firewall_settings

     - name: Install DL1 - client(It may take 5-10 mins)
-      ansible.builtin.command: yum module install idm:DL1/client -y
+      ansible.builtin.command: yum module install idm:DL1/client -y # noqa command-instead-of-module
       changed_when: true
       failed_when: false
-      args:
-        warn: false
       when: ansible_distribution | lower == os_redhat

     - name: Install freeipa client package
@@ -96,8 +107,6 @@
       ansible.builtin.command: rm -rf {{ cert_path }}
       changed_when: false
       failed_when: false
-      args:
-        warn: false
       when: ansible_distribution | lower == os_redhat

     - name: Add host name in hosts file
@@ -141,7 +150,7 @@
   rescue:
     - name: Install ipa client failed
       ansible.builtin.fail:
-        msg: "Error: {{ install_ipa_client.stderr_lines }}"
+        msg: "This may be because the IPA server {{ ipa_server_hostname }} is not reachable. Error: {{ install_ipa_client.stderr_lines }}"

     - name: Install mkhomedir to enable home directory creation for users
       ansible.builtin.command: authselect enable-feature with-mkhomedir
diff --git a/utils/cluster/install_tools.yml b/utils/cluster/install_tools.yml
index 6ea03ecf6..20a0023fc 100644
--- a/utils/cluster/install_tools.yml
+++ b/utils/cluster/install_tools.yml
@@ -1,4 +1,4 @@
-# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- - -- hosts: manager +- name: Install change personality script + hosts: manager tasks: - - name: Install Change Personality Script - copy: - src: tools/change_personality - dest: /usr/sbin/ - owner: root - group: root - mode: '0700' + - name: Install Change Personality Script + ansible.builtin.copy: + src: tools/change_personality + dest: /usr/sbin/ + owner: root + group: root + mode: '0700' diff --git a/utils/cluster/intel_tools.yml b/utils/cluster/intel_tools.yml index 7c0d2e20d..63a48e0de 100644 --- a/utils/cluster/intel_tools.yml +++ b/utils/cluster/intel_tools.yml @@ -1,4 +1,4 @@ -# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,43 +15,45 @@ # intel_tools.yml: Install Intel-branded tools # Install the Intel Parallel Studio XE Runtime repository and packages -- hosts: cluster +- name: Import the Intel Parallel Studio XE Runtime repository and packages + hosts: cluster tasks: - - name: Import the Intel(R) Parallel Studio XE Runtime Repo GPG Key - rpm_key: - state: present - key: https://yum.repos.intel.com/2020/setup/RPM-GPG-KEY-intel-psxe-runtime-2020 - - name: Enable the Intel(R) Parallel Studio XE Runtime Repository - package: - name: https://yum.repos.intel.com/2020/setup/intel-psxe-runtime-2020-reposetup-1-0.noarch.rpm - state: present - - name: Install Intel(R) Parallel Studio XE Runtime 2020 - package: - name: intel-psxe-runtime - state: present + - name: Import the Intel(R) Parallel Studio XE Runtime Repo GPG Key + ansible.builtin.rpm_key: + state: present + key: https://yum.repos.intel.com/2020/setup/RPM-GPG-KEY-intel-psxe-runtime-2020 + - name: Enable the Intel(R) Parallel Studio XE Runtime Repository + ansible.builtin.package: + name: https://yum.repos.intel.com/2020/setup/intel-psxe-runtime-2020-reposetup-1-0.noarch.rpm + state: present + - name: Install Intel(R) Parallel Studio XE Runtime 2020 + ansible.builtin.package: + name: intel-psxe-runtime + state: present # Install the Intel Cluster Checker -- hosts: cluster +- name: Install the Intel Cluster Checker + hosts: cluster tasks: - - name: Import the Intel(R) Cluster Checker Repo GPG Key - rpm_key: - state: present - key: https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB - - name: Enable the Intel(R) Cluster Checker Repository - yum_repository: - name: intel-clck-2019-repo - description: Intel(R) Cluster Checker 2019 - baseurl: https://yum.repos.intel.com/clck/2019 - gpgcheck: yes - gpgkey: https://yum.repos.intel.com/clck/2019/setup/PUBLIC_KEY.PUB - - name: Enable the Intel(R) Cluster Checker Extensions Repository - yum_repository: - name: intel-clck-ext-2019-repo - description: Intel(R) Cluster Checker Select Solutions Extensions 2019 - baseurl: https://yum.repos.intel.com/clck-ext/2019 - gpgcheck: yes - gpgkey: https://yum.repos.intel.com/clck-ext/2019/setup/PUBLIC_KEY.PUB - - name: Install the Intel(R) Cluster Checker - package: - name: intel-clck-2019.8-* - state: present + - name: Import the Intel(R) Cluster Checker Repo GPG Key + ansible.builtin.rpm_key: + state: present + key: https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB + - name: Enable the Intel(R) Cluster Checker Repository + ansible.builtin.yum_repository: + name: intel-clck-2019-repo + description: Intel(R) Cluster Checker 2019 + baseurl: 
https://yum.repos.intel.com/clck/2019 + gpgcheck: true + gpgkey: https://yum.repos.intel.com/clck/2019/setup/PUBLIC_KEY.PUB + - name: Enable the Intel(R) Cluster Checker Extensions Repository + ansible.builtin.yum_repository: + name: intel-clck-ext-2019-repo + description: Intel(R) Cluster Checker Select Solutions Extensions 2019 + baseurl: https://yum.repos.intel.com/clck-ext/2019 + gpgcheck: true + gpgkey: https://yum.repos.intel.com/clck-ext/2019/setup/PUBLIC_KEY.PUB + - name: Install the Intel(R) Cluster Checker + ansible.builtin.package: + name: intel-clck-2019.8-* + state: present diff --git a/utils/cluster/olm.yml b/utils/cluster/olm.yml index 873fe6b7e..5818cd0e3 100644 --- a/utils/cluster/olm.yml +++ b/utils/cluster/olm.yml @@ -1,4 +1,4 @@ -# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/utils/cluster/pod_details.yml b/utils/cluster/pod_details.yml index 4ed11021e..fe452294b 100644 --- a/utils/cluster/pod_details.yml +++ b/utils/cluster/pod_details.yml @@ -1,4 +1,4 @@ -# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,4 +22,4 @@ namespace: pod_name: container_file_location: local_folder_location: -container_name: \ No newline at end of file +container_name: diff --git a/utils/cluster/scuttle b/utils/cluster/scuttle index 8731efe85..e450db156 100644 --- a/utils/cluster/scuttle +++ b/utils/cluster/scuttle @@ -1,4 +1,4 @@ -# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/utils/delete_node.yml b/utils/delete_node.yml index 2da3403ee..8dbff0868 100644 --- a/utils/delete_node.yml +++ b/utils/delete_node.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + - name: Delete node hosts: localhost connection: ssh diff --git a/utils/hpc_apptainer_job_execution/hpc_apptainer_job_execution_config.yml b/utils/hpc_apptainer_job_execution/hpc_apptainer_job_execution_config.yml index b7736c491..c280599b9 100644 --- a/utils/hpc_apptainer_job_execution/hpc_apptainer_job_execution_config.yml +++ b/utils/hpc_apptainer_job_execution/hpc_apptainer_job_execution_config.yml @@ -15,14 +15,14 @@ # Docker images to be downloaded in all target nodes using apptainer to create sif file # Example for single image -# hpc_apptainer_image: +# hpc_apptainer_image: # - { image_url: "docker.io/intel/oneapi-hpckit:latest" } # Example for multiple images -# hpc_apptainer_image: +# hpc_apptainer_image: # - { image_url: "docker.io/intel/oneapi-hpckit:latest" } # - { image_url: "docker.io/tensorflow/tensorflow:latest" } # If provided docker credentials in omnia_config.yml, it will be used for downloading docker images -hpc_apptainer_image: +hpc_apptainer_image: - { image_url: "" } # Path to directory for storing apptainer sif files in cluster nodes diff --git a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/configure_cluster.yml b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/configure_cluster.yml index e89479b2c..78c5ceae6 100644 --- a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/configure_cluster.yml +++ b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/configure_cluster.yml @@ -13,10 +13,10 @@ # limitations under the License. --- -- name: Set cp_hostname and cp_ip +- name: Set oim_hostname and oim_ip ansible.builtin.set_fact: - cp_hostname: "{{ hostvars['127.0.0.1'].cp_hostname }}" - cp_ip: "{{ hostvars['127.0.0.1'].cp_ip }}" + oim_hostname: "{{ hostvars['127.0.0.1'].oim_hostname }}" + oim_ip: "{{ hostvars['127.0.0.1'].oim_ip }}" - name: Create ca cert directory ansible.builtin.file: @@ -30,9 +30,9 @@ dest: "{{ ca_cert_dest }}" mode: preserve -- name: Add control plane information to node +- name: Add Omnia Infrastructure Manager information to node ansible.builtin.lineinfile: path: "{{ hosts_file_path }}" - line: "{{ cp_ip }} {{ cp_hostname }}" + line: "{{ oim_ip }} {{ oim_hostname }}" state: present mode: "{{ file_permission }}" diff --git a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/download_docker_images.yml b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/download_docker_images.yml index a82318d81..59e30b217 100644 --- a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/download_docker_images.yml +++ b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/download_docker_images.yml @@ -12,14 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 ---
-
-- name: Add docker ansible-galaxy collection
-  ansible.builtin.command: ansible-galaxy collection install {{ docker_galaxy_collection }}
-  changed_when: true
-  register: ansible_collection_install
-  until: ansible_collection_install is not failed
-  retries: "{{ max_retries }}"
-
 - name: Install python docker
   ansible.builtin.command: "{{ python_version }} -m pip install {{ docker_python_package }}"
   changed_when: true
@@ -77,7 +69,7 @@
 - name: Tag and push docker image to local registry
   community.docker.docker_image:
     name: "{{ item.image_url }}"
-    repository: "{{ cp_hostname }}:{{ docker_registry_port }}/{{ item.image_url }}"
+    repository: "{{ oim_hostname }}:{{ docker_registry_port }}/{{ item.image_url }}"
     push: true
     source: local
   with_items: "{{ hpc_apptainer_image }}"
@@ -90,7 +82,7 @@
 - name: Add docker image name to list
   ansible.builtin.lineinfile:
     path: "{{ hpc_image_list }}"
-    line: "{{ cp_hostname }}:{{ docker_registry_port }}/{{ item.image_url }}"
+    line: "{{ oim_hostname }}:{{ docker_registry_port }}/{{ item.image_url }}"
     state: present
     create: true
     mode: "{{ file_permission }}"
diff --git a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/include_omnia_config.yml b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/include_omnia_config.yml
index a90cd9752..87aee73ad 100644
--- a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/include_omnia_config.yml
+++ b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/include_omnia_config.yml
@@ -13,11 +13,6 @@
 # limitations under the License.
 ---
-- name: Fetch ansible-vault path
-  ansible.builtin.command: whereis ansible-vault
-  changed_when: false
-  register: ansible_vault_path
-
 - name: Check omnia_config.yml file is encrypted
   ansible.builtin.command: cat {{ omnia_config_filename }}
   changed_when: false
@@ -26,7 +21,7 @@

 - name: Decrpyt omnia_config.yml
   ansible.builtin.command: >-
-    {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ omnia_config_filename }}
+    ansible-vault decrypt {{ omnia_config_filename }}
     --vault-password-file {{ omnia_vault_path }}
   changed_when: false
   when: ansible_vault_search_key in omnia_config_content.stdout
@@ -59,7 +54,7 @@

 - name: Encrypt omnia_config.yml
   ansible.builtin.command: >-
-    {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ omnia_config_filename }}
+    ansible-vault encrypt {{ omnia_config_filename }}
     --vault-password-file {{ omnia_vault_path }}
   changed_when: false
diff --git a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/validate_input.yml b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/validate_input.yml
index 92f0a9e5d..4fa13f9b7 100644
--- a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/validate_input.yml
+++ b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/tasks/validate_input.yml
@@ -28,7 +28,7 @@

 - name: Validate hpc_apptainer_path
   ansible.builtin.assert:
-    that: 
+    that:
       - hpc_apptainer_path | default("", true) | length > 1
     quiet: true
     fail_msg: "{{ apptainer_path_fail_msg }}"
@@ -36,7 +36,7 @@
 - name: Include omnia_config.yml
   ansible.builtin.include_tasks: include_omnia_config.yml

-- name: Fetch control plane hostname
+- name: Fetch Omnia Infrastructure Manager hostname
   ansible.builtin.command: hostname
   changed_when: false
   register: fetch_hostname
@@ -54,12 +54,12 @@
 - name: Include variable file metadata.yml
   ansible.builtin.include_vars: "{{ metadata_path }}"

-- name: Fail when control_plane_ip not present in metadata file
ansible.builtin.fail: - msg: "{{ cp_ip_fail_msg }}" + msg: "{{ oim_ip_fail_msg }}" when: md_pxe_nic_ip is not defined -- name: Set cp_hostname and cp_ip +- name: Set oim_hostname and oim_ip ansible.builtin.set_fact: - cp_hostname: "{{ fetch_hostname.stdout }}" - cp_ip: "{{ md_pxe_nic_ip }}" + oim_hostname: "{{ fetch_hostname.stdout }}" + oim_ip: "{{ md_pxe_nic_ip }}" diff --git a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/vars/main.yml b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/vars/main.yml index f35982db7..e628d6827 100644 --- a/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/vars/main.yml +++ b/utils/hpc_apptainer_job_execution/roles/hpc_apptainer/vars/main.yml @@ -19,7 +19,7 @@ syntax_fail_msg: "Failed. Syntax is not correct in hpc_apptainer_job_execution_c Correct syntax errors and re-run hpc_apptainer_job_execution.yml" metadata_path: "/opt/omnia/.data/metadata.yml" metadata_missing_fail_msg: "Failed. Run provision.yml and provision the server before executing hpc_apptainer_job_execution.yml" -cp_ip_fail_msg: "Failed. Run provision.yml and verify no failures occurred. +oim_ip_fail_msg: "Failed. Run provision.yml and verify no failures occurred. Re-run hpc_apptainer_job_execution.yml after provision.yml execution and provisioning of server." apptainer_path_fail_msg: "Failed. hpc_apptainer_path can't be empty in hpc_apptainer_job_execution_config.yml" @@ -37,7 +37,7 @@ omnia_config_syntax_fail_msg: "Failed. Syntax errors present in omnia_config.yml # Usage: package_installation.yml docker_repo_url: https://download.docker.com/linux/centos/docker-ce.repo docker_repo_dest: /etc/yum.repos.d/docker-ce.repo -repo_permission: 0644 +repo_permission: "0644" max_retries: 20 max_delay: 5 docker_packages: @@ -46,16 +46,15 @@ docker_packages: - containerd.io-1.6.21 # Usage: configure_cluster.yml -docker_certs_folder: "/etc/docker/certs.d/{{ cp_hostname }}:5001" -directory_permissions: 0755 +docker_certs_folder: "/etc/docker/certs.d/{{ oim_hostname }}:5001" +directory_permissions: "0755" ca_cert_src: /etc/xcat/cert/ca.pem ca_cert_dest: "{{ docker_certs_folder }}/ca.crt" hosts_file_path: /etc/hosts -file_permission: 0644 +file_permission: "0644" # Usage: download_docker_images.yml -docker_galaxy_collection: community.docker:3.4.8 -python_version: python3.9 +python_version: "{{ ansible_python_interpreter }}" docker_python_package: docker==6.1.3 hpc_image_list: /opt/omnia/hpc_images.yml docker_registry_port: 5001 @@ -63,5 +62,5 @@ docker_registry_failure_msg: "Failed to initiate docker-registry service." docker_pull_fail_msg: "Failed. Unable to pull the image_url of hpc_apptainer_image variable provided in the input hpc_apptainer_job_execution_config.yml." # Usage: get_hpc_images.yml -apptainer_path_mode: 0777 +apptainer_path_mode: "0777" apptainer_sif_path: "{{ hpc_apptainer_path }}/{{ item.split('/')[-1] | replace(':', '_') }}.sif" diff --git a/utils/inventory_tagging.yml b/utils/inventory_tagging.yml index d0226c316..21ba14f76 100644 --- a/utils/inventory_tagging.yml +++ b/utils/inventory_tagging.yml @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) - name: Regenerate omnia_inventory Files hosts: localhost diff --git a/utils/ip_rule_assignment/ansible.cfg b/utils/ip_rule_assignment/ansible.cfg index 0d4104a2a..d83d0b640 100644 --- a/utils/ip_rule_assignment/ansible.cfg +++ b/utils/ip_rule_assignment/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/utils/ip_rule_assignment/ip_rule_assignment.yml b/utils/ip_rule_assignment/ip_rule_assignment.yml index a835e9459..322b09045 100644 --- a/utils/ip_rule_assignment/ip_rule_assignment.yml +++ b/utils/ip_rule_assignment/ip_rule_assignment.yml @@ -12,6 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + +- name: Inventory Check + hosts: localhost + tasks: + - name: Check inventory file exists + ansible.builtin.include_role: + name: update_network_manager + tasks_from: check_inventory - name: Update network manager hosts: all diff --git a/server_spec_update/roles/create_nicinfo_db/tasks/main.yml b/utils/ip_rule_assignment/roles/update_network_manager/tasks/check_inventory.yml similarity index 76% rename from server_spec_update/roles/create_nicinfo_db/tasks/main.yml rename to utils/ip_rule_assignment/roles/update_network_manager/tasks/check_inventory.yml index 1ea74c861..0afab335b 100644 --- a/server_spec_update/roles/create_nicinfo_db/tasks/main.yml +++ b/utils/ip_rule_assignment/roles/update_network_manager/tasks/check_inventory.yml @@ -13,8 +13,8 @@ # limitations under the License. 
 ---
-- name: Create table for nic info
-  when: add_network_status
-  block:
-    - name: Enter control_plane details in cluster.nodeinfo table
-      ansible.builtin.include_tasks: add_nic_db.yml
+- name: Fail if no inventory is provided
+  ansible.builtin.fail:
+    msg: "{{ inventory_not_provided }}"
+  when:
+    - groups['all'] is not defined or (groups['all'] | length == 0)
diff --git a/utils/ip_rule_assignment/roles/update_network_manager/tasks/inventory_validation.yml b/utils/ip_rule_assignment/roles/update_network_manager/tasks/inventory_validation.yml
index dbbdffabb..46f4edd60 100644
--- a/utils/ip_rule_assignment/roles/update_network_manager/tasks/inventory_validation.yml
+++ b/utils/ip_rule_assignment/roles/update_network_manager/tasks/inventory_validation.yml
@@ -45,3 +45,16 @@
     fail_msg: "{{ gateway_ip_fail_msg }} {{ item.value }}"
   when: item.key == 'gateway'
   with_dict: "{{ hostvars[inventory_hostname].nic_info }}"
+
+- name: Gather NIC list
+  ansible.builtin.shell: "/usr/sbin/ip -o link show | awk -F': ' '{print $2}'"
+  register: nic_list
+  changed_when: false # Since we're only gathering information
+
+- name: Validate nic names in inventory file
+  ansible.builtin.assert:
+    that:
+      - item.value in nic_list.stdout_lines
+    fail_msg: "{{ invalid_nic_name }} NIC '{{ item.value }}' not found"
+  when: item.key == 'nic_name'
+  with_dict: "{{ hostvars[inventory_hostname].nic_info }}"
diff --git a/utils/ip_rule_assignment/roles/update_network_manager/vars/main.yml b/utils/ip_rule_assignment/roles/update_network_manager/vars/main.yml
index 7b92e3bc6..47fda7261 100644
--- a/utils/ip_rule_assignment/roles/update_network_manager/vars/main.yml
+++ b/utils/ip_rule_assignment/roles/update_network_manager/vars/main.yml
@@ -21,3 +21,7 @@
 warning_wait_time: 30
 nic_configuration_file_path: "/etc/network/interfaces.d/{{ item.nic_name }}"
 nic_config_warning_msg: "nic configuration file not found for {{ nic_configuration_file_path }}
 Make sure that nic_name given in inventory is valid and server_spec_update.yml should be executed before ip_rule_assignment.yml"
+invalid_nic_name: "Failed, invalid nic name provided in inventory"
+
+# Usage: check_inventory.yml
+inventory_not_provided: "Failed. Inventory not provided. Re-run playbook with inventory by providing -i inventory."
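For reference, the NIC-name validation added in inventory_validation.yml iterates over a per-host `nic_info` dictionary supplied through the ip_rule_assignment inventory. A minimal sketch of the host variables it assumes follows; the host name and values are hypothetical examples, not part of this patch:

all:
  hosts:
    node001:                    # hypothetical host entry
      nic_info:
        nic_name: eth1          # must match a device printed by `ip -o link show`
        gateway: 198.51.100.1   # checked by the existing gateway validation above

With this shape, `with_dict: "{{ hostvars[inventory_hostname].nic_info }}"` yields one item per key, so the assert runs only for the `nic_name` key and compares its value against the gathered `nic_list.stdout_lines`.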
diff --git a/utils/k8s-upgrade/ansible.cfg b/utils/k8s-upgrade/ansible.cfg
index b3f049dd8..5a9004d7b 100644
--- a/utils/k8s-upgrade/ansible.cfg
+++ b/utils/k8s-upgrade/ansible.cfg
@@ -1,4 +1,5 @@
 [defaults]
 roles_path = ./roles
 inventory = ./inventory.ini
-remote_user = root
\ No newline at end of file
+remote_user = root
+collections_path = $VIRTUAL_ENV
diff --git a/utils/k8s-upgrade/inventory.yml b/utils/k8s-upgrade/inventory.yml
index 53b7f94e6..6a5559d64 100644
--- a/utils/k8s-upgrade/inventory.yml
+++ b/utils/k8s-upgrade/inventory.yml
@@ -3,6 +3,6 @@ all:
   manager:
     hosts:
       compute104.omnia.local:
-  compute: 
+  compute:
     hosts:
-      compute105.omnia.local:
\ No newline at end of file
+      compute105.omnia.local:
diff --git a/utils/k8s-upgrade/roles/upgrade-compute/tasks/compute-upgrade.yml b/utils/k8s-upgrade/roles/upgrade-compute/tasks/compute-upgrade.yml
index 636449c37..b67ad3be5 100644
--- a/utils/k8s-upgrade/roles/upgrade-compute/tasks/compute-upgrade.yml
+++ b/utils/k8s-upgrade/roles/upgrade-compute/tasks/compute-upgrade.yml
@@ -8,7 +8,7 @@

 - name: "Print kubernetes packages version"
   ansible.builtin.debug:
-    msg: 
+    msg:
     - "{{ ansible_facts.packages.kubelet[0].version }}"
     - "{{ ansible_facts.packages.kubeadm[0].version }}"
     - "{{ ansible_facts.packages.kubectl[0].version }}"
@@ -16,7 +16,7 @@
 - name: "Remove version lock from kubernetes packages"
   community.general.yum_versionlock:
     state: absent
-    name: 
+    name:
     - "{{ item }}"
   with_items: "{{ versionlock_k8s_packages }}"
@@ -41,13 +41,13 @@
       disable_excludes: kubernetes
       update_cache: true
     with_items: "{{ upgrade_k8s_packages }}"
-  
-  - name: "Reload Daemon and kubelet" 
+
+  - name: "Reload Daemon and kubelet"
     ansible.builtin.systemd:
       name: kubelet
       state: restarted
       daemon_reload: true
-  
+
   - name: "Uncordon compute Node"
     ansible.builtin.command: kubectl uncordon {{ item.1 }}
     delegate_to: "{{ item.0 }}"
@@ -57,7 +57,7 @@

 - name: "Print updated kubernetes packages version"
   ansible.builtin.debug:
-    msg: 
+    msg:
     - "{{ ansible_facts.packages.kubelet[0].version }}"
     - "{{ ansible_facts.packages.kubeadm[0].version }}"
     - "{{ ansible_facts.packages.kubectl[0].version }}"
@@ -65,10 +65,10 @@
 - name: "Add version lock on kubernetes packages from being updated"
   community.general.yum_versionlock:
     state: present
-    name: 
+    name:
     - "{{ item }}"
   with_items: "{{ versionlock_k8s_packages }}"
-  
+
   - name: "Clean metadata and update yum modules"
     ansible.builtin.shell: yum clean metadata && yum clean all && yum makecache
     changed_when: true
diff --git a/utils/k8s-upgrade/roles/upgrade-compute/tasks/main.yml b/utils/k8s-upgrade/roles/upgrade-compute/tasks/main.yml
index afa2b721e..eb3c60f44 100644
--- a/utils/k8s-upgrade/roles/upgrade-compute/tasks/main.yml
+++ b/utils/k8s-upgrade/roles/upgrade-compute/tasks/main.yml
@@ -4,4 +4,4 @@
   ansible.builtin.include_tasks: compute-upgrade.yml
   loop: "{{ k8s_versions }}"
   when:
-    - "{{ item <= k8s_upgrade_version }}"
\ No newline at end of file
+    - item is version(k8s_upgrade_version, '<=')
diff --git a/utils/k8s-upgrade/roles/upgrade-compute/vars/main.yml b/utils/k8s-upgrade/roles/upgrade-compute/vars/main.yml
index 0811be38d..d5ea02e38 100644
--- a/utils/k8s-upgrade/roles/upgrade-compute/vars/main.yml
+++ b/utils/k8s-upgrade/roles/upgrade-compute/vars/main.yml
@@ -13,4 +13,4 @@ upgrade_k8s_packages:
 versionlock_k8s_packages:
   - "kubelet*"
   - "kubectl*"
-  - "kubeadm*"
+  - "kubeadm*"
diff --git a/utils/k8s-upgrade/roles/upgrade-manager/tasks/main.yml b/utils/k8s-upgrade/roles/upgrade-manager/tasks/main.yml
index 913fb7e9c..5d467805d 100644
--- a/utils/k8s-upgrade/roles/upgrade-manager/tasks/main.yml
+++ b/utils/k8s-upgrade/roles/upgrade-manager/tasks/main.yml
@@ -1,7 +1,6 @@
 ---
-# Upgrade k8s master cluster
 - name: "Upgrading k8s cluster version to {{ k8s_upgrade_version }}"
   ansible.builtin.include_tasks: manager-upgrade.yml
   loop: "{{ k8s_versions }}"
   when:
-    - "{{ item <= k8s_upgrade_version }}"
+    - item is version(k8s_upgrade_version, '<=')
diff --git a/utils/k8s-upgrade/roles/upgrade-manager/tasks/manager-upgrade.yml b/utils/k8s-upgrade/roles/upgrade-manager/tasks/manager-upgrade.yml
index e39911676..f2ac4aa86 100644
--- a/utils/k8s-upgrade/roles/upgrade-manager/tasks/manager-upgrade.yml
+++ b/utils/k8s-upgrade/roles/upgrade-manager/tasks/manager-upgrade.yml
@@ -1,6 +1,6 @@
 ---
 - name: "Upgrade Manager"
-  when: inventory_hostname in groups["manager"] 
+  when: inventory_hostname in groups["manager"]
   block:
     - name: "Get the installed packages facts"
       ansible.builtin.package_facts:
@@ -8,30 +8,30 @@

     - name: "Print kubernetes packages version"
       ansible.builtin.debug:
-        msg: 
+        msg:
         - "{{ ansible_facts.packages.kubelet[0].version }}"
         - "{{ ansible_facts.packages.kubeadm[0].version }}"
         - "{{ ansible_facts.packages.kubectl[0].version }}"
-  
+
     - name: "Remove version lock from kubernetes packages"
       community.general.yum_versionlock:
         state: absent
-        name: 
+        name:
         - "{{ item }}"
       with_items: "{{ versionlock_k8s_packages }}"

-    - name: "Drain cluster master node -> {{ inventory_hostname }}"
+    - name: "Drain cluster head node -> {{ inventory_hostname }}"
       ansible.builtin.shell: |
         kubectl drain {{ inventory_hostname }} --ignore-daemonsets --delete-local-data
       register: drain_node_register
       changed_when: drain_node_register.rc != 0
-  
+
     - name: "Verify the upgrade plan"
       ansible.builtin.shell: |
         kubeadm upgrade plan --allow-experimental-upgrades --ignore-preflight-errors=all
       register: upgrade_plan_register
       changed_when: upgrade_plan_register.rc != 0
-  
+
     - name: "Upgrade Manager Node Packages with version {{ item }}"
       ansible.builtin.yum:
         name: "{{ item }}"
@@ -44,14 +44,14 @@
         kubeadm config images pull
       register: pull_images_register
       changed_when: pull_images_register != 0
-  
+
     - name: "Upgrading k8s cluster to {{ item }}"
       ansible.builtin.shell: |
         kubeadm upgrade apply v"{{ item }}" --allow-experimental-upgrades --ignore-preflight-errors=all --force --yes
       register: upgrade_register
       changed_when: upgrade_register != 0
       failed_when: "'FAILED' in upgrade_register.stderr"
-  
+
     - name: "Reload Daemon and kubelet"
       ansible.builtin.systemd:
         state: restarted
@@ -64,13 +64,13 @@
       register: uncordon_register
       changed_when: uncordon_register != 0

-    - name: Wait for all control-plane pods become created
+    - name: Wait for all oim-node pods to be created
       ansible.builtin.shell: "kubectl get po --namespace=kube-system --selector tier=control-plane --output=jsonpath='{.items[*].metadata.name}'"
-      register: control_plane_pods_created
-      until: item in control_plane_pods_created.stdout
+      register: oim_pods_created
+      until: item in oim_pods_created.stdout
       retries: "{{ retries_count }}"
       delay: "{{ delay_time }}"
-      changed_when: control_plane_pods_created != 0
+      changed_when: oim_pods_created != 0
       with_items:
         - etcd
         - kube-apiserver
@@ -79,7 +79,7 @@

     - name: "Print updated kubernetes packages version"
       ansible.builtin.debug:
-        msg: 
+        msg:
         - "{{ ansible_facts.packages.kubelet[0].version }}"
         - "{{ ansible_facts.packages.kubeadm[0].version }}"
         - "{{ ansible_facts.packages.kubectl[0].version }}"
@@ -87,14 +87,14 @@
     - name: "Add version lock on kubernetes packages from being updated"
       community.general.yum_versionlock:
         state: present
-        name: 
+        name:
         - "{{ item }}"
       with_items: "{{ versionlock_k8s_packages }}"
-  
+
     - name: "Clean metadata and update yum modules"
-      ansible.builtin.shell: yum clean metadata && yum clean all && yum makecache 
+      ansible.builtin.shell: yum clean metadata && yum clean all && yum makecache
       changed_when: true
-  
+
     - name: "Success Message"
       ansible.builtin.debug:
-        msg: "SUCCESS! Your cluster are upgraded to {{ item }}. Enjoy!"
\ No newline at end of file
+        msg: "SUCCESS! Your cluster is upgraded to {{ item }}. Enjoy!"
diff --git a/utils/k8s-upgrade/roles/upgrade-manager/vars/main.yml b/utils/k8s-upgrade/roles/upgrade-manager/vars/main.yml
index 7f4e12dce..7f6758e2c 100644
--- a/utils/k8s-upgrade/roles/upgrade-manager/vars/main.yml
+++ b/utils/k8s-upgrade/roles/upgrade-manager/vars/main.yml
@@ -16,4 +16,4 @@ delay_time: 20
 versionlock_k8s_packages:
   - "kubelet*"
   - "kubectl*"
-  - "kubeadm*"
+  - "kubeadm*"
diff --git a/utils/kernel_param_update/kernel_param_update_config.yml b/utils/kernel_param_update/kernel_param_update_config.yml
deleted file mode 100644
index 75d9766af..000000000
--- a/utils/kernel_param_update/kernel_param_update_config.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-
-#***********************************************************************
-# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE.
-# SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE.
-#***********************************************************************
-
-# This variable contains the list of kernel command line
-# Use space(' ') as a delimeter in case of multiple parameters
-# Example:
-# grub_commandline_kernel: “iommu=pt intel_iommu=off pci=realloc=off processor.max_cstate=0 intel_idle.max_cstate=0 intel_pstate=disable”
-
-grub_commandline_kernel: ""
diff --git a/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_rocky.yml b/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_rocky.yml
deleted file mode 120000
index fbb60d4fd..000000000
--- a/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_rocky.yml
+++ /dev/null
@@ -1 +0,0 @@
-kcmdline_update_redhat.yml
\ No newline at end of file
diff --git a/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_ubuntu.yml b/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_ubuntu.yml
deleted file mode 100644
index af3bc79ce..000000000
--- a/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_ubuntu.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
----
-- name: Include variables from kernel_param_update_config.yml
-  ansible.builtin.include_vars:
-    file: "{{ kcmdline_config_file }}"
-    name: kcmdline_vars
-
-- name: Reading existing kernel parameters
-  ansible.builtin.shell: >
-    set -o pipefail && \
-    cat "{{ grub_path }}" | grep '^GRUB_CMDLINE_LINUX=' | cut -d'"' -f2
-  register: existing_grub_cmdline
-  changed_when: existing_grub_cmdline.rc != 0
-
-- name: Adding grub_commandline_kernel to existing parameters
-  ansible.builtin.lineinfile:
-    path: "{{ grub_path }}"
-    regexp: '^GRUB_CMDLINE_LINUX='
-    line: 'GRUB_CMDLINE_LINUX="{{ existing_grub_cmdline.stdout }} {{ kcmdline_vars.grub_commandline_kernel }} "'
-
-- name: Update Grub configuration
-  ansible.builtin.command: sudo update-grub
-  register: output
-  changed_when: output.rc != 0
diff --git a/utils/nodeinfo_db/nodeinfo_db.yml b/utils/nodeinfo_db/nodeinfo_db.yml
index 244931989..ad985fe36 100644
--- a/utils/nodeinfo_db/nodeinfo_db.yml
+++ b/utils/nodeinfo_db/nodeinfo_db.yml
@@ -13,6 +13,10 @@
 # limitations under the License.
 ---
+- name: Check if virtual environment is active
+  ansible.builtin.import_playbook: ../check_venv.yml
+  when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool )
+
 - name: Get data from nodeinfo_db
   hosts: localhost
   connection: local
diff --git a/utils/nodeinfo_db/nodeinfo_db_config.yml b/utils/nodeinfo_db/nodeinfo_db_config.yml
index 3d46a80db..758e3dd75 100644
--- a/utils/nodeinfo_db/nodeinfo_db_config.yml
+++ b/utils/nodeinfo_db/nodeinfo_db_config.yml
@@ -18,7 +18,8 @@
 # If nothing is specified, select_column_name will take all attribute
 # Default Example: select * from example.xyz "
 # If we have values in show_column_list attributes
-# Accepted show_column_list: "id,service_tag,node,hostname,admin_mac,admin_ip,bmc_ip,status,discovery_mechanism,bmc_mode,switch_ip,switch_name,switch_port,cpu,gpu,cpu_count,gpu_count"
+# Accepted show_column_list: "id,service_tag,node,hostname,admin_mac,admin_ip,bmc_ip,status,
+# discovery_mechanism,bmc_mode,switch_ip,switch_name,switch_port,cpu,gpu,cpu_count,gpu_count"
 # Below Example: select `show_column_list` from example.xyz "
 show_column_list: ""
@@ -37,4 +38,4 @@ filter_value:
 #### Mandatory, nodeinfo_db: filename
 # File where data collected from nodeinfo_data should be dumped
 # Default value: "/tmp/nodeinfo_data.csv"
-filename: "/root/nodeinfo_data.csv"
\ No newline at end of file
+filename: "/root/nodeinfo_data.csv"
diff --git a/utils/provision/configure_pxe_static.yml b/utils/obsolete/configure_pxe_static.yml
similarity index 100%
rename from utils/provision/configure_pxe_static.yml
rename to utils/obsolete/configure_pxe_static.yml
diff --git a/input/rhsm_config.yml b/utils/obsolete/input/rhsm_config.yml
similarity index 100%
rename from input/rhsm_config.yml
rename to utils/obsolete/input/rhsm_config.yml
diff --git a/utils/install_hpc_thirdparty_packages.yml b/utils/obsolete/install_hpc_thirdparty_packages.yml
similarity index 100%
rename from utils/install_hpc_thirdparty_packages.yml
rename to utils/obsolete/install_hpc_thirdparty_packages.yml
diff --git
a/examples/k8s-tensorflow-nvidia-ngc-resnet50-multinode-mpioperator.yaml b/utils/obsolete/k8s-tensorflow-nvidia-ngc-resnet50-multinode-mpioperator.yaml similarity index 100% rename from examples/k8s-tensorflow-nvidia-ngc-resnet50-multinode-mpioperator.yaml rename to utils/obsolete/k8s-tensorflow-nvidia-ngc-resnet50-multinode-mpioperator.yaml diff --git a/logging/logging.yml b/utils/obsolete/logging/logging.yml similarity index 100% rename from logging/logging.yml rename to utils/obsolete/logging/logging.yml diff --git a/logging/roles/loki/tasks/main.yml b/utils/obsolete/logging/roles/loki/tasks/main.yml similarity index 100% rename from logging/roles/loki/tasks/main.yml rename to utils/obsolete/logging/roles/loki/tasks/main.yml diff --git a/logging/tests/test_logging.yml b/utils/obsolete/logging/tests/test_logging.yml similarity index 100% rename from logging/tests/test_logging.yml rename to utils/obsolete/logging/tests/test_logging.yml diff --git a/utils/metalLB/README.md b/utils/obsolete/metalLB/README.md similarity index 100% rename from utils/metalLB/README.md rename to utils/obsolete/metalLB/README.md diff --git a/utils/metalLB/metal-config.yaml b/utils/obsolete/metalLB/metal-config.yaml similarity index 100% rename from utils/metalLB/metal-config.yaml rename to utils/obsolete/metalLB/metal-config.yaml diff --git a/utils/rhsm_subscription.yml b/utils/obsolete/rhsm_subscription.yml similarity index 100% rename from utils/rhsm_subscription.yml rename to utils/obsolete/rhsm_subscription.yml diff --git a/utils/rhsm_unregister.yml b/utils/obsolete/rhsm_unregister.yml similarity index 100% rename from utils/rhsm_unregister.yml rename to utils/obsolete/rhsm_unregister.yml diff --git a/utils/obsolete/roles/nfs_iscsi/tasks/check_prerequisites.yml b/utils/obsolete/roles/nfs_iscsi/tasks/check_prerequisites.yml new file mode 100644 index 000000000..9d4190a87 --- /dev/null +++ b/utils/obsolete/roles/nfs_iscsi/tasks/check_prerequisites.yml @@ -0,0 +1,139 @@ +# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +# Include base_vars.yml +- name: Include base_vars of control plane + ansible.builtin.include_vars: "{{ role_path }}/../../control_plane/input_params/base_vars.yml" + +# Check nfs_node_status +- name: Check NFS Node Status + when: powervault_support + block: + - name: Initialize variables + ansible.builtin.set_fact: + nfs_node_status: false + + - name: Set NFS node status + ansible.builtin.set_fact: + nfs_node_status: true + when: + - groups['nfs'] is defined + - groups['nfs'] | length | int > 0 + + - name: NFS group to contain exactly 1 node + ansible.builtin.assert: + that: "groups['nfs'] | length | int == 1" + fail_msg: "{{ nfs_node_group_fail_msg }}" + success_msg: "{{ nfs_node_group_success_msg }}" + when: nfs_node_status + + # Include omnia_config.yml + - name: Check if omnia_vault_key exists + ansible.builtin.stat: + path: "{{ role_path }}/../../{{ config_vaultname }}" + register: vault_key_result + + - name: Create ansible vault key if it does not exist + ansible.builtin.set_fact: + vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}" + when: not vault_key_result.stat.exists + + - name: Save vault key + ansible.builtin.copy: + dest: "{{ role_path }}/../../{{ config_vaultname }}" + content: | + {{ vault_key }} + force: true + mode: "{{ vault_file_perm }}" + when: not vault_key_result.stat.exists + + - name: Check if omnia config file is encrypted + ansible.builtin.command: cat {{ role_path }}/../../{{ config_filename }} + changed_when: false + register: config_content + no_log: True # noqa: yaml[truthy] + + - name: Decrpyt omnia_config.yml + ansible.builtin.command: >- + ansible-vault decrypt {{ role_path }}/../../{{ config_filename }} + --vault-password-file {{ role_path }}/../../{{ config_vaultname }} + when: "'$ANSIBLE_VAULT;' in config_content.stdout" + changed_when: false + + - name: Include variable file omnia_config.yml + ansible.builtin.include_vars: "{{ role_path }}/../../{{ config_filename }}" + no_log: true + + # Include login_vars.yml + + - name: Check login_vars file is encrypted + ansible.builtin.command: cat "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" + changed_when: false + register: config_content + no_log: true + + - name: Decrpyt login_vars.yml + ansible.builtin.command: >- + ansible-vault decrypt "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" + --vault-password-file "{{ role_path }}/../../control_plane/{{ vault_filename }}" + changed_when: false + when: "'$ANSIBLE_VAULT;' in config_content.stdout" + + - name: Include variable file login_vars.yml + ansible.builtin.include_vars: "{{ role_path }}/../../control_plane/{{ login_vars_filename }}" + no_log: true + +# Validate Powervault variables +- name: Fetch powervault inputs + ansible.builtin.include_tasks: "../../cluster_validation/tasks/fetch_powervault_status.yml" + when: + - powervault_support + - nfs_node_status + +# Encrpyt omnia_config.yml file +- name: Encrypt omnia_config.yml + when: powervault_support + block: + - name: Encrypt input config file + ansible.builtin.command: >- + ansible-vault encrypt {{ role_path }}/../../{{ config_filename }} + --vault-password-file {{ role_path }}/../../{{ config_vaultname }} + changed_when: false + + # Encrypt login_vars.yml file + - name: Create ansible vault key + ansible.builtin.set_fact: + vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}" + when: "'$ANSIBLE_VAULT;' not in config_content.stdout" + + - name: Save vault key + ansible.builtin.copy: + dest: "{{ role_path 
+        content: |
+          {{ vault_key }}
+        force: true
+        mode: "{{ vault_file_perm }}"
+      when: "'$ANSIBLE_VAULT;' not in config_content.stdout"
+
+    - name: Encrypt login_vars file
+      ansible.builtin.command: >-
+        ansible-vault encrypt "{{ role_path }}/../../control_plane/{{ login_vars_filename }}"
+        --vault-password-file "{{ role_path }}/../../control_plane/{{ vault_filename }}"
+      changed_when: false
+
+    - name: Update login_vars.yml permission
+      ansible.builtin.file:
+        path: "{{ role_path }}/../../control_plane/{{ login_vars_filename }}"
+        mode: "{{ vault_file_perm }}"
diff --git a/utils/obsolete/roles/nfs_iscsi/tasks/main.yml b/utils/obsolete/roles/nfs_iscsi/tasks/main.yml
new file mode 100644
index 000000000..1d45ba283
--- /dev/null
+++ b/utils/obsolete/roles/nfs_iscsi/tasks/main.yml
@@ -0,0 +1,43 @@
+# Copyright 2022 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Include base vars
+  ansible.builtin.include_vars: "{{ role_path }}/../../control_plane/input_params/base_vars.yml"
+
+- name: Configure and set up the NFS server
+  when:
+    - powervault_support
+    - hostvars['127.0.0.1']['nfs_node_status']
+  block:
+    - name: Include variable file for powervault
+      ansible.builtin.include_vars: "{{ pv_nfs_file }}"
+
+    - name: NFS server setup and configuration - Block
+      when: powervault_protocol == 'iscsi'
+      block:
+        - name: Validate the nfs configuration
+          ansible.builtin.include_tasks: validate_nfs_config.yml
+
+        - name: Configure the server
+          ansible.builtin.include_tasks: nfs_node_configure.yml
+
+        - name: Configure the port of nfs_server
+          ansible.builtin.include_tasks: nfs_volume.yml
+
+        - name: Mount the partitions
+          ansible.builtin.include_tasks: mount_me4_partitions.yml
+
+        - name: Setup NFS server on the partitions
+          ansible.builtin.include_tasks: me4_nfs_server_setup.yml
diff --git a/storage/roles/nfs_iscsi/tasks/map_volume.yml b/utils/obsolete/roles/nfs_iscsi/tasks/map_volume.yml
similarity index 74%
rename from storage/roles/nfs_iscsi/tasks/map_volume.yml
rename to utils/obsolete/roles/nfs_iscsi/tasks/map_volume.yml
index 1b386c45a..90bda6fd4 100644
--- a/storage/roles/nfs_iscsi/tasks/map_volume.yml
+++ b/utils/obsolete/roles/nfs_iscsi/tasks/map_volume.yml
@@ -14,29 +14,29 @@
 ---
 
 - name: Get auth string
-  shell: echo -n {{ powervault_username }}_{{ powervault_password }} | sha256sum
+  ansible.builtin.shell: echo -n {{ powervault_username }}_{{ powervault_password }} | sha256sum
   register: map_auth_string
   changed_when: false
-  ignore_errors: yes
+  ignore_errors: true
   no_log: true
   delegate_to: localhost
 
 - name: Get session key
-  uri:
+  ansible.builtin.uri:
     url: https://{{ powervault_ip }}/api/login/{{ map_auth_string.stdout | replace(" -", "") }}
     method: GET
     headers: {'datatype': 'json'}
-    validate_certs: no
+    validate_certs: false
   register: map_session_key
   delegate_to: localhost
 
 - name: Set fact for IP
-  set_fact:
+  ansible.builtin.set_fact:
    map_ip: "{{ pv_map_ip }}"
 
 - name: Get map port
-
set_fact: + ansible.builtin.set_fact: map_port: "{{ item.0 }}" when: map_ip == item.1 with_together: @@ -45,25 +45,25 @@ register: output - name: Set the LUN nos. - set_fact: - lun_no: "{{ lun_no | default ([]) }} + [ '{{ temp }}']" - temp: "{{ temp|int + t|int }}" + ansible.builtin.set_fact: + lun_no: "{{ lun_no | default([]) }} + [ '{{ temp }}']" + temp: "{{ temp | int + t | int }}" loop: "{{ powervault_volumes }}" - name: Add the lun numbers to volumes - set_fact: - pv_volumes: "{{ pv_volumes | default ([]) + [{ 'name': item.0.name, 'lun_no': item.1 , 'location': item.0.server_share_path }] }}" + ansible.builtin.set_fact: + pv_volumes: "{{ pv_volumes | default([]) + [{ 'name': item.0.name, 'lun_no': item.1 , 'location': item.0.server_share_path }] }}" with_together: - "{{ powervault_volumes }}" - "{{ lun_no }}" - name: Map volumes to initiators - uri: + ansible.builtin.uri: url: https://{{ powervault_ip }}/api/map/volume/{{ item.name }}/access/{{ access }}/ports/{{ map_port }}/lun/{{ item.lun_no }}/initiator/{{ server_iqdn }} method: GET body_format: json - validate_certs: no - use_proxy: no + validate_certs: false + use_proxy: false headers: {'sessionKey': "{{ map_session_key.json.status[0].response }}", 'datatype':'json'} with_items: "{{ pv_volumes }}" diff --git a/storage/roles/nfs_iscsi/tasks/me4_nfs_server_setup.yml b/utils/obsolete/roles/nfs_iscsi/tasks/me4_nfs_server_setup.yml similarity index 80% rename from storage/roles/nfs_iscsi/tasks/me4_nfs_server_setup.yml rename to utils/obsolete/roles/nfs_iscsi/tasks/me4_nfs_server_setup.yml index a69607723..c094e7d14 100644 --- a/storage/roles/nfs_iscsi/tasks/me4_nfs_server_setup.yml +++ b/utils/obsolete/roles/nfs_iscsi/tasks/me4_nfs_server_setup.yml @@ -14,32 +14,32 @@ --- - name: Install nfs-utils - package: + ansible.builtin.package: name: nfs-utils state: present - name: Install firewalld - package: + ansible.builtin.package: name: firewalld state: present - name: Start and enable firewalld - service: + ansible.builtin.service: name: firewalld state: started - enabled: yes + enabled: true - name: Start and enable rpcbind and nfs-server service - service: + ansible.builtin.service: name: "{{ item }}" state: restarted - enabled: yes + enabled: true with_items: - rpcbind - nfs-server - name: Adding NFS share entries in /etc/exports for manager - lineinfile: + ansible.builtin.lineinfile: path: "{{ exports_file_path }}" line: "{{ item.0.location }} {{ item.1 }}(rw,sync,no_root_squash)" with_nested: @@ -47,30 +47,30 @@ - "{{ groups['manager'] }}" - name: Adding NFS share entries in /etc/exports for compute - lineinfile: + ansible.builtin.lineinfile: path: "{{ exports_file_path }}" line: "{{ item.0.location }} {{ item.1 }}(rw,sync,no_root_squash)" with_nested: - - "{{ pv_volumes }}" - - "{{ groups['compute'] }}" + - "{{ pv_volumes }}" + - "{{ groups['compute'] }}" - name: Adding NFS share entries in /etc/exports for compute - lineinfile: + ansible.builtin.lineinfile: path: "{{ exports_file_path }}" line: "{{ item.0.location }} {{ item.1 }}(rw,sync,no_root_squash)" with_nested: - - "{{ pv_volumes }}" - - "{{ groups['login'] }}" + - "{{ pv_volumes }}" + - "{{ groups['login'] }}" when: - groups['login'] is defined - groups['login'] | length | int > 0 - name: Exporting the shared directories - command: exportfs -ra + ansible.builtin.command: exportfs -ra changed_when: true - name: Configuring firewall - firewalld: + ansible.posix.firewalld: service: "{{ item }}" permanent: true state: enabled @@ -78,5 +78,5 @@ - "{{ nfs_services }}" - name: 
Reload firewalld - command: firewall-cmd --reload + ansible.builtin.command: firewall-cmd --reload changed_when: true diff --git a/storage/roles/nfs_iscsi/tasks/mount_me4_partitions.yml b/utils/obsolete/roles/nfs_iscsi/tasks/mount_me4_partitions.yml similarity index 76% rename from storage/roles/nfs_iscsi/tasks/mount_me4_partitions.yml rename to utils/obsolete/roles/nfs_iscsi/tasks/mount_me4_partitions.yml index 9bba505e5..12ff744d8 100644 --- a/storage/roles/nfs_iscsi/tasks/mount_me4_partitions.yml +++ b/utils/obsolete/roles/nfs_iscsi/tasks/mount_me4_partitions.yml @@ -14,7 +14,7 @@ --- - name: Get ME4 volume - shell: > + ansible.builtin.shell: > set -o pipefail && \ lsscsi -s | grep ME4 changed_when: false @@ -22,12 +22,12 @@ failed_when: false - name: Create a temp file - file: + ansible.builtin.file: path: pv_mount.txt state: absent - name: Get ME4 volume - shell: > + ansible.builtin.shell: > set -o pipefail && \ lsscsi -s | grep ME4 changed_when: false @@ -35,50 +35,50 @@ failed_when: false - name: Set a temp variable - set_fact: + ansible.builtin.set_fact: temp_info: "{{ me4_output.stdout.split('\n') }}" - name: Create a temp file - shell: echo {{ item }} >> pv_mount.txt + ansible.builtin.shell: echo {{ item }} >> pv_mount.txt with_items: "{{ temp_info }}" changed_when: false - name: Extract the disks - command: awk -F' ' '{ print $6}' pv_mount.txt + ansible.builtin.command: awk -F' ' '{ print $6}' pv_mount.txt register: disk_op changed_when: false - name: Create a list of mounted disks - set_fact: + ansible.builtin.set_fact: mounted_pv_disks: "{{ mounted_pv_disks | default([]) }} + [ '{{ item }}']" when: item != '-' with_items: "{{ disk_op.stdout_lines }}" - name: Get all mounted partitions - command: df -h + ansible.builtin.command: df -h changed_when: false register: mounted_partitions - name: Create partition on ME4 volumes - command: "parted -a optimal {{ item }} --script -- mklabel gpt mkpart primary 0% {{ powervault_disk_partition_size }}" + ansible.builtin.command: "parted -a optimal {{ item }} --script -- mklabel gpt mkpart primary 0% {{ powervault_disk_partition_size }}" changed_when: true when: item not in mounted_partitions.stdout with_items: "{{ mounted_pv_disks }}" - name: Update kernel with new partition changes - command: partprobe + ansible.builtin.command: partprobe changed_when: false - name: Check ME4 mounted partitions - shell: > + ansible.builtin.shell: > set -o pipefail && \ mount | grep me4 failed_when: false changed_when: false register: me4_mounted_partitions -- name: Set file system on partition - shell: > +- name: Set file system on partition # noqa: no-changed-when + ansible.builtin.shell: > set -o pipefail && \ echo y | mkfs -t ext4 {{ item.0 }}1 when: item.1.location not in mounted_partitions.stdout @@ -88,25 +88,25 @@ failed_when: false - name: Creating NFS share directories - file: + ansible.builtin.file: path: "{{ item.location }}" state: directory mode: "{{ nfs_share_dir_mode }}" with_items: "{{ pv_volumes }}" - name: Mount K8s partition on K8s NFS share - command: "mount {{ item.0 }}1 {{ item.1.location }}" + ansible.builtin.command: "mount {{ item.0 }}1 {{ item.1.location }}" # noqa command-instead-of-module changed_when: true when: item.1.location not in me4_mounted_partitions.stdout with_together: - - "{{ mounted_pv_disks }}" - - "{{ pv_volumes }}" + - "{{ mounted_pv_disks }}" + - "{{ pv_volumes }}" failed_when: false - name: Configuring auto mount K8s partition on reboot - lineinfile: + ansible.builtin.lineinfile: path: "{{ fstab_file_path }}" 
line: "{{ item.0 }}1 {{ item.1.location }} ext4 defaults 0 0" with_together: - "{{ mounted_pv_disks }}" - - "{{ pv_volumes }}" \ No newline at end of file + - "{{ pv_volumes }}" diff --git a/storage/roles/nfs_iscsi/tasks/nfs_node_configure.yml b/utils/obsolete/roles/nfs_iscsi/tasks/nfs_node_configure.yml similarity index 70% rename from storage/roles/nfs_iscsi/tasks/nfs_node_configure.yml rename to utils/obsolete/roles/nfs_iscsi/tasks/nfs_node_configure.yml index 066736c13..c6a78ae97 100644 --- a/storage/roles/nfs_iscsi/tasks/nfs_node_configure.yml +++ b/utils/obsolete/roles/nfs_iscsi/tasks/nfs_node_configure.yml @@ -14,114 +14,114 @@ --- - name: Include - include_tasks: ports.yml + ansible.builtin.include_tasks: ports.yml - name: Refresh ssh keys - command: ssh-keygen -R {{ powervault_ip }} + ansible.builtin.command: ssh-keygen -R {{ powervault_ip }} changed_when: false tags: install failed_when: false - name: Validate authentication of username and password - command: ping -c1 {{ powervault_ip }} + ansible.builtin.command: ping -c1 {{ powervault_ip }} register: validate_login changed_when: false failed_when: false - name: NFS node configuration on leap + when: os_supported_leap in ansible_distribution | lower block: - name: Install open-iscsi - zypper: + community.general.zypper: name: open-iscsi state: present tags: install - name: Install sg3_utils - zypper: + community.general.zypper: name: sg3_utils state: present tags: install - name: Start the iSCSI deamon - systemd: + ansible.builtin.systemd: name: iscsid state: started - - - block: - - name: Configure nic - command: ip a add {{ pv_nic_ip }}/255.255.255.0 dev {{ pv_nic }} - register: nic_status - changed_when: false + - name: Configure nic + block: + - name: Configure nic + ansible.builtin.command: ip a add {{ pv_nic_ip }}/255.255.255.0 dev {{ pv_nic }} + register: nic_status + changed_when: false rescue: - - name: Check if nic configured or not - fail: - msg: "{{ nic_conf_failed_msg }}" - when: nic_status_search not in nic_status.stderr + - name: Check if nic configured or not + ansible.builtin.fail: + msg: "{{ nic_conf_failed_msg }}" + when: nic_status_search not in nic_status.stderr - name: Up the nic - command: ip link set dev {{ pv_nic }} up + ansible.builtin.command: ip link set dev {{ pv_nic }} up changed_when: false - when: os_supported_leap in ansible_distribution | lower - name: NFS node configuration on rocky + when: os_supported_leap not in ansible_distribution | lower block: - name: Install packages - package: + ansible.builtin.package: name: iscsi-initiator-utils state: present tags: install - name: Install packages - package: + ansible.builtin.package: name: sg3_utils state: present tags: install - name: Set bootproto value - lineinfile: + ansible.builtin.lineinfile: path: "{{ nic_path }}" regexp: '^BOOTPROTO=' line: 'BOOTPROTO=none' register: result - name: Set onboot value - lineinfile: + ansible.builtin.lineinfile: path: "{{ nic_path }}" regexp: '^ONBOOT=' line: 'ONBOOT=yes' - name: Add ip address - lineinfile: + ansible.builtin.lineinfile: path: "{{ nic_path }}" insertafter: '^ONBOOT=yes' line: 'IPADDR={{ pv_nic_ip }}' - name: Add netmask address - lineinfile: + ansible.builtin.lineinfile: path: "{{ nic_path }}" insertafter: '^IPADDR={{ pv_nic_ip }}' line: NETMASK=255.255.255.0 - name: Down the nic - command: ifdown {{ pv_nic }} + ansible.builtin.command: ifdown {{ pv_nic }} changed_when: true failed_when: false tags: install - name: Up the nic - command: ifup {{ pv_nic }} + ansible.builtin.command: ifup {{ 
pv_nic }} changed_when: true tags: install - name: Show ip - shell: > + ansible.builtin.shell: > set -o pipefail && \ ifconfig {{ pv_nic }} | grep 'inet' |cut -d: -f2 | awk '{ print $2}' changed_when: false - when: os_supported_leap not in ansible_distribution | lower - name: Discover nodes - command: iscsiadm -m discovery -t sendtargets -p {{ item }} + ansible.builtin.command: iscsiadm -m discovery -t sendtargets -p {{ item }} with_items: "{{ set_port_ip }}" register: ports_available failed_when: false @@ -129,47 +129,47 @@ tags: install - name: Pv port ip - add_host: + ansible.builtin.add_host: name: pv map_ip: "{{ item.item }}" with_items: "{{ ports_available.results }}" when: item.rc == 0 - name: Pv port set ip - set_fact: + ansible.builtin.set_fact: map_ip_output: "{{ item.stdout_lines }}" with_items: "{{ ports_available.results }}" when: item.rc == 0 - name: Find feasible port ip - set_fact: + ansible.builtin.set_fact: discover: "{{ item }}" with_items: "{{ map_ip_output }}" when: hostvars['pv']['map_ip'] in item - name: Split on comma - set_fact: + ansible.builtin.set_fact: ip_port: "{{ discover.split(',')[0] }}" - name: Pv name - set_fact: + ansible.builtin.set_fact: pv_name: "{{ discover.split(',')[1].split()[1] }}" - name: IQDN id - shell: > + ansible.builtin.shell: > set -o pipefail && \ grep "InitiatorName=" /etc/iscsi/initiatorname.iscsi | cut -f2 -d"=" register: iqdn_id changed_when: false - name: Add volume data to dummy host - set_fact: + ansible.builtin.set_fact: server_iqdn: "{{ iqdn_id.stdout }}" - name: Login to the powervault - command: iscsiadm -m node --login {{ pv_name }} -p {{ ip_port }} + ansible.builtin.command: iscsiadm -m node --login {{ pv_name }} -p {{ ip_port }} changed_when: true failed_when: false - name: Map volumes - include_tasks: map_volume.yml \ No newline at end of file + ansible.builtin.include_tasks: map_volume.yml diff --git a/storage/roles/nfs_iscsi/tasks/nfs_volume.yml b/utils/obsolete/roles/nfs_iscsi/tasks/nfs_volume.yml similarity index 81% rename from storage/roles/nfs_iscsi/tasks/nfs_volume.yml rename to utils/obsolete/roles/nfs_iscsi/tasks/nfs_volume.yml index 5e3432687..fb2ea0156 100644 --- a/storage/roles/nfs_iscsi/tasks/nfs_volume.yml +++ b/utils/obsolete/roles/nfs_iscsi/tasks/nfs_volume.yml @@ -14,25 +14,25 @@ --- - name: Refresh ssh keys - command: ssh-keygen -R {{ powervault_ip }} + ansible.builtin.command: ssh-keygen -R {{ powervault_ip }} changed_when: false tags: install failed_when: false - name: Validate authentication of username and password - command: ping -c1 {{ powervault_ip }} + ansible.builtin.command: ping -c1 {{ powervault_ip }} register: validate_login changed_when: false failed_when: false - name: Scan for getting the volume - command: rescan-scsi-bus.sh --forcerescan + ansible.builtin.command: rescan-scsi-bus.sh --forcerescan changed_when: false register: volume_pv - name: Assert if volume created or not - assert: + ansible.builtin.assert: that: - "' Model: ME4' in volume_pv.stdout in volume_pv.stdout" success_msg: "Volume is created" - fail_msg: "Volume is not created properly." \ No newline at end of file + fail_msg: "Volume is not created properly." 
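Reviewer note: the obsolete map_volume.yml, nfs_volume.yml, and ports.yml tasks all drive the PowerVault ME4 REST API with the same handshake: hash "<username>_<password>" with sha256, GET /api/login/<hash>, then send the returned session key back as a 'sessionKey' header on every later call. Below is a minimal Python sketch of that handshake, assuming the third-party requests library and a reachable array; the IP and credentials are placeholders, not values from this change.

# Sketch of the ME4 login flow used by the tasks above and below.
# Assumes the `requests` package; host and credentials are placeholders.
import hashlib

import requests

def pv_session_key(pv_ip, username, password):
    # Equivalent of: echo -n "<user>_<password>" | sha256sum (minus the " -")
    auth_string = hashlib.sha256(f"{username}_{password}".encode()).hexdigest()
    resp = requests.get(
        f"https://{pv_ip}/api/login/{auth_string}",
        headers={"datatype": "json"},
        verify=False,  # mirrors validate_certs: false in the playbooks
        timeout=60,
    )
    resp.raise_for_status()
    # Later calls (e.g. /api/show/ports) send this back as a 'sessionKey' header.
    return resp.json()["status"][0]["response"]

print(pv_session_key("192.168.25.1", "manage", "changeme"))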
diff --git a/storage/roles/nfs_iscsi/tasks/ports.yml b/utils/obsolete/roles/nfs_iscsi/tasks/ports.yml similarity index 77% rename from storage/roles/nfs_iscsi/tasks/ports.yml rename to utils/obsolete/roles/nfs_iscsi/tasks/ports.yml index f954eb863..e0c9663c8 100644 --- a/storage/roles/nfs_iscsi/tasks/ports.yml +++ b/utils/obsolete/roles/nfs_iscsi/tasks/ports.yml @@ -14,17 +14,17 @@ --- - name: Include Powervault variables - include_vars: "{{ role_path }}/../../control_plane/input_params/powervault_vars.yml" + ansible.builtin.include_vars: "{{ role_path }}/../../control_plane/input_params/powervault_vars.yml" - name: Set powervault username and password - set_fact: + ansible.builtin.set_fact: powervault_username: "{{ hostvars['127.0.0.1']['powervault_username'] }}" powervault_password: "{{ hostvars['127.0.0.1']['powervault_password'] }}" powervault_ip: "{{ hostvars['127.0.0.1']['powervault_ip'] }}" no_log: true - name: Get auth string - shell: > + ansible.builtin.shell: > set -o pipefail && \ echo -n {{ powervault_username }}_{{ powervault_password }} | sha256sum register: auth_string @@ -32,58 +32,58 @@ delegate_to: localhost - name: Get auth string - shell: echo -n {{ powervault_username }}_{{ powervault_password }} | sha256sum + ansible.builtin.shell: echo -n {{ powervault_username }}_{{ powervault_password }} | sha256sum register: port_auth_string changed_when: false delegate_to: localhost no_log: true - ignore_errors: yes + ignore_errors: true - name: Get session key - uri: + ansible.builtin.uri: url: https://{{ powervault_ip }}/api/login/{{ port_auth_string.stdout | replace(" -", "") }} method: GET headers: {'datatype': 'json'} - validate_certs: no + validate_certs: false register: port_session_key delegate_to: localhost - name: Execute show system command - uri: + ansible.builtin.uri: url: https://{{ powervault_ip }}/api/show/system method: GET body_format: json - validate_certs: no - use_proxy: no + validate_certs: false + use_proxy: false timeout: 60 headers: {'sessionKey': "{{ port_session_key.json.status[0].response }}", 'datatype':'json'} register: system_info - name: Get the product id - set_fact: + ansible.builtin.set_fact: pv_id: "{{ system_info.json.system[0]['product-id'] }}" - name: Verify the product id and model no. 
of device - assert: + ansible.builtin.assert: that: ("ME4" in pv_id) fail_msg: "{{ fail_iscsi_support }}" - name: Show ports - uri: + ansible.builtin.uri: url: https://{{ powervault_ip }}/api/show/ports method: GET body_format: json - validate_certs: no - use_proxy: no + validate_certs: false + use_proxy: false headers: {'sessionKey': "{{ port_session_key.json.status[0].response }}", 'datatype':'json'} register: show_ports delegate_to: localhost - name: Up ports - set_fact: + ansible.builtin.set_fact: up_port: "{{ up_port + [item.port] }}" target_id: "{{ item.get('target-id') }}" when: item.status == "Up" @@ -92,18 +92,18 @@ label: "{{ item.port }}" - name: Set ip - set_fact: - set_port_ip: "{{ set_port_ip+['{{ port_ip }}{{ temp|int }}'] }}" - temp: "{{ temp|int+t|int }}" + ansible.builtin.set_fact: + set_port_ip: "{{ set_port_ip + ['{{ port_ip }}{{ temp | int }}'] }}" + temp: "{{ temp | int + t | int }}" loop: "{{ up_port }}" - name: Assign ip to ports - uri: - url: https://{{ powervault_ip }}/api/set/host-parameters/gateway/{{ port_gateway }}/ip/{{ item.0 }}/netmask/{{ port_netmask }}/ports/{{ item.1 }}/prompt/yes/noprompt + ansible.builtin.uri: + url: https://{{ powervault_ip }}/api/set/host-parameters/gateway/{{ port_gateway }}/ip/{{ item.0 }}/netmask/{{ port_netmask }}/ports/{{ item.1 }}/prompt/yes/noprompt # noqa: yaml[line-length] method: GET body_format: json - validate_certs: no - use_proxy: no + validate_certs: false + use_proxy: false headers: {'sessionKey': "{{ port_session_key.json.status[0].response }}", 'datatype':'json'} register: set_ports diff --git a/storage/roles/nfs_iscsi/tasks/validate_nfs_config.yml b/utils/obsolete/roles/nfs_iscsi/tasks/validate_nfs_config.yml similarity index 78% rename from storage/roles/nfs_iscsi/tasks/validate_nfs_config.yml rename to utils/obsolete/roles/nfs_iscsi/tasks/validate_nfs_config.yml index a1afaaf38..6028d16cd 100644 --- a/storage/roles/nfs_iscsi/tasks/validate_nfs_config.yml +++ b/utils/obsolete/roles/nfs_iscsi/tasks/validate_nfs_config.yml @@ -14,22 +14,22 @@ --- - name: Include variable file for powervault - include_vars: "{{ role_path }}/vars/main.yml" + ansible.builtin.include_vars: "{{ role_path }}/vars/main.yml" no_log: true - name: Refresh ssh-key - command: ssh-keygen -R {{ inventory_hostname }} + ansible.builtin.command: ssh-keygen -R {{ inventory_hostname }} register: ping_result changed_when: false failed_when: false - name: Check for nfs node nic - command: nmcli device show {{ pv_nic }} + ansible.builtin.command: nmcli device show {{ pv_nic }} register: nic_output changed_when: false failed_when: false - name: Fail if wrong nic - fail: + ansible.builtin.fail: msg: "{{ nic_error }}" - when: 'pv_nic in nic_output.stderr' \ No newline at end of file + when: 'pv_nic in nic_output.stderr' diff --git a/storage/roles/nfs_iscsi/vars/main.yml b/utils/obsolete/roles/nfs_iscsi/vars/main.yml similarity index 96% rename from storage/roles/nfs_iscsi/vars/main.yml rename to utils/obsolete/roles/nfs_iscsi/vars/main.yml index 14d84e95e..d9122d46a 100644 --- a/storage/roles/nfs_iscsi/vars/main.yml +++ b/utils/obsolete/roles/nfs_iscsi/vars/main.yml @@ -15,14 +15,14 @@ # vars file for nfs_iscsi -#Usage: check_prerequisites.yml +# Usage: check_prerequisites.yml nfs_node_group_fail_msg: "nfs_node group should contain exactly 1 node" nfs_node_group_success_msg: "nfs_node group check passed" config_filename: "omnia_config.yml" config_vaultname: .omnia_vault_key login_vars_filename: input_params/login_vars.yml vault_filename: 
input_params/.login_vault_key -vault_file_perm: 0644 +vault_file_perm: "0644" pv_ip_undefined_msg: "Please give IP of powervault connected to nfs_node in omnia_config.yml" powervault_ip_success_msg: "Powervault IP is reachable." powervault_ip_failure_msg: "Powervault IP is not defined in omnia_config.yml or not reacheable." @@ -40,7 +40,7 @@ nfs_services: - mountd - rpc-bind - nfs -nfs_share_dir_mode: 0777 +nfs_share_dir_mode: "0777" # Usage: nfs_node_configure.yml pv_nic: "{{ powervault_server_nic }}" @@ -71,4 +71,4 @@ fail_iscsi_support: "Failed. Only ME4 is supported for iscsi protocol" access: rw login_pv_file: "{{ playbook_dir }}/control_plane/input_params/login_vars.yml" login_pv_vault_file: "{{ playbook_dir }}/control_plane/input_params/.login_vault_key" -pv_map_ip: 192.168.25.5 \ No newline at end of file +pv_map_ip: 192.168.25.5 diff --git a/utils/roles/rhsm_subscription/files/readme.txt b/utils/obsolete/roles/rhsm_subscription/files/readme.txt similarity index 100% rename from utils/roles/rhsm_subscription/files/readme.txt rename to utils/obsolete/roles/rhsm_subscription/files/readme.txt diff --git a/utils/roles/rhsm_subscription/tasks/fetch_rhsm_inputs.yml b/utils/obsolete/roles/rhsm_subscription/tasks/fetch_rhsm_inputs.yml similarity index 100% rename from utils/roles/rhsm_subscription/tasks/fetch_rhsm_inputs.yml rename to utils/obsolete/roles/rhsm_subscription/tasks/fetch_rhsm_inputs.yml diff --git a/utils/roles/rhsm_subscription/tasks/include_vars.yml b/utils/obsolete/roles/rhsm_subscription/tasks/include_vars.yml similarity index 100% rename from utils/roles/rhsm_subscription/tasks/include_vars.yml rename to utils/obsolete/roles/rhsm_subscription/tasks/include_vars.yml diff --git a/utils/roles/rhsm_subscription/tasks/main.yml b/utils/obsolete/roles/rhsm_subscription/tasks/main.yml similarity index 100% rename from utils/roles/rhsm_subscription/tasks/main.yml rename to utils/obsolete/roles/rhsm_subscription/tasks/main.yml diff --git a/utils/roles/rhsm_subscription/tasks/redhat_subscription.yml b/utils/obsolete/roles/rhsm_subscription/tasks/redhat_subscription.yml similarity index 100% rename from utils/roles/rhsm_subscription/tasks/redhat_subscription.yml rename to utils/obsolete/roles/rhsm_subscription/tasks/redhat_subscription.yml diff --git a/utils/roles/rhsm_subscription/tasks/setup_rhsm_role.yml b/utils/obsolete/roles/rhsm_subscription/tasks/setup_rhsm_role.yml similarity index 100% rename from utils/roles/rhsm_subscription/tasks/setup_rhsm_role.yml rename to utils/obsolete/roles/rhsm_subscription/tasks/setup_rhsm_role.yml diff --git a/utils/roles/rhsm_subscription/tasks/unregister.yml b/utils/obsolete/roles/rhsm_subscription/tasks/unregister.yml similarity index 100% rename from utils/roles/rhsm_subscription/tasks/unregister.yml rename to utils/obsolete/roles/rhsm_subscription/tasks/unregister.yml diff --git a/utils/roles/rhsm_subscription/vars/main.yml b/utils/obsolete/roles/rhsm_subscription/vars/main.yml similarity index 100% rename from utils/roles/rhsm_subscription/vars/main.yml rename to utils/obsolete/roles/rhsm_subscription/vars/main.yml diff --git a/utils/oim_cleanup.yml b/utils/oim_cleanup.yml new file mode 100644 index 000000000..27ab6e1a0 --- /dev/null +++ b/utils/oim_cleanup.yml @@ -0,0 +1,25 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Check if virtual environment is active + ansible.builtin.import_playbook: check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + tags: provision, telemetry, local_repo, nfs_server + +- name: Cleanup Omnia Infrastructure Manager + hosts: localhost + connection: local + roles: + - oim_cleanup diff --git a/utils/performance_profile/ansible.cfg b/utils/performance_profile/ansible.cfg new file mode 100644 index 000000000..7dd553650 --- /dev/null +++ b/utils/performance_profile/ansible.cfg @@ -0,0 +1,17 @@ +[defaults] +log_path = /var/log/omnia/performance_profile.log +host_key_checking = false +forks = 5 +timeout = 180 +executable = /bin/bash +collections_path = $VIRTUAL_ENV +display_skipped_hosts = false + +[persistent_connection] +command_timeout = 180 +connect_timeout = 180 + +[ssh_connection] +retries = 3 +ssh_args = -o ControlMaster=auto -o ControlPersist=180 + diff --git a/utils/kernel_param_update/kernel_param_update.yml b/utils/performance_profile/performance_profile.yml similarity index 60% rename from utils/kernel_param_update/kernel_param_update.yml rename to utils/performance_profile/performance_profile.yml index 0df4b43ce..2eedc4f21 100644 --- a/utils/kernel_param_update/kernel_param_update.yml +++ b/utils/performance_profile/performance_profile.yml @@ -12,19 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. --- + +- name: Update Inventory with ansible_host information + ansible.builtin.import_playbook: ../servicetag_host_mapping.yml + when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) + - name: Validate inputs hosts: localhost connection: local tasks: - - name: Validate inputs for kernel_param_update + - name: Validate inputs for performance_profile ansible.builtin.include_role: - name: kcmdline_update + name: performance_profile tasks_from: validate_input.yml - -- name: Update kernel parameters for OS - hosts: all +- name: Running performance_profile tasks + hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd connection: ssh - gather_facts: true + become: true + vars_files: + - performance_profile_config.yml roles: - - kcmdline_update + - performance_profile diff --git a/utils/performance_profile/performance_profile_config.yml b/utils/performance_profile/performance_profile_config.yml new file mode 100644 index 000000000..04e07ecd6 --- /dev/null +++ b/utils/performance_profile/performance_profile_config.yml @@ -0,0 +1,26 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# This configuration file is used to set up performance tuning for an Intel Gaudi GPU.
+# It specifies a performance profile and allows users to modify or add system parameters
+# using plugins such as sysctl, cpu, and disk. Users can provide multiple
+# parameters under each plugin. If there is no need to modify the profile, the
+# 'performance_profile_plugin' section can be left blank.
+
+intel_gpu:
+  performance_profile: "accelerator-performance"
+  performance_profile_plugin:
+    sysctl:
+      - vm.nr_hugepages: 156300
+  reboot_required: false
diff --git a/utils/performance_profile/roles/performance_profile/files/validate_input.py b/utils/performance_profile/roles/performance_profile/files/validate_input.py
new file mode 100644
index 000000000..c721d7cec
--- /dev/null
+++ b/utils/performance_profile/roles/performance_profile/files/validate_input.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import json
+import sys, os
+
+def load_performance_config(file_path):
+    """
+    Loads and validates the tuned configuration from a YAML file.
+
+    Args:
+        file_path (str): Path to the YAML configuration file.
+
+    Returns:
+        dict: Parsed and validated tuned configuration.
+
+    Raises:
+        SystemExit: If any validation fails, exits the program with an error message.
+    """
+    with open(file_path, 'r') as file:
+        data = yaml.safe_load(file)
+
+    intel_gpu = data.get('intel_gpu', {})
+
+    if not intel_gpu:
+        sys.exit("intel_gpu not found")
+
+    performance_profile_name = intel_gpu.get('performance_profile', {})
+
+    if not performance_profile_name:
+        sys.exit("performance_profile is empty.")
+
+    performance_profile_plugin = intel_gpu.get('performance_profile_plugin', {})
+
+    if not performance_profile_plugin:
+        print("performance_profile_plugin is empty.
Setting profile values to default.") + return + + if not isinstance(performance_profile_plugin, dict): + sys.exit("Invalid format for performance_profile_plugin") + + if not all(isinstance(value, list) for value in performance_profile_plugin.values()): + sys.exit("Invalid format for performance_profile_plugin") + + if not all(isinstance(item, dict) for value in performance_profile_plugin.values() for item in value if item is not None): + sys.exit("Invalid format for performance_profile_plugin") + + for key, value in performance_profile_plugin.items(): + if not value: + sys.exit(f"Missing values for {key}") + + for item in value: + if not item: + sys.exit(f"Missing key-value pairs in {key}") + + for key, value in item.items(): + if value is None: + sys.exit(f"Missing values for {key}") + + print(f"{key} = {value}") + + if 'reboot_required' not in intel_gpu: + sys.exit("reboot_required is missing") + + if not isinstance(intel_gpu['reboot_required'], bool): + sys.exit("reboot_required must be either 'true' or 'false'") + + return intel_gpu + +def main(): + file_path = os.path.abspath(sys.argv[1]) + + try: + result = load_performance_config(file_path) + print("All validations passed") + print(result) + except Exception as e: + print(f"Error: {str(e)}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/utils/performance_profile/roles/performance_profile/tasks/main.yml b/utils/performance_profile/roles/performance_profile/tasks/main.yml new file mode 100644 index 000000000..624da7f34 --- /dev/null +++ b/utils/performance_profile/roles/performance_profile/tasks/main.yml @@ -0,0 +1,44 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
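Reviewer note: to make the accepted input shape concrete, here is a small Python sketch that builds a config matching the shipped performance_profile_config.yml and feeds it to validate_input.py the same way the role does. It assumes validate_input.py is in the current directory and PyYAML is installed; the temp-file path is illustrative.

# Sketch: write a sample config and run the validator against it.
# Expects validate_input.py in the working directory (an assumption);
# on this input it should print "All validations passed".
import subprocess
import sys
import tempfile

import yaml  # PyYAML, the same parser validate_input.py uses

sample = {
    "intel_gpu": {
        "performance_profile": "accelerator-performance",
        "performance_profile_plugin": {"sysctl": [{"vm.nr_hugepages": 156300}]},
        "reboot_required": False,
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".yml", delete=False) as f:
    yaml.safe_dump(sample, f)

subprocess.run([sys.executable, "validate_input.py", f.name], check=True)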
+--- +- name: Saving distribution of OS + ansible.builtin.set_fact: + compute_os: "{{ ansible_facts['distribution'] | lower }}" + compute_os_version: "{{ ansible_distribution_version }}" + +- name: Setting performance profile for Intel Gaudi accelerators + when: + - compute_os == 'ubuntu' + - compute_os_version == '22.04' + block: + - name: Import variables from performance_profile_config.yml + ansible.builtin.include_vars: + file: "{{ performance_profile_config_path }}" + + - name: Check if the node has Intel Gaudi accelerators + ansible.builtin.shell: > + set -o pipefail && \ + lspci | grep -i 'Processing accelerators: Habana Labs Ltd' + changed_when: false + register: lspci_output + failed_when: false + + - name: Run tasks for nodes with Intel Gaudi accelerators + ansible.builtin.include_tasks: "{{ setup_performance_profile_path }}" + when: lspci_output.stdout | length > 0 + + - name: Print status for nodes without Intel Gaudi accelerators + ansible.builtin.debug: + msg: "No Intel Gaudi accelerator found" + when: lspci_output.stdout | length == 0 diff --git a/utils/performance_profile/roles/performance_profile/tasks/reboot_node.yml b/utils/performance_profile/roles/performance_profile/tasks/reboot_node.yml new file mode 100644 index 000000000..7948f3827 --- /dev/null +++ b/utils/performance_profile/roles/performance_profile/tasks/reboot_node.yml @@ -0,0 +1,23 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Reboot node + block: + - name: Reboot node (This task will take some time) + ansible.builtin.reboot: + rescue: + - name: Failed to reboot node + ansible.builtin.fail: + msg: "{{ reboot_fail_msg }}" diff --git a/utils/performance_profile/roles/performance_profile/tasks/setup_performance_profile.yml b/utils/performance_profile/roles/performance_profile/tasks/setup_performance_profile.yml new file mode 100644 index 000000000..0ea41ceff --- /dev/null +++ b/utils/performance_profile/roles/performance_profile/tasks/setup_performance_profile.yml @@ -0,0 +1,80 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
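Reviewer note: the tasks/main.yml above gates all profile work on an lspci check for Habana Labs processing accelerators. A minimal Python sketch of the same detection, for reference; it only assumes lspci is on PATH.

# Sketch of the Gaudi check from tasks/main.yml: scan lspci output for
# Habana Labs processing accelerators and gate the profile setup on it.
import subprocess

def has_intel_gaudi():
    out = subprocess.run(["lspci"], capture_output=True, text=True).stdout
    return "processing accelerators: habana labs" in out.lower()

print("Gaudi found" if has_intel_gaudi() else "No Intel Gaudi accelerator found")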
+--- + +- name: Load configuration from performance_profile_config.yml + ansible.builtin.include_vars: + file: "{{ performance_profile_config_path }}" + name: performance_profile_config + +- name: Install tuned package + ansible.builtin.package: + name: tuned + state: present + +- name: Enable and start tuned service + ansible.builtin.service: + name: tuned + enabled: true + state: started + +- name: Ensure performance profile directory exists + ansible.builtin.stat: + path: "{{ default_profile_path }}" + register: parent_profile + +- name: Fail if performance profile does not exist + ansible.builtin.fail: + msg: "{{ profile_doesnt_exist_msg }}" + when: not parent_profile.stat.exists + +- name: Create a new directory for the modified profile + ansible.builtin.file: + path: "{{ modified_profile_path }}" + state: directory + mode: "{{ modified_profile_permissions }}" + when: performance_profile_plugins is defined and performance_profile_plugins | length > 0 + +- name: Updating tuned.conf + ansible.builtin.template: + src: "{{ tuned_conf_template_path }}" + dest: "{{ modified_tuned_conf_path }}" + mode: "{{ modified_tuned_conf_permissions }}" + when: performance_profile_plugins is defined and performance_profile_plugins | length > 0 + +- name: Remove tuned.conf if it exists when performance_profile_plugin is empty + ansible.builtin.file: + path: "{{ modified_tuned_conf_path }}" + state: absent + when: performance_profile_plugins is not defined or performance_profile_plugins | length == 0 + +- name: Apply the performance profile + ansible.builtin.command: + cmd: tuned-adm profile {{ performance_profile_name }} + register: output + changed_when: output.rc != 0 + +- name: Verify the active performance profile + ansible.builtin.command: + cmd: tuned-adm active + register: active_profile + changed_when: false + +- name: Display the active performance profile + ansible.builtin.debug: + msg: "{{ active_profile.stdout }}" + +- name: Include reboot task if required + ansible.builtin.include_tasks: "{{ reboot_node_file_path }}" + when: performance_profile_config.intel_gpu.reboot_required | default(false) diff --git a/utils/performance_profile/roles/performance_profile/tasks/validate_input.yml b/utils/performance_profile/roles/performance_profile/tasks/validate_input.yml new file mode 100644 index 000000000..91ecc2da1 --- /dev/null +++ b/utils/performance_profile/roles/performance_profile/tasks/validate_input.yml @@ -0,0 +1,43 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
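Reviewer note: the setup tasks above boil down to the standard tuned overlay mechanism: create a child profile under /etc/tuned whose tuned.conf includes the stock profile and appends the plugin sections (the same output the tuned.conf.j2 template further below renders), then apply it with tuned-adm. A minimal Python sketch of that end state, assuming tuned is installed, the stock profile exists under /usr/lib/tuned, and the script runs as root; the profile name and sysctl value mirror the sample config.

# Sketch of what setup_performance_profile.yml produces on the node.
# Assumes tuned is installed and this runs with root privileges.
import pathlib
import subprocess

profile = "accelerator-performance"                   # intel_gpu.performance_profile
plugins = {"sysctl": {"vm.nr_hugepages": "156300"}}   # from the sample config

parent = pathlib.Path(f"/usr/lib/tuned/{profile}")
if not parent.exists():
    raise SystemExit(f"performance profile {profile} does not exist.")

# [main] include=<parent> plus one section per plugin, as in tuned.conf.j2.
conf = [f"[main]\ninclude={profile}\n"]
for plugin, params in plugins.items():
    conf.append(f"\n[{plugin}]\n" + "".join(f"{k}={v}\n" for k, v in params.items()))

child = pathlib.Path(f"/etc/tuned/{profile}")
child.mkdir(parents=True, exist_ok=True)
(child / "tuned.conf").write_text("".join(conf))

subprocess.run(["tuned-adm", "profile", profile], check=True)
print(subprocess.run(["tuned-adm", "active"], capture_output=True, text=True).stdout)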
+---
+- name: Saving distribution of OS
+  ansible.builtin.set_fact:
+    compute_os: "{{ ansible_facts['distribution'] | lower }}"
+    compute_os_version: "{{ ansible_distribution_version }}"
+
+- name: Validate inputs
+  when:
+    - compute_os == 'ubuntu'
+    - compute_os_version == '22.04'
+  block:
+    - name: Inventory not provided
+      ansible.builtin.fail:
+        msg: "{{ empty_inventory_fail_msg }}"
+      when:
+        - groups['all'] is defined
+        - (groups['all'] | length == 0)
+
+    - name: Validate performance_profile_config.yml file
+      block:
+        - name: Validating performance_profile_config.yml
+          ansible.builtin.command: "{{ python_version }} {{ validate_input_py }} {{ performance_profile_config_path }}"
+          register: script_output
+          changed_when: false
+
+      rescue:
+        - name: Rescue block
+          ansible.builtin.fail:
+            msg: "{{ script_output.stderr }}"
+          when: script_output.stderr is defined
diff --git a/upgrade/roles/update_metadata/tasks/update.yml b/utils/performance_profile/roles/performance_profile/templates/tuned.conf.j2
similarity index 66%
rename from upgrade/roles/update_metadata/tasks/update.yml
rename to utils/performance_profile/roles/performance_profile/templates/tuned.conf.j2
index 020e3549c..32d43218e 100644
--- a/upgrade/roles/update_metadata/tasks/update.yml
+++ b/utils/performance_profile/roles/performance_profile/templates/tuned.conf.j2
@@ -11,14 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
----
-- name: Read file path parameters from upgrade_config.yml
-  ansible.builtin.include_vars:
-    file: upgrade_config.yml
-  changed_when: false
+[main]
+include={{ intel_gpu.performance_profile }}
 
-- name: Upgrade task files to destination path using shell command
-  ansible.builtin.shell:
-    cmd: "cp -r {{ old_input_location }} ../"
-  changed_when: false
+{% for plugin, parameters in intel_gpu.performance_profile_plugin.items() %}
+[{{ plugin }}]
+{% for param in parameters %}
+{% if param is mapping %}
+{% for key, value in param.items() %}
+{{ key }}={{ value }}
+{% endfor %}
+{% endif %}
+{% endfor %}
+{% endfor %}
\ No newline at end of file
diff --git a/utils/performance_profile/roles/performance_profile/vars/main.yml b/utils/performance_profile/roles/performance_profile/vars/main.yml
new file mode 100644
index 000000000..2cb0d7faf
--- /dev/null
+++ b/utils/performance_profile/roles/performance_profile/vars/main.yml
@@ -0,0 +1,41 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: main.yml
+
+setup_performance_profile_path: "{{ role_path }}/tasks/setup_performance_profile.yml"
+
+# Usage: validate_inputs.yml
+
+empty_inventory_fail_msg: "Failed. Inventory not provided. Re-run the playbook and pass an inventory using -i <inventory_path>."
+python_version: "{{ ansible_python_interpreter }}"
+validate_input_py: "{{ role_path }}/files/validate_input.py"
+
+# Usage: setup_performance_profile.yml
+
+reboot_node_file_path: "{{ role_path }}/tasks/reboot_node.yml"
+performance_profile_config_path: "{{ playbook_dir }}/performance_profile_config.yml"
+tuned_conf_template_path: "{{ role_path }}/templates/tuned.conf.j2"
+performance_profile_name: "{{ intel_gpu.performance_profile }}"
+performance_profile_plugins: "{{ intel_gpu.performance_profile_plugin }}"
+default_profile_path: "/usr/lib/tuned/{{ performance_profile_name }}"
+profile_doesnt_exist_msg: "performance profile {{ performance_profile_name }} does not exist."
+modified_profile_path: "/etc/tuned/{{ performance_profile_name }}"
+modified_profile_permissions: '0755'
+modified_tuned_conf_path: "{{ modified_profile_path }}/tuned.conf"
+modified_tuned_conf_permissions: '0644'
+
+# Usage: reboot_node.yml
+reboot_fail_msg: "Failed. Nodes should be rebooted manually."
diff --git a/utils/provision/delete_ports.py b/utils/provision/delete_ports.py
index 9e7c69251..7eacf01c3 100644
--- a/utils/provision/delete_ports.py
+++ b/utils/provision/delete_ports.py
@@ -39,7 +39,7 @@ def check_switch_table():
 def delete_node_object(nodename):
     # Delete the entry from /etc/hosts
     print("hello=", nodename)
-    command = f"makehosts -d {nodename}"
+    command = f"/opt/xcat/sbin/makehosts -d {nodename}"
     temp = subprocess.run([f'{command}'], shell=True)
 
     # Delete the nodes from xcat
@@ -48,13 +48,13 @@ def delete_node_object(nodename):
     temp = subprocess.run([f'{command}'], shell=True)
 
     # Run DHCP and dns
-    command = f"makedhcp -a"
+    command = f"/opt/xcat/sbin/makedhcp -a"
     temp = subprocess.run([f'{command}'], shell=True)
 
-    command = f"makedhcp -n"
+    command = f"/opt/xcat/sbin/makedhcp -n"
     temp = subprocess.run([f'{command}'], shell=True)
 
-    command = f"makedns -n"
+    command = f"/opt/xcat/sbin/makedns -n"
     temp = subprocess.run([f'{command}'], shell=True)
 
diff --git a/utils/provision/delete_switch_ports.yml b/utils/provision/delete_switch_ports.yml
index 42af52996..a2885e695 100644
--- a/utils/provision/delete_switch_ports.yml
+++ b/utils/provision/delete_switch_ports.yml
@@ -17,12 +17,12 @@
   hosts: localhost
   connection: local
   vars:
-    switch_ports_undefined: "Failed.Please provide switch_ports for the switch_ip in the switch inventory"
-    switch_ports_empty: "Failed. Please provide switch_ports details to be deleted in switch inventory"
-    fail_switch_inventory: "Failed. Please provide proper switch details in switch_based_deletion_config.yml"
-    delete_ports: delete_ports.py
-    python_version: "python3.9"
-    ping_file_path: "{{ playbook_dir }}/../../provision/roles/provision_validation/files/switch_v3_ping.py"
+    switch_ports_undefined: "Failed. Please provide switch_ports for the switch_ip in the switch inventory"
+    switch_ports_empty: "Failed. Please provide switch_ports details to be deleted in switch inventory"
+    fail_switch_inventory: "Failed.
Please provide proper switch details in switch_based_deletion_config.yml" + delete_ports: delete_ports.py + python_version: "{{ ansible_python_interpreter }}" + ping_file_path: "{{ playbook_dir }}/../../discovery/roles/discovery_validations/switch_based/files/switch_v3_ping.py" tasks: - name: Include vars ansible.builtin.include_vars: switch_based_deletion_config.yml @@ -44,10 +44,11 @@ ansible.builtin.command: | {{ python_version }} {{ ping_file_path }} {{ item.ip }} with_items: "{{ switch_based_details }}" + changed_when: true - name: Fail if switch_port not defined ansible.builtin.fail: - msg: "{{ switch_ports_undefined }}" + msg: "{{ switch_ports_undefined }}" when: item.delete_ports is not defined with_items: "{{ switch_based_details }}" @@ -66,3 +67,4 @@ ansible.builtin.command: | {{ python_version }} {{ delete_ports }} {{ item.ip }} {{ item.delete_ports }} with_items: "{{ switch_based_details }}" + changed_when: true diff --git a/utils/provision/switch_based_deletion_config.yml b/utils/provision/switch_based_deletion_config.yml index 0b86e5d6e..1e22600c6 100644 --- a/utils/provision/switch_based_deletion_config.yml +++ b/utils/provision/switch_based_deletion_config.yml @@ -13,10 +13,10 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRD VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # delete_ports indicates which port no. to delete from the DB. # If split port is needed, use colon (:) as shown in example below. @@ -26,4 +26,4 @@ # - { ip: 172.96.28.12, delete_ports: '1-48,49:3,50' } # - { ip: 172.96.28.14, delete_ports: '1,2,3,5' } switch_based_details: - - { ip: ,delete_ports: '' } \ No newline at end of file + - { ip: ,delete_ports: '' } diff --git a/utils/pull_images_to_nodes.yml b/utils/pull_images_to_nodes.yml index 059a01721..7b850fcb8 100644 --- a/utils/pull_images_to_nodes.yml +++ b/utils/pull_images_to_nodes.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + - name: Update Inventory with ansible_host information ansible.builtin.import_playbook: ../utils/servicetag_host_mapping.yml when: not ( hostvars['127.0.0.1']['update_inventory_executed'] | default(false) | bool ) diff --git a/utils/remove_node_configuration.yml b/utils/remove_node_configuration.yml index 8d222af04..443112a79 100644 --- a/utils/remove_node_configuration.yml +++ b/utils/remove_node_configuration.yml @@ -13,6 +13,10 @@ # limitations under the License. --- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + - name: Warning and User confirmation for removing slurm and kube node hosts: localhost connection: local diff --git a/utils/reset_cluster_configuration.yml b/utils/reset_cluster_configuration.yml index 197668d3f..9c98e1f9d 100644 --- a/utils/reset_cluster_configuration.yml +++ b/utils/reset_cluster_configuration.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: check_venv.yml + when: not ( check_venv_executed | default(false) | bool ) + - name: Warning and User confirmation for removing cluster hosts: localhost connection: local diff --git a/utils/roles/check_package_lock/files/check_apt_lock.sh b/utils/roles/check_package_lock/files/check_apt_lock.sh new file mode 100644 index 000000000..77cdbb71f --- /dev/null +++ b/utils/roles/check_package_lock/files/check_apt_lock.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# This script checks for active apt locks on an Ubuntu system. +# It verifies if the following lock files are held: +# - /var/lib/apt/lists +# - /var/lib/dpkg/lock +# - /var/lib/dpkg/lock-frontend +# If any locks are found, it outputs a message and exits with a status of 1. +# If no locks are held, it outputs a confirmation message and exits with a status of 0. + + +# Check if /var/lib/apt/lists lock is held +if fuser /var/lib/apt/lists >/dev/null 2>&1; then + echo "/var/lib/apt/lists is locked" + exit 1 +fi + +# Check if /var/lib/dpkg/lock is held +if fuser /var/lib/dpkg/lock >/dev/null 2>&1; then + echo "/var/lib/dpkg/lock is locked" + exit 1 +fi + +# Check if /var/lib/dpkg/lock-frontend is held +if fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; then + echo "/var/lib/dpkg/lock-frontend is locked" + exit 1 +fi + +# If no lock is held +echo "No APT locks are held" +exit 0 diff --git a/utils/roles/check_package_lock/tasks/check_apt_lock.yml b/utils/roles/check_package_lock/tasks/check_apt_lock.yml new file mode 100644 index 000000000..9f5472422 --- /dev/null +++ b/utils/roles/check_package_lock/tasks/check_apt_lock.yml @@ -0,0 +1,36 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +- name: Ensure the check_apt_lock.sh script is executable + ansible.builtin.file: + path: "{{ check_apt_lock }}" + mode: "{{ file_mode }}" + state: file + +- name: Check if any APT lock is held + ansible.builtin.shell: "{{ check_apt_lock }}" # noqa: command-instead-of-shell + register: apt_lock_check + changed_when: false + failed_when: false + +- name: Print APT lock status + ansible.builtin.debug: + msg: "{{ apt_lock_status }}" + +- name: Fail if APT lock is held + ansible.builtin.fail: + msg: "{{ apt_lock_failure_msg }}" + when: apt_lock_check.rc != 0 diff --git a/upgrade/roles/backup_omniadb/tasks/main.yml b/utils/roles/check_package_lock/tasks/main.yml similarity index 80% rename from upgrade/roles/backup_omniadb/tasks/main.yml rename to utils/roles/check_package_lock/tasks/main.yml index 77b1ee49b..ab257cd4a 100644 --- a/upgrade/roles/backup_omniadb/tasks/main.yml +++ b/utils/roles/check_package_lock/tasks/main.yml @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
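Reviewer note: check_apt_lock.sh above relies on fuser's exit status, which is 0 when at least one process holds the path. The same check in Python, for reference; it only assumes fuser is available (psmisc package on Ubuntu).

# Sketch of the same APT lock check in Python: a zero return code from
# `fuser` means some process holds the lock, so we bail out.
import subprocess
import sys

for lock in ("/var/lib/apt/lists", "/var/lib/dpkg/lock", "/var/lib/dpkg/lock-frontend"):
    if subprocess.run(["fuser", lock], capture_output=True).returncode == 0:
        print(f"{lock} is locked")
        sys.exit(1)

print("No APT locks are held")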
+
 ---
-- name: Install packages
-  ansible.builtin.include_tasks: install_packages.yml
 
-- name: Backup old data
-  ansible.builtin.include_tasks: backup_old_data.yml
+- name: Check apt package lock
+  ansible.builtin.include_tasks: check_apt_lock.yml
+  when: ansible_distribution | lower == ubuntu_os
diff --git a/utils/roles/check_package_lock/vars/main.yml b/utils/roles/check_package_lock/vars/main.yml
new file mode 100644
index 000000000..b10189abc
--- /dev/null
+++ b/utils/roles/check_package_lock/vars/main.yml
@@ -0,0 +1,21 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: check_apt_lock.yml
+check_apt_lock: "{{ role_path }}/files/check_apt_lock.sh"
+file_mode: "0755"
+apt_lock_failure_msg: "apt lock is held! Please unlock it to proceed with further installation. Once the apt lock is cleared, rerun the playbook."
+apt_lock_status: "apt lock status: {{ apt_lock_check.stdout | default('No lock found') }}"
+ubuntu_os: "ubuntu"
diff --git a/utils/roles/check_venv/tasks/main.yml b/utils/roles/check_venv/tasks/main.yml
new file mode 100644
index 000000000..40b93ba80
--- /dev/null
+++ b/utils/roles/check_venv/tasks/main.yml
@@ -0,0 +1,64 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check virtual environment
+  ansible.builtin.set_fact:
+    venv_path: "{{ lookup('ansible.builtin.env', 'VIRTUAL_ENV') }}"
+
+- name: Determine if venv is active
+  ansible.builtin.set_fact:
+    is_venv_active: "{{ venv_path is defined and venv_path | length > 0 }}"
+
+- name: Fail if VIRTUAL_ENV is not set
+  ansible.builtin.fail:
+    msg: "{{ venv_active_fail_msg }}"
+  when: not is_venv_active
+
+- name: Check if venv is created by omnia
+  ansible.builtin.stat:
+    path: "{{ venv_path }}/.omnia"
+  register: omnia_file
+
+- name: Set fact if it is created by omnia
+  ansible.builtin.set_fact:
+    omnia_file_exists: "{{ omnia_file.stat.exists }}"
+  when: is_venv_active
+
+- name: Fail if it is not created by omnia
+  ansible.builtin.fail:
+    msg: "{{ venv_not_by_omnia_fail_msg }}"
+  when: is_venv_active and not omnia_file_exists
+
+- name: Set fact for omnia collection path
+  ansible.builtin.set_fact:
+    omnia_collection_path: "{{ lookup('ansible.builtin.config', 'COLLECTIONS_PATHS') }}"
+
+- name: Check if omnia collection path matches venv path
+  ansible.builtin.set_fact:
+    path_match: "{{ omnia_collection_path[0] == venv_path }}"
+
+- name: Warning if collection path does not match venv path
+  ansible.builtin.debug:
+    msg: "{{ collections_path_not_match_warning_msg }}"
+  when: not path_match
+
+- name: Installed ansible collection path
+  ansible.builtin.debug:
+    msg: "{{ installed_ansible_collections_path_info_msg }}"
+
+- name: Playbook is running inside omnia-created venv
+  ansible.builtin.debug:
+    msg: "{{ venv_active_success_msg }}"
+  when: is_venv_active and omnia_file_exists
diff --git a/utils/roles/check_venv/vars/main.yml b/utils/roles/check_venv/vars/main.yml
new file mode 100644
index 000000000..113b089fd
--- /dev/null
+++ b/utils/roles/check_venv/vars/main.yml
@@ -0,0 +1,27 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Usage: check_venv.yml
+venv_active_fail_msg: "It seems the Python virtual environment for Omnia isn’t active. Please activate it using the following command: source /opt/omnia/omnia17_venv/bin/activate." # noqa: yaml[line-length]
+venv_not_by_omnia_fail_msg: >
+  The Python virtual environment at {{ venv_path }} was not created by the Omnia prereq.sh script.
+  Please deactivate this virtual environment and re-run the prereq.sh script to set up the Omnia virtual environment.
+  If prereq.sh has already been run, please activate the Omnia virtual environment using the command: source /opt/omnia/omnia17_venv/bin/activate.
+collections_path_not_match_warning_msg: >
+  [WARNING]: Expected collection path {{ venv_path }}, but got {{ omnia_collection_path[0] }}.
+  In this case, ansible collections might get installed outside the virtual environment.
+installed_ansible_collections_path_info_msg: > + Ansible collections will be installed under path {{ omnia_collection_path[0] }} with the current ansible configuration +venv_active_success_msg: "The playbook is running inside a virtual environment: {{ venv_path }}" diff --git a/utils/roles/cluster_preperation/tasks/fetch_provision_password.yml b/utils/roles/cluster_preperation/tasks/fetch_provision_password.yml index b1450df30..eac25e222 100644 --- a/utils/roles/cluster_preperation/tasks/fetch_provision_password.yml +++ b/utils/roles/cluster_preperation/tasks/fetch_provision_password.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check provision_config.yml file is encrypted ansible.builtin.command: cat {{ provision_config_filename }} changed_when: false @@ -26,7 +21,7 @@ - name: Decrpyt provision_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ provision_config_filename }} + ansible-vault decrypt {{ provision_config_filename }} --vault-password-file {{ provision_vault_path }} changed_when: false when: ansible_vault_search_key in provision_config_content.stdout @@ -51,7 +46,7 @@ - name: Encrypt provision_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ provision_config_filename }} + ansible-vault encrypt {{ provision_config_filename }} --vault-password-file {{ provision_vault_path }} changed_when: false diff --git a/utils/roles/common/tasks/include_omnia_config.yml b/utils/roles/common/tasks/include_omnia_config.yml index 37a87b1f1..0b785131c 100644 --- a/utils/roles/common/tasks/include_omnia_config.yml +++ b/utils/roles/common/tasks/include_omnia_config.yml @@ -13,11 +13,6 @@ # limitations under the License. --- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check omnia_config.yml file is encrypted ansible.builtin.command: cat {{ omnia_config_filename }} changed_when: false @@ -26,7 +21,7 @@ - name: Decrpyt omnia_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ omnia_config_filename }} + ansible-vault decrypt {{ omnia_config_filename }} --vault-password-file {{ omnia_vault_path }} changed_when: false when: ansible_vault_search_key in omnia_config_content.stdout @@ -44,7 +39,7 @@ - name: Encrypt omnia_config.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ omnia_config_filename }} + ansible-vault encrypt {{ omnia_config_filename }} --vault-password-file {{ omnia_vault_path }} changed_when: false when: ansible_vault_search_key in omnia_config_content.stdout diff --git a/utils/roles/common/tasks/include_provision_config_credentials.yml b/utils/roles/common/tasks/include_provision_config_credentials.yml index eb2d0e4f7..7b99713e0 100644 --- a/utils/roles/common/tasks/include_provision_config_credentials.yml +++ b/utils/roles/common/tasks/include_provision_config_credentials.yml @@ -13,11 +13,6 @@ # limitations under the License. 
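The vault tasks above now call ansible-vault directly instead of resolving its path with whereis. A hedged sketch of the decrypt/re-encrypt round trip they implement; the config and key paths below are illustrative examples, not fixed Omnia paths:

#!/bin/bash
# Decrypt a vaulted config, inspect it, and re-encrypt it with the same key,
# mirroring the task flow above. ansible-vault must be on PATH (for example,
# inside the Omnia virtual environment).
CONFIG=input/omnia_config.yml        # example file
VAULT_KEY=input/.omnia_vault_key     # example vault password file
if grep -q 'ANSIBLE_VAULT' "$CONFIG"; then   # same encryption marker the tasks search for
    ansible-vault decrypt "$CONFIG" --vault-password-file "$VAULT_KEY"
fi
cat "$CONFIG"
ansible-vault encrypt "$CONFIG" --vault-password-file "$VAULT_KEY"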
--- -- name: Fetch ansible-vault path - ansible.builtin.command: whereis ansible-vault - changed_when: false - register: ansible_vault_path - - name: Check provision_config_credentials.yml file is encrypted ansible.builtin.command: cat {{ credentials_config_filename }} changed_when: false @@ -26,7 +21,7 @@ - name: Decrpyt provision_config_credentials.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} decrypt {{ credentials_config_filename }} + ansible-vault decrypt {{ credentials_config_filename }} --vault-password-file {{ credentials_vault_path }} changed_when: false when: ansible_vault_search_key in file_content.stdout @@ -44,7 +39,7 @@ - name: Encrypt provision_config_credentials.yml ansible.builtin.command: >- - {{ ansible_vault_path.stdout.split(' ')[1] }} encrypt {{ credentials_config_filename }} + ansible-vault encrypt {{ credentials_config_filename }} --vault-password-file {{ credentials_vault_path }} changed_when: false when: ansible_vault_search_key in file_content.stdout diff --git a/utils/roles/control_plane_cleanup/vars/rocky.yml b/utils/roles/control_plane_cleanup/vars/rocky.yml deleted file mode 120000 index ba2f905fb..000000000 --- a/utils/roles/control_plane_cleanup/vars/rocky.yml +++ /dev/null @@ -1 +0,0 @@ -redhat.yml \ No newline at end of file diff --git a/utils/roles/inventory_tagging/files/inventory_tagging.py b/utils/roles/inventory_tagging/files/inventory_tagging.py index 99e1f40c6..96caf5328 100644 --- a/utils/roles/inventory_tagging/files/inventory_tagging.py +++ b/utils/roles/inventory_tagging/files/inventory_tagging.py @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,169 +13,196 @@ # limitations under the License. import os -import configparser +import json import sys +import logging +from typing import List, Tuple +import argparse +import commentedconfigparser +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) -db_path = sys.argv[1] -omnia_inventory_dir_path = sys.argv[2] -sys.path.insert(0, db_path) - -import omniadb_connection - -def add_inventory_files(inventory_filename_list: list[str]) -> None: - """ - Create inventory files. - - Args: - inventory_filename_list (list[str]): A list of filenames for the inventory files. - """ - # Create the directory if it doesn't exist - if not os.path.exists(omnia_inventory_dir_path): - os.makedirs(omnia_inventory_dir_path, mode=0o644) - - # Iterate over the inventory filenames - for filename in inventory_filename_list: - file_path = os.path.join(omnia_inventory_dir_path, filename) - with open(file_path, 'w', encoding='utf-8') as file: - # Write the group name to the file - group_name = f"[{filename}]" - file.write(group_name) - - -def get_cluster_details_db(): - """ - Retrieves the cluster details from the database. - Returns: - nodes_info (list): A list of tuples containing the service tag, admin IP, CPU, and GPU of each node in the cluster. 
- """ - # Create a connection to the database - conn = omniadb_connection.create_connection() - cursor = conn.cursor() - - # Define the SQL query to retrieve the nodes information - query = """ - SELECT - node, - service_tag, - admin_ip, - cpu, - gpu - FROM - cluster.nodeinfo - WHERE - status = 'booted' - """ - - # Execute the SQL query - cursor.execute(query) - - # Fetch all the rows returned by the query - nodes_info = cursor.fetchall() - - # Close the database connection - conn.close() - - # Return the nodes information - return nodes_info - - -def add_servicetag_inventory(inventory_file: str, service_tag: str) -> None: - """ - Adds a service tag to the inventory file. - Args: - inventory_file (str): The path to the inventory file. - service_tag (str): The service tag to add. +class InventoryManager: + """Inventory tagging module. + Handles inventory updates and database interactions for nodes. """ - try: - # Read the config file - config = configparser.ConfigParser(allow_no_value=True) - config.read(inventory_file, encoding='utf-8') - - # Set the service tag - config.set(inventory_file, service_tag) - - # Write the inventory file - with open(inventory_file, 'w', encoding='utf-8') as configfile: - config.write(configfile, space_around_delimiters=False) - except (configparser.DuplicateOptionError, - configparser.DuplicateSectionError, - configparser.NoSectionError, - Exception) as err: - print(f'''inventory_tagging:add_servicetag_inventory: - Error adding service tag {service_tag} to inventory file {inventory_file}. - Error type: {str(type(err))}. - Error message: {str(err)}''') - - -def update_inventory(node_detail): - """ - Update the inventory based on the given node details. - Args: - node_detail (tuple): A tuple containing the service tag, admin IP, CPU, and GPU. - Returns: - None - """ - # Unpack the node_detail tuple - node, service_tag, admin_ip, cpu, gpu = node_detail - - # Check if service_tag is empty or None - if not service_tag: - # Inventory files will not be updated if service_tag is empty or None - print(f'''inventory_tagging:update_inventory: - Service tag is unavailable for node {node}, skipping inventory update.''') - return - - try: - # Change the working directory to /opt/omnia/omnia_inventory - if os.getcwd() != omnia_inventory_dir_path: - os.chdir(omnia_inventory_dir_path) - except OSError as err: - # Log an error message if changing directory fails - print(f'''inventory_tagging:update_inventory: - Error changing current working directory to {omnia_inventory_dir_path}. - Error type: {str(type(err))}. - Error message: {str(err)}''') - - # Update inventory files based on CPU info - if cpu: - # Add service tag to corresponding inventory file - inventory_file_str = "compute_cpu_intel" if cpu == "intel" else "compute_cpu_amd" - add_servicetag_inventory(inventory_file_str, service_tag) - # Add service tag and admin ip to compute_servicetag_ip inventory file - service_tag_ip_str = f"{service_tag} ansible_host={admin_ip}" - add_servicetag_inventory("compute_servicetag_ip", service_tag_ip_str) - - # Update inventory files based on GPU info - if gpu: - inventory_file_str = "compute_gpu_nvidia" if gpu == "nvidia" else "compute_gpu_amd" - add_servicetag_inventory(inventory_file_str, service_tag) - - -def change_inventory_file_permission(inventory_files: list[str]): - """ - Change the permission of the inventory files to read-only. - Args: - inventory_files (list[str]): A list of inventory files to change permission. 
-    """
-    # Iterate over the inventory files
-    for inventory_file in inventory_files:
+    def __init__(self, inventory_filenames, vendors, inventory_dir_path, path_to_db_file):
+        self.inventory_filenames = inventory_filenames
+        self.vendors = vendors
+        self.inventory_dir_path = os.path.abspath(inventory_dir_path)
+        self.db_path = path_to_db_file
+
+    def add_inventory_files(self) -> None:
+        """
+        Create inventory files based on the configured filenames.
+        """
+        inventory_header = "# This file is generated by omnia, and should not be edited\n"
+        # Create the directory if it doesn't exist
+        if not os.path.exists(self.inventory_dir_path):
+            os.makedirs(self.inventory_dir_path, mode=0o644)  # Use directory permissions
+        # Iterate over the inventory filenames
+        for filename in self.inventory_filenames:
+            file_path = os.path.join(self.inventory_dir_path, filename)
+            file_path = os.path.abspath(file_path)
+            with open(file_path, 'w', encoding='utf-8') as file:
+                # Write the header to the file
+                file.write(inventory_header)
+                # Write the group name to the file
+                group_name = f"[{filename}]\n"
+                file.write(group_name)
+                file.flush()
+
+    def get_cluster_details_db(self) -> List[Tuple[str, str, str, str, str, str]]:
+        """
+        Retrieves the cluster details from the database.
+        Returns:
+            List[Tuple[str, str, str, str, str, str]]: A list of tuples containing the
+            node, service tag, hostname, admin IP, CPU, and GPU of each node in the cluster.
+        """
+        if self.db_path:
+            sys.path.insert(0, self.db_path)
+            try:
+                # pylint: disable=C0415
+                from omniadb_connection import create_connection
+            except ImportError:
+                logger.error("Failed to import omniadb_connection module from db_path: %s",
+                             self.db_path)
+                return []
+            with create_connection() as conn:
+                cursor = conn.cursor()
+                query = """
+                SELECT
+                    node,
+                    service_tag,
+                    hostname,
+                    admin_ip,
+                    cpu,
+                    gpu
+                FROM
+                    cluster.nodeinfo
+                WHERE
+                    status = 'booted'
+                """
+                cursor.execute(query)
+                nodes_info = cursor.fetchall()
+                return nodes_info
+        else:
+            logger.error("The value is missing for db_path: %s", self.db_path)
+            return []
+
+    def add_hostname_inventory(self, inventory_file: str, hostname: str) -> None:
+        """
+        Adds a hostname to the inventory file if it does not already exist.
+        Args:
+            inventory_file (str): The path to the inventory file.
+            hostname (str): The hostname to add.
+        """
+        try:
+            # Read the content of the file if it exists
+            if os.path.exists(os.path.abspath(inventory_file)):
+                with open(os.path.abspath(inventory_file), 'r', encoding='utf-8') as file:
+                    lines = file.readlines()
+            else:
+                lines = []
+            if lines:
+                # Check if the hostname is already in the file
+                if any(hostname in line for line in lines):
+                    logger.info("Hostname '%s' already exists in %s. Skipping addition.", hostname, inventory_file)
+                    return
+            # Read the config file
+            config = commentedconfigparser.CommentedConfigParser(allow_no_value=True)
+            config.read(inventory_file, encoding='utf-8')
+
+            # Check if the section exists, otherwise create it
+            if not config.has_section(inventory_file):
+                config.add_section(inventory_file)
+
+            # Set the hostname under the correct section
+            config.set(inventory_file, hostname, None)  # Use None as value since no value is required
+
+            # Write the inventory file
+            with open(os.path.abspath(inventory_file), 'w', encoding='utf-8') as configfile:
+                config.write(configfile, space_around_delimiters=False)
+                configfile.flush()
+        except KeyError as e:
+            logger.error("inventory_tagging:add_hostname_inventory: "
+                         "Error adding hostname %s to inventory file %s. "
+                         "Error type: %s. "
+                         "Error message: %s",
+                         hostname, inventory_file, type(e), e)
+        except (OSError, Exception) as err:  # pylint: disable=W0718
+            logger.error("inventory_tagging:add_hostname_inventory: "
+                         "Error adding hostname %s to inventory file %s. "
+                         "Error type: %s. "
+                         "Error message: %s",
+                         hostname, inventory_file, type(err), err
+                         )
+
+    def update_inventory(self, node_detail: Tuple[str, str, str, str, str, str]) -> None:
+        """
+        Update the inventory based on the given node details.
+        Args:
+            node_detail (Tuple[str, str, str, str, str, str]): A tuple containing the node,
+            service tag, hostname, admin IP, CPU, and GPU.
+        """
+        # Unpack the node_detail tuple
+        node, service_tag, hostname, admin_ip, cpu, gpu = node_detail
+        if not hostname:
+            logger.warning("inventory_tagging:update_inventory: "
+                           "hostname is unavailable for node %s; skipping inventory update", node)
+            return
         try:
-            # Change the permission of the file to read-only
-            os.chmod(inventory_file, 0o444)
+            if os.getcwd() != self.inventory_dir_path:
+                os.chdir(self.inventory_dir_path)
         except OSError as err:
-            # Log the error if changing the permission fails
-            print(f'''inventory_tagging:change_inventory_file_permission:
-                  Error changing file permission to read-only for {inventory_file}.
-                  Error type: {str(type(err))}.
-                  Error message: {str(err)}''')
-
+            logger.error("inventory_tagging:update_inventory: "
+                         "Error changing current working directory to %s.\n"
+                         "Error type: %s.\n"
+                         "Error message: %s",
+                         self.inventory_dir_path, type(err), str(err))
+        if cpu:
+            inventory_file_name = self.vendors.get("cpu", {}).get(cpu)
+            if inventory_file_name:
+                self.add_hostname_inventory(inventory_file_name, hostname)
+            hostname_ip_str = f"{hostname} ansible_host={admin_ip}"
+            self.add_hostname_inventory("compute_hostname_ip", hostname_ip_str)
+        if gpu:
+            inventory_file_name = self.vendors.get("gpu", {}).get(gpu)
+            if inventory_file_name:
+                self.add_hostname_inventory(inventory_file_name, hostname)
+
+    def change_inventory_file_permission(self, inventory_files: List[str]) -> None:
+        """
+        Change the permission of the inventory files to read-only.
+        """
+        try:
+            if os.getcwd() != self.inventory_dir_path:
+                os.chdir(self.inventory_dir_path)
+        except OSError as err:
+            logger.error("Error changing directory to %s: %s", self.inventory_dir_path, err)
+        for inventory_file in inventory_files:
+            try:
+                os.chmod(inventory_file, 0o444)
+            except OSError as err:
+                logger.error("Error changing file permission to read-only for %s: %s",
+                             inventory_file, err)
 
 if __name__ == "__main__":
-    # Define the list of inventory filenames
-    inventory_files = ["compute_cpu_intel", "compute_cpu_amd", "compute_gpu_nvidia", "compute_gpu_amd", "compute_servicetag_ip"]
-    add_inventory_files(inventory_files)  # Add new inventory files
-    node_detail_list = get_cluster_details_db()  # Get details for all node from DB.
-    for node_detail in node_detail_list:
-        update_inventory(node_detail)  # Update inventory files with service tag entries.
-    change_inventory_file_permission(inventory_files)  # Change permission of inventory files to read-only.
+ parser = argparse.ArgumentParser(description="Inventory Manager Configuration") + parser.add_argument('--inventory_files', type=json.loads, help='List of inventory files.') + parser.add_argument('--vendors', type=json.loads, help='Vendors configuration.') + parser.add_argument('--inventory_dir_location', type=str, help='Directory path.') + parser.add_argument('--db_path', type=str, help='Path to the database files.') + args = parser.parse_args() + + manager = InventoryManager( + inventory_filenames=args.inventory_files, + vendors=args.vendors, + inventory_dir_path=args.inventory_dir_location, + path_to_db_file=args.db_path + ) + manager.add_inventory_files() + node_detail_list = manager.get_cluster_details_db() + for info in node_detail_list: + manager.update_inventory(info) + manager.change_inventory_file_permission(manager.inventory_filenames) diff --git a/utils/roles/inventory_tagging/tasks/main.yml b/utils/roles/inventory_tagging/tasks/main.yml index e3e19ac85..ae8b26bba 100644 --- a/utils/roles/inventory_tagging/tasks/main.yml +++ b/utils/roles/inventory_tagging/tasks/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,17 @@ # limitations under the License. --- +- name: Install python commented config parser + ansible.builtin.command: "{{ python_version }} -m pip install {{ commentedconfigparser_python_package }}" + changed_when: true + - name: Generate inventory files - ansible.builtin.command: "{{ python_version }} {{ script_name }} {{ db_path }} {{ omnia_inventory_dir_path }}" + ansible.builtin.command: > + {{ python_version }} {{ script_name }} + --inventory_files='{{ inventory_files | to_json }}' + --vendors='{{ vendors | to_json }}' + --inventory_dir_location='{{ omnia_inventory_dir_path }}' + --db_path='{{ db_path }}' changed_when: false - name: Display Inventory File Location diff --git a/utils/roles/inventory_tagging/vars/main.yml b/utils/roles/inventory_tagging/vars/main.yml index 5546b74e2..515a1a10f 100644 --- a/utils/roles/inventory_tagging/vars/main.yml +++ b/utils/roles/inventory_tagging/vars/main.yml @@ -14,8 +14,25 @@ --- # Usage: main.yml -python_version: python3.9 +python_version: "{{ ansible_python_interpreter }}" script_name: "{{ role_path }}/files/inventory_tagging.py" db_path: "{{ role_path }}/../../../discovery/roles/db_operations/files/" omnia_inventory_dir_path: "/opt/omnia/omnia_inventory" inventory_files_creation_msg: "Inventory files created at '/opt/omnia/omnia_inventory/'" +commentedconfigparser_python_package: "commented-configparser" +inventory_files: + - compute_cpu_intel + - compute_cpu_amd + - compute_gpu_nvidia + - compute_gpu_amd + - compute_gpu_intel + - compute_hostname_ip + +vendors: + cpu: + intel: compute_cpu_intel + amd: compute_cpu_amd + gpu: + nvidia: compute_gpu_nvidia + amd: compute_gpu_amd + intel: compute_gpu_intel diff --git a/utils/roles/node_deletion/delete_node/files/delete_node_info.py b/utils/roles/node_deletion/delete_node/files/delete_node_info.py index 64556ba56..ea524b23e 100644 --- a/utils/roles/node_deletion/delete_node/files/delete_node_info.py +++ b/utils/roles/node_deletion/delete_node/files/delete_node_info.py @@ -12,21 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
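With the argparse interface above, the generator now takes its file list and vendor map as JSON. A sketch of an equivalent manual run, using the defaults from vars/main.yml; the interpreter and relative db_path are illustrative:

#!/bin/bash
# Manual invocation of inventory_tagging.py with the same arguments the role
# renders. Requires the commented-configparser package in the environment.
python3.11 inventory_tagging.py \
    --inventory_files='["compute_cpu_intel", "compute_cpu_amd", "compute_gpu_nvidia", "compute_gpu_amd", "compute_gpu_intel", "compute_hostname_ip"]' \
    --vendors='{"cpu": {"intel": "compute_cpu_intel", "amd": "compute_cpu_amd"}, "gpu": {"nvidia": "compute_gpu_nvidia", "amd": "compute_gpu_amd", "intel": "compute_gpu_intel"}}' \
    --inventory_dir_location=/opt/omnia/omnia_inventory \
    --db_path=../../../discovery/roles/db_operations/files/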
-#!/usr/bin/env python3.9
+#!/usr/bin/env python3.11
 '''
-    This module contains tasks required to delete node details from control plane- DB and inventory files
+    This module contains tasks required to delete node details from the Omnia Infrastructure Manager DB and inventory files
 '''
 import sys
 import subprocess
+import os
-
-def delete_node_info_from_cp(nodename):
+def delete_node_info_from_oim(nodename):
     '''
-        This modules deletes node object
+        This module deletes the node object
     '''
-
+
     try:
         # Delete the entry from /etc/hosts
         command = ['/opt/xcat/sbin/makehosts', '-d', nodename]
@@ -45,11 +45,11 @@ def delete_node_info_from_cp(nodename):
         command = ['/opt/xcat/sbin/makedns', '-n']
         temp = subprocess.run(command, shell=False, check=True)
-
+
     except subprocess.CalledProcessError as e:
-        print(f"delete_node_info_from_cp: {e}")
+        print(f"delete_node_info_from_oim: {e}")
+
-
 def delete_node_info_from_inventory_files(inv_file_folder, nodeinfo):
     '''
@@ -59,21 +59,26 @@ def delete_node_info_from_inventory_files(inv_file_folder, nodeinfo):
     servicetag = ''
     found = False
-    inv_files = ["compute_servicetag_ip", "compute_gpu_amd", "compute_gpu_nvidia", "compute_cpu_amd", "compute_cpu_intel"]
-    for file_name in inv_files:
+    inv_files = ["compute_hostname_ip", "compute_gpu_amd", "compute_gpu_nvidia", "compute_cpu_amd", "compute_cpu_intel", "compute_gpu_intel"]
+    for file_name in inv_files:
         try:
-            with open(inv_file_folder+file_name,"r") as f:
+            file_path = os.path.join(inv_file_folder, file_name)
+            with open(file_path, "r") as f:
                 new_f = f.readlines()
-
-            with open(inv_file_folder+file_name, "w") as f:
-                for line in new_f:
-                    if nodeinfo.lower() not in line:
-                        f.write(line)
-
+            print(f"Original contents of {file_name}: {new_f}")
+
+            with open(file_path, "w") as f:
+                if new_f:
+                    for line in new_f:
+                        if nodeinfo.lower() not in line.lower():
+                            f.write(line)
+                        else:
+                            print(f"Deleting line: {line.strip()}")
+
         except FileNotFoundError:
             print(file_name + " not found")
 
 if __name__ == '__main__':
-    delete_node_info_from_cp(sys.argv[1])
-    delete_node_info_from_inventory_files(sys.argv[2], sys.argv[3])
+    delete_node_info_from_oim(sys.argv[1])
+    delete_node_info_from_inventory_files(os.path.abspath(sys.argv[2]), sys.argv[1])
diff --git a/utils/roles/node_deletion/delete_node/tasks/delete_nodes.yml b/utils/roles/node_deletion/delete_node/tasks/delete_nodes.yml
index 7356fd588..81679d627 100644
--- a/utils/roles/node_deletion/delete_node/tasks/delete_nodes.yml
+++ b/utils/roles/node_deletion/delete_node/tasks/delete_nodes.yml
@@ -20,7 +20,7 @@
     MANPATH: "{{ xcat_manpath_env }}"
     PERL_BADLANG: "{{ perl_badlang_env }}"
   ansible.builtin.command: |
-    {{ python_version }} {{ delete_node_info }} {{ item.value.node }} {{ inv_file_folder }} {{ item.value.service_tag }}
+    {{ python_version }} {{ delete_node_info }} {{ item.value.node }} {{ inv_file_folder }}
   failed_when: false
   changed_when: false
diff --git a/utils/roles/node_deletion/delete_node/tasks/main.yml b/utils/roles/node_deletion/delete_node/tasks/main.yml
index eb55a87e3..4c28ef6e1 100644
--- a/utils/roles/node_deletion/delete_node/tasks/main.yml
+++ b/utils/roles/node_deletion/delete_node/tasks/main.yml
@@ -37,8 +37,9 @@
   community.postgresql.postgresql_query:
     db: omniadb
     login_user: postgres
-    query: SELECT node,status,admin_ip,service_tag,hostname FROM cluster.nodeinfo where (node!='control_plane') AND (admin_ip='{{ item }}');
+    query: SELECT node,status,admin_ip,service_tag,hostname FROM cluster.nodeinfo where (node!='oim') AND (admin_ip='{{ item }}');
    login_password: 
"{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: ip_query_status with_items: "{{ ip_addresses }}" @@ -48,8 +49,9 @@ community.postgresql.postgresql_query: db: omniadb login_user: postgres - query: SELECT node,status,admin_ip,service_tag,hostname FROM cluster.nodeinfo where (node!='control_plane') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length] + query: SELECT node,status,admin_ip,service_tag,hostname FROM cluster.nodeinfo where (node!='oim') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length] login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: non_ip_query_status with_items: "{{ non_ip_addresses }}" @@ -148,8 +150,9 @@ community.postgresql.postgresql_query: db: omniadb login_user: postgres - query: DELETE FROM cluster.nodeinfo where ('{{ item.value.node }}'<>'control_plane') AND node='{{ item.value.node }}'; + query: DELETE FROM cluster.nodeinfo where ('{{ item.value.node }}'<>'oim') AND node='{{ item.value.node }}'; login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres no_log: true register: query_status diff --git a/utils/roles/node_deletion/delete_node/vars/main.yml b/utils/roles/node_deletion/delete_node/vars/main.yml index 9f39c318f..eaf7ee610 100644 --- a/utils/roles/node_deletion/delete_node/vars/main.yml +++ b/utils/roles/node_deletion/delete_node/vars/main.yml @@ -21,8 +21,7 @@ omnia_telemetry_file: "/opt/omnia/telemetry/omnia_telemetry" omnia_telemetry_servicepath: "/etc/systemd/omnia_telemetry.service" delete_node_info: "{{ role_path }}/files/delete_node_info.py" inv_file_folder: /opt/omnia/omnia_inventory/ -python_version: "python3.9" - +python_version: "{{ ansible_python_interpreter }}" xcat_root_env: "/opt/xcat" xcat_path_env: "/opt/xcat/bin:/opt/xcat/sbin:/opt/xcat/share/xcat/tools" xcat_manpath_env: "/opt/xcat/share/man:$MANPATH" diff --git a/utils/roles/control_plane_cleanup/tasks/clean_local_repo_setup.yml b/utils/roles/oim_cleanup/tasks/clean_local_repo_setup.yml similarity index 89% rename from utils/roles/control_plane_cleanup/tasks/clean_local_repo_setup.yml rename to utils/roles/oim_cleanup/tasks/clean_local_repo_setup.yml index a169cacc6..c6b2d0ed6 100644 --- a/utils/roles/control_plane_cleanup/tasks/clean_local_repo_setup.yml +++ b/utils/roles/oim_cleanup/tasks/clean_local_repo_setup.yml @@ -13,12 +13,12 @@ # limitations under the License. 
--- -- name: Set control_plane_os +- name: Set oim_os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" -- name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml" +- name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml" - name: Stop nerdctl-registry service ansible.builtin.service: @@ -65,8 +65,8 @@ name: containerd.io state: absent disable_gpg_check: true - when: control_plane_os in control_plane_os_redhat or - control_plane_os in control_plane_os_rocky + when: oim_os in oim_os_redhat or + oim_os in oim_os_rocky - name: Remove containerd.io package -Ubuntu ansible.builtin.apt: @@ -75,7 +75,7 @@ autoremove: true purge: true failed_when: false - when: control_plane_os in control_plane_os_ubuntu + when: oim_os in oim_os_ubuntu - name: Remove nerdctl and containerd files ansible.builtin.file: diff --git a/utils/roles/control_plane_cleanup/tasks/clean_nfs_server.yml b/utils/roles/oim_cleanup/tasks/clean_nfs_server.yml similarity index 100% rename from utils/roles/control_plane_cleanup/tasks/clean_nfs_server.yml rename to utils/roles/oim_cleanup/tasks/clean_nfs_server.yml diff --git a/utils/roles/control_plane_cleanup/tasks/clean_provision_setup.yml b/utils/roles/oim_cleanup/tasks/clean_provision_setup.yml similarity index 80% rename from utils/roles/control_plane_cleanup/tasks/clean_provision_setup.yml rename to utils/roles/oim_cleanup/tasks/clean_provision_setup.yml index 1fa33cd78..45463e40a 100644 --- a/utils/roles/control_plane_cleanup/tasks/clean_provision_setup.yml +++ b/utils/roles/oim_cleanup/tasks/clean_provision_setup.yml @@ -13,12 +13,12 @@ # limitations under the License. 
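The node-deletion and cleanup flows above query cluster.nodeinfo as the postgres user (now with explicit privilege escalation). For manual debugging on the Omnia Infrastructure Manager, roughly the same lookup can be issued by hand; the node name below is an example:

#!/bin/bash
# Inspect a node's record the way delete_node does before removing it.
NODE=node001    # example node identifier
sudo -u postgres psql -d omniadb -c \
    "SELECT node, status, admin_ip, service_tag, hostname
       FROM cluster.nodeinfo
      WHERE node != 'oim' AND node = '${NODE}';"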
--- -- name: Set control_plane_os +- name: Set oim_os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" -- name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml" +- name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml" - name: Include metadata.yml file ansible.builtin.include_vars: "{{ provision_metadata_path }}" @@ -34,43 +34,43 @@ changed_when: false - name: Cleanup node info from dhcp - ansible.builtin.command: makedhcp -d -a + ansible.builtin.command: "{{ xcat_sbin_path }}/makedhcp -d -a" when: ("running" in xcat_status.stdout) failed_when: false changed_when: false - name: Fetch all the osimage - ansible.builtin.command: lsdef -t osimage + ansible.builtin.command: "{{ xcat_path }}/lsdef -t osimage" register: os_image_list failed_when: false changed_when: false - name: Remove the OS images - ansible.builtin.command: rmdef -t osimage {{ item.split(' ')[0] }} + ansible.builtin.command: "{{ xcat_path }}/rmdef -t osimage {{ item.split(' ')[0] }}" with_items: "{{ os_image_list.stdout_lines }}" failed_when: false changed_when: false - name: Clean host entries of switch - ansible.builtin.command: makehosts -d switch + ansible.builtin.command: "{{ xcat_sbin_path }}/makehosts -d switch" when: ("running" in xcat_status.stdout) failed_when: false changed_when: false - name: Clean host entries of nodes - ansible.builtin.command: makehosts -d all + ansible.builtin.command: "{{ xcat_sbin_path }}/makehosts -d all" when: ("running" in xcat_status.stdout) failed_when: false changed_when: false - name: Cleanup node info from tftpboot - ansible.builtin.command: rmdef all + ansible.builtin.command: "{{ xcat_path }}/rmdef all" when: ("running" in xcat_status.stdout) failed_when: false changed_when: false - name: Cleanup node info from DNS - ansible.builtin.command: makedns -n + ansible.builtin.command: "{{ xcat_sbin_path }}/makedns -n" when: ("running" in xcat_status.stdout) failed_when: false changed_when: false @@ -125,8 +125,8 @@ state: absent failed_when: false when: - - control_plane_os in control_plane_os_redhat or - control_plane_os in control_plane_os_rocky + - oim_os in oim_os_redhat or + oim_os in oim_os_rocky - name: Remove xCAT and postgres packages - ubuntu ansible.builtin.apt: @@ -135,7 +135,7 @@ autoremove: true purge: true failed_when: false - when: control_plane_os in control_plane_os_ubuntu + when: oim_os in oim_os_ubuntu - name: Remove postgres directory ansible.builtin.file: @@ -149,6 +149,18 @@ with_items: "{{ xcat_download_files }}" tags: downloads + - name: Remove /etc/exports entries + ansible.builtin.lineinfile: + path: "{{ exports_path }}" + regexp: "{{ item }}" + state: absent + with_items: "{{ exports_regexp }}" + + - name: Exporting the shared directories + ansible.builtin.command: /usr/sbin/exportfs -r + changed_when: true + failed_when: false + - name: Create omnia folder in opt folder ansible.builtin.file: path: "{{ omnia_dir }}" @@ -177,7 +189,7 @@ ansible.builtin.command: dnf remove epel-release -y changed_when: true failed_when: false - when: control_plane_os_redhat in control_plane_os + when: oim_os_redhat in oim_os - name: Stop Squid service ansible.builtin.service: @@ -190,8 +202,8 @@ ansible.builtin.yum: name: squid state: absent - when: control_plane_os in control_plane_os_redhat or - control_plane_os in control_plane_os_rocky + when: oim_os in oim_os_redhat or + 
oim_os in oim_os_rocky - name: Remove squid package - Ubuntu ansible.builtin.apt: @@ -200,4 +212,4 @@ autoremove: true purge: true failed_when: false - when: control_plane_os in control_plane_os_ubuntu + when: oim_os in oim_os_ubuntu diff --git a/utils/roles/oim_cleanup/tasks/clean_telemetry_setup.yml b/utils/roles/oim_cleanup/tasks/clean_telemetry_setup.yml new file mode 100644 index 000000000..dba3c7ead --- /dev/null +++ b/utils/roles/oim_cleanup/tasks/clean_telemetry_setup.yml @@ -0,0 +1,231 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Set oim_os + ansible.builtin.set_fact: + oim_os: "{{ ansible_distribution | lower }}" + +- name: Reset kubeadm + ansible.builtin.command: kubeadm reset --cri-socket={{ crio_socket }} -f + changed_when: false + failed_when: false + +- name: Stop kubelet and etcd service + ansible.builtin.service: + name: "{{ item }}" + state: stopped + with_items: "{{ k8s_services }}" + failed_when: false + +- name: Remove kubelet and etcd service files + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ k8s_service_files }}" + failed_when: false + +- name: Check if crictl is present + ansible.builtin.stat: + path: "{{ bin_dir }}/crictl" + get_attributes: false + get_checksum: false + get_mime: false + register: crictl + +- name: Remove crictl containers + when: crictl.stat.exists + block: + - name: Stop all cri containers + ansible.builtin.shell: "set -o pipefail && {{ bin_dir }}/crictl ps -q | xargs -r {{ bin_dir }}/crictl -t 60s stop -t {{ grace_period }}" + args: + executable: /bin/bash + register: remove_all_cri_containers + retries: 5 + until: remove_all_cri_containers.rc == 0 + delay: 5 + failed_when: false + changed_when: true + + - name: Force remove all cri containers + ansible.builtin.command: "{{ bin_dir }}/crictl rm -a -f" + register: remove_all_cri_containers + retries: 5 + until: remove_all_cri_containers.rc == 0 + delay: 5 + failed_when: false + changed_when: true + + - name: Stop all cri pods + ansible.builtin.shell: "set -o pipefail && {{ bin_dir }}/crictl pods -q | xargs -r {{ bin_dir }}/crictl -t 60s stop" + args: + executable: /bin/bash + register: remove_all_cri_containers + retries: 5 + until: remove_all_cri_containers.rc == 0 + delay: 5 + failed_when: false + changed_when: true + + - name: Force remove all cri pods + ansible.builtin.command: "{{ bin_dir }}/crictl rmp -a -f" + register: remove_all_cri_containers + retries: 5 + until: remove_all_cri_containers.rc == 0 + delay: 5 + failed_when: false + changed_when: true + rescue: + - name: Force remove all cri pods (rescue) + ansible.builtin.shell: "set -o pipefail && ip netns list | cut -d' ' -f 1 | xargs -n1 ip netns delete && {{ bin_dir }}/crictl rmp -a -f" + failed_when: false + changed_when: true + +- name: Remove k8s bin file + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ k8s_bin_files }}" + failed_when: false + +- name: Remove k8s config file + 
ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ k8s_del_files }}" + failed_when: false + +- name: Get container ID for buildkitd + ansible.builtin.shell: > + set -o pipefail + && nerdctl ps -q -a -f name=buildkitd | head -n 1 + register: buildkit_container_id + changed_when: false + failed_when: false + +- name: Stop buildkitd container + ansible.builtin.command: nerdctl stop {{ buildkit_container_id.stdout }} + when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' + changed_when: false + failed_when: false + +- name: Remove buildkitd container + ansible.builtin.command: nerdctl rm {{ buildkit_container_id.stdout }} + when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != '' + changed_when: false + failed_when: false + +- name: Remove runc package + ansible.builtin.package: + name: runc + state: absent + failed_when: false + +- name: Include telemetry_config file + ansible.builtin.include_tasks: include_telemetry_config.yml + +- name: Remove grafana persistent data + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ grafana_folders }}" + +- name: Remove telemetry github data + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ telemetry_folders }}" + +- name: Remove telemetry database persistent data + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ database_folders }}" + tags: database + +- name: Remove metallb data + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ metallb_files }}" + +- name: Stop docker service + ansible.builtin.service: + name: docker.service + state: stopped + enabled: false + failed_when: false + +- name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + changed_when: false + failed_when: false + +- name: Remove docker packages for RHEL/Rocky + ansible.builtin.command: dnf remove -y {{ docker_packages }} + changed_when: true + failed_when: false + when: oim_os in oim_os_redhat or + oim_os in oim_os_rocky + +- name: Remove docker packages for Ubuntu + ansible.builtin.command: apt remove -y {{ docker_packages }} + changed_when: true + failed_when: false + when: oim_os in oim_os_ubuntu + +- name: Remove docker files + ansible.builtin.file: + path: "{{ item }}" + state: absent + with_items: "{{ docker_del_files }}" + +- name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + failed_when: false + +- name: Remove docker0 interface from firewall + ansible.builtin.command: firewall-cmd --remove-interface docker0 + changed_when: true + failed_when: false + +- name: Remove docker0 nic + ansible.builtin.command: ip link delete docker0 + changed_when: true + failed_when: false + +- name: Free Omnia Infrastructure Manager k8s by killing related processes + block: + - name: Check for Kubernetes-related processes occupying ports + ansible.builtin.shell: > + set -o pipefail && \ + netstat -tulpn | grep 'kube' + register: kube_processes + changed_when: false + failed_when: false + + - name: Extract PIDs of Kubernetes processes using ports + ansible.builtin.set_fact: + kube_pids: "{{ kube_processes.stdout_lines | map('regex_search', '\\s(\\d+)/', '\\1') | flatten }}" + when: kube_processes.stdout_lines | default("", true) | length > 1 + failed_when: false + + - name: Kill Kubernetes processes to free the ports + ansible.builtin.command: "kill -9 {{ item }}" + with_items: "{{ kube_pids }}" + when: kube_pids | default("", true) | length > 1 + changed_when: 
false + failed_when: false diff --git a/upgrade/roles/backup_telemetry/tasks/include_telemetry_config.yml b/utils/roles/oim_cleanup/tasks/include_telemetry_config.yml similarity index 100% rename from upgrade/roles/backup_telemetry/tasks/include_telemetry_config.yml rename to utils/roles/oim_cleanup/tasks/include_telemetry_config.yml diff --git a/utils/roles/control_plane_cleanup/tasks/main.yml b/utils/roles/oim_cleanup/tasks/main.yml similarity index 100% rename from utils/roles/control_plane_cleanup/tasks/main.yml rename to utils/roles/oim_cleanup/tasks/main.yml diff --git a/utils/roles/control_plane_cleanup/vars/main.yml b/utils/roles/oim_cleanup/vars/main.yml similarity index 65% rename from utils/roles/control_plane_cleanup/vars/main.yml rename to utils/roles/oim_cleanup/vars/main.yml index 30aa50718..cf1c93a07 100644 --- a/utils/roles/control_plane_cleanup/vars/main.yml +++ b/utils/roles/oim_cleanup/vars/main.yml @@ -1,4 +1,4 @@ -# Copyright 2023 Dell Inc. or its subsidiaries. All Rights Reserved. +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ --- # Usage: main.yml -control_plane_os_redhat: "redhat" -control_plane_os_rocky: "rocky" -control_plane_os_ubuntu: "ubuntu" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" +oim_os_ubuntu: "ubuntu" # Usage: clean_provision_setup.yml xcat_packages_path: "/opt/xcat/share/xcat/tools/go-xcat" @@ -55,23 +55,23 @@ xcat_del_files: - "/opt/omnia/static.stanzas" - "/opt/omnia/discover.stanzas" - "/opt/omnia/dynamic_ip_list" - - /tmp/certs.d/ - - /tmp/conf - - /tmp/repos + - "/opt/omnia/syncfiles" - /opt/omnia/telemetry - /opt/omnia/temp_pxe_file.csv - /opt/omnia/.postgres/ - /opt/omnia/.bcm_roce + - /opt/omnia/offline/local_repo_access.yml omnia_dir: "/opt/omnia" -file_permission: "754" - -# Usage:clean_telemetry_setup.yml +file_permission: "755" +xcat_path: /opt/xcat/bin +xcat_sbin_path: /opt/xcat/sbin exports_path: /etc/exports exports_regexp: - "/install" - "/tftpboot" - - "/var/nfs_repo" + +# Usage:clean_telemetry_setup.yml crio_socket: /var/run/crio/crio.sock grafana_folders: - "{{ mount_location }}grafana" @@ -86,12 +86,65 @@ metallb_files: - "/var/lib/ipaddresspool.yaml" - "/var/lib/l2advertisement.yaml" bin_dir: /usr/local/bin +usr_bin_dir: /usr/bin grace_period: 0 docker_packages: "docker-ce docker-ce-cli docker-buildx-plugin" docker_del_files: - /var/lib/docker - /etc/yum.repos.d/docker-ce.repo -kube_folder_path: /root/.kube + - /etc/systemd/system/docker.service.d/http-proxy.conf + - /etc/apt/sources.list.d/docker.list +k8s_bin_files: + - "{{ bin_dir }}/kubelet" + - "{{ bin_dir }}/kubectl" + - "{{ bin_dir }}/crictl" + - "{{ bin_dir }}/etcd" + - "{{ bin_dir }}/calicoctl" + - "{{ bin_dir }}/kubeadm" + - "{{ bin_dir }}/calicoctl.sh" + - "{{ bin_dir }}/etcdctl" + - "{{ bin_dir }}/etcdctl.sh" + - "{{ bin_dir }}/k8s-certs-renew.sh" + - "{{ bin_dir }}/helm" + - "{{ usr_bin_dir }}/kubelet" + - "{{ usr_bin_dir }}/kubectl" + - "{{ usr_bin_dir }}/crictl" + - "{{ usr_bin_dir }}/etcd" + - "{{ usr_bin_dir }}/calicoctl" + - "{{ usr_bin_dir }}/kubeadm" + - "{{ usr_bin_dir }}/calicoctl.sh" + - "{{ usr_bin_dir }}/etcdctl" + - "{{ usr_bin_dir }}/etcdctl.sh" + - "{{ usr_bin_dir }}/k8s-certs-renew.sh" + - "{{ usr_bin_dir }}/helm" +k8s_service_files: + - /etc/systemd/system/kubelet.service + - /etc/systemd/system/etcd.service +k8s_services: + - kubelet + - etcd +k8s_del_files: + - 
/usr/local/share/ca-certificates/etcd-ca.crt + - /usr/local/share/ca-certificates/kube-ca.crt + - /etc/ssl/certs/etcd-ca.pem + - /etc/ssl/certs/kube-ca.pem + - /etc/pki/ca-trust/source/anchors/etcd-ca.crt + - /etc/pki/ca-trust/source/anchors/kube-ca.crt + - /var/log/calico + - /etc/calico + - /var/lib/kubelet + - /var/lib/etcd + - /run/calico + - /etc/bash_completion.d/kubectl.sh + - /etc/modules-load.d/kubespray-br_netfilter.conf + - /usr/libexec/kubernetes + - /etc/NetworkManager/conf.d/calico.conf + - /etc/NetworkManager/conf.d/k8s.conf + - /root/.helm + - /root/.config/helm + - /root/.cache/helm + - /root/.local/share/helm + - /root/.kube # Usage:include_telemetry_config.yml telemetry_config_file: "{{ role_path }}/../../../input/telemetry_config.yml" @@ -103,7 +156,6 @@ vault_file_perm: '0644' local_repo_config_file: "{{ role_path }}/../../../input/local_repo_config.yml" local_repo_config_syntax_fail_msg: "Failed. Syntax errors present in local_repo_config.yml. Fix errors and re-run playbook again. Common syntax Errors: indentation errors, improper quotes, improper space or tab, missing colon, missing comma etc. " -docker_list_path: "/etc/apt/sources.list.d/docker.list" # Usage: clean_nfs_server.yml storage_config_path: "{{ role_path }}/../../../input/storage_config.yml" diff --git a/utils/roles/control_plane_cleanup/vars/redhat.yml b/utils/roles/oim_cleanup/vars/redhat.yml similarity index 82% rename from utils/roles/control_plane_cleanup/vars/redhat.yml rename to utils/roles/oim_cleanup/vars/redhat.yml index 34d68c9da..a9c71119f 100644 --- a/utils/roles/control_plane_cleanup/vars/redhat.yml +++ b/utils/roles/oim_cleanup/vars/redhat.yml @@ -28,13 +28,19 @@ nerdctl_del_files: - /var/lib/containerd/ - /etc/systemd/system/nerdctl-registry.service - /var/lib/nerdctl/1935db59 - - /tmp/nerdctl + - /opt/cni + - /tmp/nerdctl/ + - /tmp/nerdctl-1.7.4-linux-amd64.tar.gz - /tmp/nerdctl-1.5.0-linux-amd64.tar.gz - /tmp/containerd.io-1.6.16-3.1.el8.x86_64.rpm - /tmp/containerd-rootless-setuptool.sh - /tmp/containerd-rootless.sh + - /etc/omnia_environment + - /etc/systemd/system/containerd.service.d/http-proxy.conf + - /root/.docker/config.json offline_del_files: - - /opt/omnia/offline/ + - /opt/omnia/offline/.data + - /opt/omnia/offline/download_package_status.csv - /opt/omnia/ldap/ - /opt/omnia/nerdctl-registry/ - /etc/cluster.repos.d diff --git a/utils/roles/oim_cleanup/vars/rocky.yml b/utils/roles/oim_cleanup/vars/rocky.yml new file mode 100644 index 000000000..ba2f905fb --- /dev/null +++ b/utils/roles/oim_cleanup/vars/rocky.yml @@ -0,0 +1 @@ +redhat.yml \ No newline at end of file diff --git a/utils/roles/control_plane_cleanup/vars/ubuntu.yml b/utils/roles/oim_cleanup/vars/ubuntu.yml similarity index 82% rename from utils/roles/control_plane_cleanup/vars/ubuntu.yml rename to utils/roles/oim_cleanup/vars/ubuntu.yml index 4fc666fbe..34686b89a 100644 --- a/utils/roles/control_plane_cleanup/vars/ubuntu.yml +++ b/utils/roles/oim_cleanup/vars/ubuntu.yml @@ -25,16 +25,20 @@ xcat_download_files: - /etc/apt/sources.list.d/xcat-core.list - /etc/apt/sources.list.d/xcat-dep.list nerdctl_del_files: - - /tmp/nerdctl-1.5.0-linux-amd64.tar.gz - - /tmp/bin/nerdctl + - /tmp/nerdctl-1.7.4-linux-amd64.tar.gz + - /tmp/nerdctl/ - /opt/cni - /usr/local/bin/nerdctl - /var/lib/containerd/ - /etc/systemd/system/nerdctl-registry.service - /var/lib/nerdctl/1935db59 - /tmp/containerd.io_1.6.16-1_amd64.deb + - /etc/omnia_environment + - /etc/systemd/system/containerd.service.d/http-proxy.conf + - /root/.docker/config.json 
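The crictl tasks in clean_telemetry_setup.yml above drain the container runtime in a fixed order before any files are deleted. A condensed sketch of that order, without the retry loops; the crictl location matches bin_dir and a configured runtime endpoint is assumed:

#!/bin/bash
# Stop containers, force-remove them, stop pod sandboxes, then force-remove
# the sandboxes, mirroring the playbook's stop-then-remove sequence.
BIN=/usr/local/bin
set -o pipefail
"$BIN/crictl" ps -q   | xargs -r "$BIN/crictl" -t 60s stop -t 0
"$BIN/crictl" rm -a -f
"$BIN/crictl" pods -q | xargs -r "$BIN/crictl" -t 60s stop
"$BIN/crictl" rmp -a -f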
offline_del_files: - - /opt/omnia/offline/ + - /opt/omnia/offline/.data + - /opt/omnia/offline/download_package_status.csv - /opt/omnia/ldap/ - /opt/omnia/nerdctl-registry/ - /opt/containerd diff --git a/utils/roles/remove_cluster/remove_slurm_cluster/tasks/main.yml b/utils/roles/remove_cluster/remove_slurm_cluster/tasks/main.yml index d8ede2f8e..010582c0f 100644 --- a/utils/roles/remove_cluster/remove_slurm_cluster/tasks/main.yml +++ b/utils/roles/remove_cluster/remove_slurm_cluster/tasks/main.yml @@ -57,8 +57,14 @@ - name: Remove Slurm user ansible.builtin.user: name: slurm + uid: "{{ slurm_uid }}" + group: slurm state: absent - failed_when: false + force: true + register: remove_slurm_user_result + retries: "{{ slurm_user_deletion }}" + delay: "{{ slurm_user_deletion_retry_delay_seconds }}" + until: remove_slurm_user_result is success - name: Remove Slurm packages ansible.builtin.package: @@ -70,23 +76,9 @@ - name: Remove slurmd service ansible.builtin.file: state: absent - path: "{{ slurmd_servicepath }}" - ignore_errors: true # noqa: ignore-errors - when: inventory_hostname in groups['slurm_node'] - - - name: Remove slurmdbd service - ansible.builtin.file: - state: absent - path: "{{ slurmdbd_servicepath }}" - ignore_errors: true # noqa: ignore-errors - when: inventory_hostname in groups['slurm_control_node'] - - - name: Remove slurmctld service - ansible.builtin.file: - state: absent - path: "{{ slurmctld_servicepath }}" - ignore_errors: true # noqa: ignore-errors - when: inventory_hostname in groups['slurm_control_node'] + path: "{{ item }}" + with_items: "{{ slurm_services_files }}" + failed_when: false - name: Remove Slurm common configuration files ansible.builtin.file: @@ -114,6 +106,12 @@ - hostvars['localhost']['slurm_installation_type'] | length > 0 - hostvars['localhost']['slurm_installation_type'] == "nfs_share" block: + - name: Remove SLURM_CONF variable + ansible.builtin.lineinfile: + state: absent + path: "{{ bashrc_path }}" + regexp: '^export SLURM_CONF=.*$' + - name: Get NFS share path from storage_config.yml ansible.builtin.set_fact: share_path: "{{ hostvars['localhost']['nfs_client_params'] @@ -123,21 +121,30 @@ - hostvars['localhost']['nfs_client_params'] is defined and hostvars['localhost']['nfs_client_params'] | length > 0 - hostvars['localhost']['nfs_client_params'] | selectattr('slurm_share', 'defined') | selectattr('slurm_share', 'equalto', true) | list | length == 1 # noqa: yaml[line-length] - - name: Check whether slurm folder exists in NFS share + - name: Remove slurm related files from NFS share when: share_path is defined - ansible.builtin.stat: - path: "{{ share_path }}{{ slurm_nfs_folder }}" - register: file_stat - - - name: Remove slurm directory - when: share_path is defined and file_stat.stat.exists - ansible.builtin.file: - path: "{{ share_path }}{{ slurm_nfs_folder }}" - state: absent - failed_when: false - - - name: Remove SLURM_CONF variable - ansible.builtin.lineinfile: - state: absent - path: "{{ bashrc_path }}" - regexp: '^export SLURM_CONF=.*$' + block: + - name: Check whether slurm folder exists in NFS share + ansible.builtin.stat: + path: "{{ share_path }}{{ slurm_nfs_folder }}" + register: file_stat + + - name: Remove slurm directory + when: file_stat.stat.exists + ansible.builtin.file: + path: "{{ share_path }}{{ slurm_nfs_folder }}" + state: absent + failed_when: false + + - name: Remove NFS share path from LD_LIBRARY_PATH + ansible.builtin.replace: + path: '{{ bashrc_path }}' + regexp: '^(export LD_LIBRARY_PATH=.*?)({{ share_path }}{{ 
slurm_nfs_folder }}/usr/lib64/:|:{{ share_path }}{{ slurm_nfs_folder }}/usr/lib64/:|{{ share_path }}{{ slurm_nfs_folder }}/usr/lib64/)(.*?)' # noqa: yaml[line-length]
+        replace: '\1\3'
+        failed_when: false
+
+      - name: Remove LD_LIBRARY_PATH if it is empty
+        ansible.builtin.lineinfile:
+          path: "{{ bashrc_path }}"
+          regexp: '^export LD_LIBRARY_PATH=$'
+          state: absent
diff --git a/utils/roles/remove_cluster/remove_slurm_cluster/vars/main.yml b/utils/roles/remove_cluster/remove_slurm_cluster/vars/main.yml
index b11426b0e..3c6bc93cd 100644
--- a/utils/roles/remove_cluster/remove_slurm_cluster/vars/main.yml
+++ b/utils/roles/remove_cluster/remove_slurm_cluster/vars/main.yml
@@ -17,10 +17,10 @@
 slurm_inv_fail_msg: "Required slurm groups are not defined in inventory. Please check inventory format"
 slurm_not_exists: "Slurm services does not exist"
 slurm_nfs_folder: "/slurm"
-slurmd_servicepath: /usr/lib/systemd/system/slurmd.service
-slurmdbd_servicepath: /etc/systemd/system/slurmdbd.service
-slurmctld_servicepath: /etc/systemd/system/slurmctld.service
 bashrc_path: /etc/bashrc
+slurm_uid: "6001"
+slurm_user_deletion: 3
+slurm_user_deletion_retry_delay_seconds: 10
 
 check_slurm_services:
   - slurmctld
@@ -46,6 +46,14 @@ slurm_packages:
   - mariadb-server
   - mariadb-devel
 
+slurm_services_files:
+  - /usr/lib/systemd/system/slurmd.service
+  - /usr/lib/systemd/system/slurmdbd.service
+  - /usr/lib/systemd/system/slurmctld.service
+  - /etc/systemd/system/slurmd.service
+  - /etc/systemd/system/slurmdbd.service
+  - /etc/systemd/system/slurmctld.service
+
 slurm_common_files:
   - /etc/slurm
   - /var/spool/mail/slurm
diff --git a/utils/roles/remove_node/remove_kube_node/tasks/main.yml b/utils/roles/remove_node/remove_kube_node/tasks/main.yml
index cd0f90b73..8831b49c3 100644
--- a/utils/roles/remove_node/remove_kube_node/tasks/main.yml
+++ b/utils/roles/remove_node/remove_kube_node/tasks/main.yml
@@ -32,8 +32,9 @@
   community.postgresql.postgresql_query:
     db: omniadb
     login_user: postgres
-    query: SELECT status FROM cluster.nodeinfo where (node!='control_plane') AND (admin_ip='{{ item }}');
+    query: SELECT status FROM cluster.nodeinfo where (node!='oim') AND (admin_ip='{{ item }}');
     login_password: "{{ hostvars['localhost']['postgresdb_password'] }}"
+  become: true
   become_user: postgres
   register: ctrl_ip_query_status
   with_items: "{{ groups['kube_control_plane'] | first }}"
@@ -51,8 +52,9 @@
   community.postgresql.postgresql_query:
     db: omniadb
     login_user: postgres
-    query: SELECT status FROM cluster.nodeinfo where (node!='control_plane') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length]
+    query: SELECT status FROM cluster.nodeinfo where (node!='oim') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length]
     login_password: "{{ hostvars['localhost']['postgresdb_password'] }}"
+  become: true
   become_user: postgres
   register: ctrl_non_ip_query_status
   with_items: "{{ groups['kube_control_plane'] | first }}"
@@ -118,8 +120,9 @@
   community.postgresql.postgresql_query:
     db: "{{ omnia_db_name }}"
     login_user: "{{ db_user }}"
-    query: SELECT admin_ip, service_tag, hostname, node, status FROM cluster.nodeinfo where (node!='control_plane') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item
}}'); # noqa: yaml[line-length] login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: non_ip_query_status with_items: "{{ non_ip_addresses }}" @@ -129,8 +132,9 @@ community.postgresql.postgresql_query: db: "{{ omnia_db_name }}" login_user: "{{ db_user }}" - query: SELECT admin_ip, service_tag, hostname, node, status FROM cluster.nodeinfo where (node!='control_plane') AND admin_ip='{{ item }}'; + query: SELECT admin_ip, service_tag, hostname, node, status FROM cluster.nodeinfo where (node!='oim') AND admin_ip='{{ item }}'; login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: ip_query_status with_items: "{{ ip_addresses }}" diff --git a/utils/roles/remove_node/remove_slurm_node/tasks/main.yml b/utils/roles/remove_node/remove_slurm_node/tasks/main.yml index 03ffa833a..bc17edaed 100644 --- a/utils/roles/remove_node/remove_slurm_node/tasks/main.yml +++ b/utils/roles/remove_node/remove_slurm_node/tasks/main.yml @@ -124,8 +124,16 @@ when: status == 'booted' ansible.builtin.user: name: slurm + uid: "{{ slurm_uid }}" + group: slurm state: absent + force: true + register: remove_slurm_user_result + retries: "{{ slurm_user_deletion }}" + delay: "{{ slurm_user_deletion_retry_delay_seconds }}" + until: remove_slurm_user_result is success failed_when: false + ignore_errors: true - name: Remove Slurm packages when: status == 'booted' @@ -135,10 +143,12 @@ with_items: "{{ slurm_packages }}" failed_when: false -- name: Remove slurmd service +- name: Remove slurm service ansible.builtin.file: state: absent - path: "{{ slurmd_servicepath }}" + path: "{{ item }}" + with_items: "{{ slurm_services_files }}" + failed_when: false - name: Remove Slurm common configuration files when: status == 'booted' @@ -160,13 +170,31 @@ with_items: "{{ slurm_configless_files }}" failed_when: false -- name: Remove SLURM_CONF variable if slurm is installed on NFS shared +- name: Remove environment variables related to NFS share mode if slurm is installed in nfs_share mode when: - share_path is defined - hostvars['localhost']['slurm_installation_type'] is defined - hostvars['localhost']['slurm_installation_type'] | length > 0 - hostvars['localhost']['slurm_installation_type'] == "nfs_share" - ansible.builtin.lineinfile: - state: absent - path: "{{ bashrc_path }}" - regexp: '^export SLURM_CONF=.*$' + block: + - name: Remove SLURM_CONF variable + ansible.builtin.lineinfile: + state: absent + path: "{{ bashrc_path }}" + regexp: '^export SLURM_CONF=.*$' + + - name: Remove NFS share path from LD_LIBRARY_PATH + ansible.builtin.replace: + path: '{{ bashrc_path }}' + regexp: '^(export LD_LIBRARY_PATH=.*?)({{ share_path }}{{ slurm_nfs_folder }}/usr/lib64/:|:{{ share_path }}{{ slurm_nfs_folder }}/usr/lib64/:|{{ share_path }}{{ slurm_nfs_folder }}/usr/lib64/)(.*?)' # noqa: yaml[line-length] + replace: '\1\3' + failed_when: false + changed_when: false + + - name: Clean empty LD_LIBRARY_PATH + ansible.builtin.lineinfile: + path: "{{ bashrc_path }}" + regexp: '^export LD_LIBRARY_PATH=$' + state: absent + failed_when: false + changed_when: false diff --git a/utils/roles/remove_node/remove_slurm_node/vars/main.yml b/utils/roles/remove_node/remove_slurm_node/vars/main.yml index e0013b9dd..b9e961aa5 100644 --- a/utils/roles/remove_node/remove_slurm_node/vars/main.yml +++ b/utils/roles/remove_node/remove_slurm_node/vars/main.yml @@ -16,11 +16,13 @@ # Usage: main.yml change_state_retries: 3 change_state_retry_delay_seconds: 30 
+slurm_user_deletion: 3 +slurm_user_deletion_retry_delay_seconds: 10 slurm_conf_path: /etc/slurm/slurm.conf slurm_nfs_folder: /slurm bashrc_path: /etc/bashrc -slurmd_servicepath: /usr/lib/systemd/system/slurmd.service +slurm_uid: "6001" slurm_services: - slurmd @@ -37,6 +39,10 @@ slurm_packages: - mariadb-server - mariadb-devel +slurm_services_files: + - /usr/lib/systemd/system/slurmd.service + - /etc/systemd/system/slurmd.service + slurm_common_files: - /etc/slurm - /var/spool/mail/slurm diff --git a/utils/roles/remove_node/validate_slurm_node/tasks/main.yml b/utils/roles/remove_node/validate_slurm_node/tasks/main.yml index eaa044662..696d90123 100644 --- a/utils/roles/remove_node/validate_slurm_node/tasks/main.yml +++ b/utils/roles/remove_node/validate_slurm_node/tasks/main.yml @@ -32,8 +32,9 @@ community.postgresql.postgresql_query: db: omniadb login_user: postgres - query: SELECT status FROM cluster.nodeinfo where (node!='control_plane') AND (admin_ip='{{ item }}'); + query: SELECT status FROM cluster.nodeinfo where (node!='oim') AND (admin_ip='{{ item }}'); login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: ctrl_ip_query_status with_items: "{{ groups['slurm_control_node'] | first }}" @@ -51,8 +52,9 @@ community.postgresql.postgresql_query: db: omniadb login_user: postgres - query: SELECT status FROM cluster.nodeinfo where (node!='control_plane') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length] + query: SELECT status FROM cluster.nodeinfo where (node!='oim') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length] login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: ctrl_non_ip_query_status with_items: "{{ groups['slurm_control_node'] | first }}" @@ -87,8 +89,9 @@ community.postgresql.postgresql_query: db: omniadb login_user: postgres - query: SELECT hostname,status,admin_ip FROM cluster.nodeinfo where (node!='control_plane') AND (admin_ip='{{ item }}'); + query: SELECT hostname,status,admin_ip FROM cluster.nodeinfo where (node!='oim') AND (admin_ip='{{ item }}'); login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: ip_query_status with_items: "{{ ip_addresses }}" @@ -98,8 +101,9 @@ community.postgresql.postgresql_query: db: omniadb login_user: postgres - query: SELECT hostname,status,admin_ip FROM cluster.nodeinfo where (node!='control_plane') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length] + query: SELECT hostname,status,admin_ip FROM cluster.nodeinfo where (node!='oim') AND (service_tag='{{ item | upper }}' OR node='{{ item }}' OR hostname='{{ item }}'); # noqa: yaml[line-length] login_password: "{{ hostvars['localhost']['postgresdb_password'] }}" + become: true become_user: postgres register: non_ip_query_status with_items: "{{ non_ip_addresses }}" diff --git a/utils/roles/rocm_installation/tasks/install_rocm.yml b/utils/roles/rocm_installation/tasks/install_rocm.yml index 90817a129..6d7f28bbf 100644 --- a/utils/roles/rocm_installation/tasks/install_rocm.yml +++ b/utils/roles/rocm_installation/tasks/install_rocm.yml @@ -13,28 +13,64 @@ # limitations under the License. 
--- -- name: Install ROCm packages - ansible.builtin.package: - name: "{{ rocm_packages }}" - state: present - -- name: Check if environment variables are set - ansible.builtin.command: echo $PATH - changed_when: false - failed_when: false - register: path_output - -- name: Perform Post Installation steps - when: "'rocm' not in path_output.stdout" +- name: Verify Repo and Install ROCm packages block: - - name: Check current environment variables - ansible.builtin.shell: echo $PATH + - name: Load local_repo_access.yml file + ansible.builtin.include_vars: "{{ local_repo_access_path }}" + + - name: Check if the ROCm preference source file exists + ansible.builtin.stat: + path: "{{ rocm_prefrence_src }}" + register: rocm_preference_src_stat + + - name: Create ROCm preference file + ansible.builtin.template: + src: "{{ rocm_prefrence_src }}" + dest: "{{ rocm_prefrence_dst }}" + mode: "{{ prefrence_file_mode }}" + when: + - ansible_distribution | lower in ubuntu_os + - not rocm_preference_src_stat.stat.exists + + - name: Install ROCm packages + ansible.builtin.package: + name: "{{ rocm_packages }}" + state: present + + - name: Check if environment variables are set + ansible.builtin.command: echo $PATH changed_when: false - register: environment_output - - - name: Replace PATH variable - ansible.builtin.lineinfile: - path: /root/.bashrc - regexp: '^PATH=*' - insertafter: EOF - line: 'PATH={{ environment_output.stdout }}:/opt/rocm/bin/' + failed_when: false + register: path_output + + - name: Perform Post Installation steps + when: "'rocm' not in path_output.stdout" + block: + - name: Check current environment variables + ansible.builtin.shell: echo $PATH + changed_when: false + register: environment_output + + - name: Replace PATH variable + ansible.builtin.lineinfile: + path: /root/.bashrc + regexp: '^PATH=*' + insertafter: EOF + line: 'PATH={{ environment_output.stdout }}:/opt/rocm/bin/' + + - name: Ensure rocm.conf exists and add library paths + ansible.builtin.copy: + dest: "{{ linker_dest_path }}" + content: | + /opt/rocm/lib + /opt/rocm/lib64 + mode: "{{ file_permission }}" + + - name: Run ldconfig to update dynamic linker bindings + ansible.builtin.command: ldconfig + changed_when: false + rescue: + - name: Warning, rocm repo not configured + ansible.builtin.pause: + prompt: "{{ rocm_warning_msg }}" + seconds: "{{ warning_time }}" diff --git a/utils/roles/rocm_installation/templates/rocm_preferences_ubuntu.j2 b/utils/roles/rocm_installation/templates/rocm_preferences_ubuntu.j2 new file mode 100644 index 000000000..1baa52c47 --- /dev/null +++ b/utils/roles/rocm_installation/templates/rocm_preferences_ubuntu.j2 @@ -0,0 +1,3 @@ +Package: rocm* +Pin: origin {{ admin_nic_ip }} +Pin-Priority: 600 \ No newline at end of file diff --git a/utils/roles/rocm_installation/vars/main.yml b/utils/roles/rocm_installation/vars/main.yml index b2b5e72da..8884dcf71 100644 --- a/utils/roles/rocm_installation/vars/main.yml +++ b/utils/roles/rocm_installation/vars/main.yml @@ -14,8 +14,17 @@ --- # Used: install_rocm.yml +local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml" +rocm_prefrence_dst: "/etc/apt/preferences.d/rocm-pin-600" +rocm_prefrence_src: "rocm_preferences_ubuntu.j2" +ubuntu_os: "ubuntu" +prefrence_file_mode: '0644' rocm_packages: - - "rocm-hip-sdk{{ hostvars['127.0.0.1']['rocm_version'] }}*" + - "rocm" amdgpu_warning_msg: "ROCm will not be installed, AMDGPU drivers not found on the node. Run local_repo.yml with amdgpu software stack in software_config and reprovision the node."
warning_time: 10 +rocm_warning_msg: "Unable to install ROCm on {{ ansible_host }} node. ROCm repository not configured on the node. +Run local_repo.yml with rocm software stack in software_config or ROCm repo in user_repo_url." +file_permission: "0644" +linker_dest_path: "/etc/ld.so.conf.d/rocm.conf" diff --git a/utils/roles/rocm_validation/tasks/main.yml b/utils/roles/rocm_validation/tasks/main.yml index 3683009ff..fb5f49e3d 100644 --- a/utils/roles/rocm_validation/tasks/main.yml +++ b/utils/roles/rocm_validation/tasks/main.yml @@ -15,7 +15,7 @@ - name: Saving distribution of os ansible.builtin.set_fact: - control_plane_os: "{{ ansible_distribution | lower }}" + oim_os: "{{ ansible_distribution | lower }}" - name: Include local_repo variables ansible.builtin.include_tasks: include_local_repo_config.yml diff --git a/utils/roles/rocm_validation/tasks/validate_amd.yml b/utils/roles/rocm_validation/tasks/validate_amd.yml index b2e1d5940..32bf82691 100644 --- a/utils/roles/rocm_validation/tasks/validate_amd.yml +++ b/utils/roles/rocm_validation/tasks/validate_amd.yml @@ -23,8 +23,8 @@ file: "{{ software_config_json_file }}" name: user_config -- name: Include vars for {{ control_plane_os }} - ansible.builtin.include_vars: "{{ role_path }}/vars/{{ control_plane_os }}.yml" +- name: Include vars for {{ oim_os }} + ansible.builtin.include_vars: "{{ role_path }}/vars/{{ oim_os }}.yml" - name: Get rocm status only if amdgpu is present and amd_status is true ansible.builtin.set_fact: @@ -41,8 +41,17 @@ seconds: "{{ warning_time }}" when: not rocm_input_status -- name: Set rocm_config_status +- name: Check if the rocm offline repo exists + ansible.builtin.stat: + path: "{{ offline_rocm_directory }}/rocm/" + register: check_rocm_repo when: rocm_input_status + +- name: Set rocm_config_status + when: + - rocm_input_status + - user_config.repo_config == 'always' or user_config.repo_config == 'partial' + - check_rocm_repo.stat.exists block: - name: Fetch rocm_version ansible.builtin.set_fact: @@ -72,3 +81,22 @@ ansible.builtin.pause: prompt: "{{ rocm_version_msg }}" seconds: "{{ warning_time }}" + +- name: Set rocm_config_status + when: + - rocm_input_status + - user_config.repo_config == 'never' or user_config.repo_config == 'partial' + - not check_rocm_repo.stat.exists + block: + - name: Fetch rocm_version + ansible.builtin.set_fact: + rocm_version: "{{ user_config.amdgpu | selectattr('name', 'equalto', 'rocm') | map(attribute='version') | first }}" + + - name: Set rocm_config_status to true + ansible.builtin.set_fact: + rocm_config_status: true + rescue: + - name: Warning, rocm version not found + ansible.builtin.pause: + prompt: "{{ rocm_version_msg }}" + seconds: "{{ warning_time }}" diff --git a/utils/roles/servicetag_host_mapping/files/servicetag_host_mapping.py b/utils/roles/servicetag_host_mapping/files/servicetag_host_mapping.py index 9ac708bfa..c8af2f2b7 100644 --- a/utils/roles/servicetag_host_mapping/files/servicetag_host_mapping.py +++ b/utils/roles/servicetag_host_mapping/files/servicetag_host_mapping.py @@ -29,19 +29,22 @@ def service_tag_host_mapping(): """ Modifies the inventory files by adding the corresponding host IP for each service tag. - This function iterates through a list of inventory files and - modifies them by adding the host IP for each service tag. + This function iterates through a list of inventory files and + modifies them by adding the host IP for each service tag.
""" try: # Create a database connection connection = omniadb.create_connection() cursor = connection.cursor() - - # Get the list of inventory files - inventory_sources_list = inventory_sources_str[1:-1].split(',') + inventory_sources_list = [] + if inventory_sources_str: + # Get the list of inventory files + inventory_sources_list = inventory_sources_str[1:-1].split(',') # Iterate through all inventory files and modify them for inventory_file_path in inventory_sources_list: + inventory_file_path = os.path.abspath(inventory_file_path.strip("'| ")) + print("inventory_file_path: " + inventory_file_path) # If inventory file don't exist ignore. if not os.path.exists(inventory_file_path) or not os.path.basename(inventory_file_path): @@ -54,42 +57,27 @@ def service_tag_host_mapping(): # Variable to store modified lines result_lines = [] - + lines = [] # Open file in read mode with open(inventory_file_path, "r", encoding='utf-8') as f: - # Read the content of the file lines = f.readlines() + if lines: # Iterate content line by line for line in lines: - line = line.strip().lower() - - if line == 'localhost': - raise ValueError(f"localhost entry is an invalid entry in '{inventory_file_path}'") - - # Check if the line have a service tag, node name or hostname but don't have ansible_host - if line and line[0].isalnum() and "ansible_host=" not in line: - - # Query string: get host IP if service tag or node name is given - query = "select admin_ip from cluster.nodeinfo where service_tag=%s or node=%s" - params = (line.upper(), line) - - # Query execution - cursor.execute(query, params) - row = cursor.fetchone() - - if row: - # Collect host ip if result is valid - host_ip = row[0] - # Append host IP to service tag/node name - line = f"{line} ansible_host={host_ip}" - # Mark content as modified - is_content_modified = True - else: - # Query string get host IP if hostname is given - query = "select admin_ip from cluster.nodeinfo where hostname=%s" - params = (line,) + if 'Categories' not in line: + line = line.strip().lower() + + if line == 'localhost': + raise ValueError(f"localhost entry is an invalid entry in '{inventory_file_path}'") + + # Check if the line have a service tag, node name or hostname but don't have ansible_host + if line and line[0].isalnum() and "ansible_host=" not in line: + + # Query string: get host IP if service tag or node name is given + query = "select admin_ip from cluster.nodeinfo where service_tag=%s or node=%s" + params = (line.upper(), line) # Query execution cursor.execute(query, params) @@ -98,10 +86,26 @@ def service_tag_host_mapping(): if row: # Collect host ip if result is valid host_ip = row[0] - # Append host IP to hostname + # Append host IP to service tag/node name line = f"{line} ansible_host={host_ip}" # Mark content as modified is_content_modified = True + else: + # Query string get host IP if hostname is given + query = "select admin_ip from cluster.nodeinfo where hostname=%s" + params = (line,) + + # Query execution + cursor.execute(query, params) + row = cursor.fetchone() + + if row: + # Collect host ip if result is valid + host_ip = row[0] + # Append host IP to hostname + line = f"{line} ansible_host={host_ip}" + # Mark content as modified + is_content_modified = True # Append service tag string to result lines. 
diff --git a/utils/roles/servicetag_host_mapping/tasks/check_provision_status.yml b/utils/roles/servicetag_host_mapping/tasks/check_provision_status.yml new file mode 100644 index 000000000..4c5480dca --- /dev/null +++ b/utils/roles/servicetag_host_mapping/tasks/check_provision_status.yml @@ -0,0 +1,65 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Set oim_os + ansible.builtin.set_fact: + oim_os: "{{ ansible_distribution | lower }}" + +- name: Initialize variables + ansible.builtin.set_fact: + discovery_provision_status: false + +- name: Gather service facts + ansible.builtin.service_facts: + +- name: Check omnia postgres path exists + ansible.builtin.stat: + path: "{{ omnia_postgres_path }}" + register: postgres_path_check + +- name: Fetch network table entries + ansible.builtin.command: "{{ xcat_path }}/lsdef -t network" + changed_when: false + failed_when: false + register: fetch_network + +- name: Set discovery_provision_status to true for RHEL/Rocky + ansible.builtin.set_fact: + discovery_provision_status: true + when: + - oim_os == oim_os_redhat or oim_os == oim_os_rocky + - xcatd_service in ansible_facts.services + - postgresql_service_rhel in ansible_facts.services + - "'running' in ansible_facts.services[xcatd_service].state" + - "'running' in ansible_facts.services[postgresql_service_rhel].state" + - postgres_path_check.stat.exists + - '"admin_network" in fetch_network.stdout' + +- name: Set discovery_provision_status to true for Ubuntu + ansible.builtin.set_fact: + discovery_provision_status: true + when: + - oim_os == oim_os_ubuntu + - xcatd_service in ansible_facts.services + - postgresql_service_ubuntu in ansible_facts.services + - "'running' in ansible_facts.services[xcatd_service].state" + - "'running' in ansible_facts.services[postgresql_service_ubuntu].state" + - postgres_path_check.stat.exists + - '"admin_network" in fetch_network.stdout' + +- name: Fail if xcatd or postgresql services are not running + ansible.builtin.fail: + msg: "{{ postgres_install_fail_msg }}" + when: not discovery_provision_status diff --git a/utils/roles/servicetag_host_mapping/tasks/main.yml b/utils/roles/servicetag_host_mapping/tasks/main.yml index da2fd46d9..df4714f06 100644 --- a/utils/roles/servicetag_host_mapping/tasks/main.yml +++ b/utils/roles/servicetag_host_mapping/tasks/main.yml @@ -13,20 +13,15 @@ # limitations under the License.
--- -- name: Servicetag Host mapping - ansible.builtin.command: "{{ python }} {{ servicetag_host_mapping_script }} {{ role_path }} {{ file_path }} {{ ansible_inventory_sources }}" - register: servicetag_host_mapping_result - changed_when: true - -- name: Verify Error in Servicetag Host mapping +- name: Inventory not provided ansible.builtin.fail: - msg: "{{ servicetag_host_mapping_result.stdout_lines[-1] }}" - when: servicetag_host_mapping_result is defined and 'Error' in servicetag_host_mapping_result.stdout + msg: "{{ empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) -- name: Clear facts - ansible.builtin.meta: clear_facts - changed_when: false +- name: Check discovery_provision.yml execution status + ansible.builtin.include_tasks: check_provision_status.yml -- name: Refresh inventory - ansible.builtin.meta: refresh_inventory - changed_when: false +- name: Apply service tag mapping for inventory + ansible.builtin.include_tasks: service_tag_mapping.yml diff --git a/utils/roles/servicetag_host_mapping/tasks/service_tag_mapping.yml b/utils/roles/servicetag_host_mapping/tasks/service_tag_mapping.yml new file mode 100644 index 000000000..7e6d299ee --- /dev/null +++ b/utils/roles/servicetag_host_mapping/tasks/service_tag_mapping.yml @@ -0,0 +1,32 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Servicetag Host mapping + ansible.builtin.command: "{{ python }} {{ servicetag_host_mapping_script }} {{ role_path }} {{ file_path }} {{ ansible_inventory_sources | quote }}" + register: servicetag_host_mapping_result + changed_when: true + +- name: Verify Error in Servicetag Host mapping + ansible.builtin.fail: + msg: "{{ servicetag_host_mapping_result.stdout_lines[-1] }}" + when: servicetag_host_mapping_result is defined and 'Error' in servicetag_host_mapping_result.stdout + +- name: Clear facts + ansible.builtin.meta: clear_facts + changed_when: false + +- name: Refresh inventory + ansible.builtin.meta: refresh_inventory + changed_when: false diff --git a/utils/roles/servicetag_host_mapping/vars/main.yml b/utils/roles/servicetag_host_mapping/vars/main.yml index e2190b74f..fa3999da4 100644 --- a/utils/roles/servicetag_host_mapping/vars/main.yml +++ b/utils/roles/servicetag_host_mapping/vars/main.yml @@ -13,6 +13,23 @@ # limitations under the License. --- -python: python3.9 +# Usage: main.yml +empty_inventory_fail_msg: "Failed. Inventory not provided. Re-run the playbook with an inventory using the -i option."
+ +# Usage: service_tag_mapping.yml +python: "{{ ansible_python_interpreter }}" servicetag_host_mapping_script: "{{ role_path }}/files/servicetag_host_mapping.py" file_path: "/../../../discovery/roles/db_operations/files/" + +# Usage: check_provision_status.yml +omnia_postgres_path: /opt/omnia/.postgres +xcatd_service: "xcatd.service" +postgresql_service_rhel: "postgresql.service" +postgresql_service_ubuntu: "postgresql" +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" +oim_os_ubuntu: "ubuntu" +xcat_path: /opt/xcat/bin +postgres_install_fail_msg: | + "Failed. postgresql or xcatd services not running. + Please run discovery_provision.yml to discover and provision nodes first, then rerun the playbook." diff --git a/utils/roles/update_synclist/files/get_nodes_from_inventory.py b/utils/roles/update_synclist/files/get_nodes_from_inventory.py new file mode 100644 index 000000000..40afaef8a --- /dev/null +++ b/utils/roles/update_synclist/files/get_nodes_from_inventory.py @@ -0,0 +1,83 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys, os +import subprocess +import re + +# Paths to the inventory file and database module +inventory_file_paths = sys.argv[2][1:-1].split(',') # Extract the inventory file path correctly +db_path = os.path.abspath(sys.argv[1]) + +sys.path.insert(0, db_path) +import omniadb_connection + +# Read the inventory file line-by-line, ignoring sections like [nodes] +node_identifiers = [] +if inventory_file_paths: + for inventory_file_path in inventory_file_paths: + inventory_file_path = os.path.abspath(inventory_file_path.strip("'| ")) + with open(os.path.abspath(inventory_file_path), "r") as file: + for line in file: + line = line.strip() + if line and not line.startswith("["): + node_identifiers.append(line) + +def is_ip(identifier): + """ + Check if the identifier is a valid IP address. + """ + return re.match(r"^\d{1,3}(\.\d{1,3}){3}$", identifier) is not None + +def fetch_node_name(cursor, identifier): + """ + Fetch the node name by treating the identifier as IP. + """ + sql_query = """ + SELECT node, status + FROM cluster.nodeinfo + WHERE admin_ip = %s + """ + if is_ip(identifier): + cursor.execute(sql_query, (identifier,)) + else: + identifier = identifier.split('ansible_host=')[1] + cursor.execute(sql_query, (identifier,)) + + node_row = cursor.fetchone() + + return node_row[0] if node_row and node_row[1] == 'failed' else None + +def get_nodes_name(): + """ + Retrieve the names of the nodes that are in a failed state from the inventory file. 
+ + """ + + conn = omniadb_connection.create_connection() + cursor = conn.cursor() + node_names = [] + for identifier in node_identifiers: + # Try to fetch the node name based on the identifier + node_name = fetch_node_name(cursor, identifier) + if node_name: + node_names.append(node_name.strip()) + + # Close the cursor and connection + cursor.close() + conn.close() + return node_names + +node_names = get_nodes_name() +print(','.join(set(node_names))) \ No newline at end of file diff --git a/upgrade/roles/uninstall_k8s_cluster/tasks/validation.yml b/utils/roles/update_synclist/files/update_omniadb.py similarity index 51% rename from upgrade/roles/uninstall_k8s_cluster/tasks/validation.yml rename to utils/roles/update_synclist/files/update_omniadb.py index 593a9ea5e..420e4c480 100644 --- a/upgrade/roles/uninstall_k8s_cluster/tasks/validation.yml +++ b/utils/roles/update_synclist/files/update_omniadb.py @@ -11,23 +11,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ---- -- name: Validate kubernets groups - ansible.builtin.fail: - msg: "{{ kube_inv_fail_msg }}" - when: groups['manager'] is not defined or groups['compute'] is not defined +import sys, os +import subprocess -- name: Checking K8s service status - ansible.builtin.systemd: - name: kubelet - register: kubelet_service +db_path = os.path.abspath(sys.argv[1]) +node = sys.argv[2] -- name: Set the k8s installation status - ansible.builtin.set_fact: - k8s_installation_status: true - when: '"not-found" not in kubelet_service.status.LoadState' +sys.path.insert(0, db_path) +import omniadb_connection -- name: Print the k8s installation status - ansible.builtin.debug: - var: k8s_installation_status +def update_node_status(node): + """ + Updates the status of a node in the cluster.nodeinfo table in omniadb. + """ + + conn = omniadb_connection.create_connection() + cursor = conn.cursor() + + update_status_query = """ + UPDATE cluster.nodeinfo + SET status = 'booted' + WHERE node = %s + """ + cursor.execute(update_status_query, (node,)) + cursor.close() + conn.close() + +update_node_status(node) \ No newline at end of file diff --git a/utils/roles/update_synclist/tasks/main.yml b/utils/roles/update_synclist/tasks/main.yml new file mode 100644 index 000000000..b73afea57 --- /dev/null +++ b/utils/roles/update_synclist/tasks/main.yml @@ -0,0 +1,87 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- + +- name: Get node names from inventory + ansible.builtin.command: | + {{ ansible_python_interpreter }} {{ get_nodes_python_script }} {{ db_path }} {{ ansible_inventory_sources | quote }} + changed_when: false + register: node_names + no_log: false + +- name: Update synclist on failed nodes + when: node_names.stdout | select('string') | list | length > 0 + block: + - name: User confirmation for synclist + ansible.builtin.include_tasks: user_confirmation.yml + + - name: Parse node names into a list + ansible.builtin.set_fact: + node_name_list: "{{ node_names.stdout.split(',') | map('trim') | list }}" + + - name: Update synclist on each node + ansible.builtin.command: + cmd: "/opt/xcat/bin/updatenode {{ item }} -F" + loop: "{{ node_name_list }}" + changed_when: true + register: updatenode_result + failed_when: false + when: item | length > 0 # Ensure item is not empty + + - name: Initialize success and failure lists + ansible.builtin.set_fact: + success_nodes: [] + failed_nodes: [] + + - name: Collect success nodes + ansible.builtin.set_fact: + success_nodes: "{{ success_nodes + [item.item] }}" + when: '"failed" not in item.stdout and item.rc == 0' + loop: "{{ updatenode_result.results }}" + + - name: Collect failed nodes and errors + ansible.builtin.set_fact: + failed_nodes: "{{ failed_nodes + [item.item] }}" + when: '"failed" in item.stdout or item.rc != 0' + loop: "{{ updatenode_result.results }}" + + - name: Change node status to booted + ansible.builtin.command: + cmd: "/opt/xcat/bin/chdef {{ item }} status=booted" + loop: "{{ success_nodes }}" + changed_when: true + when: item | length > 0 + + - name: Update omniadb node status to booted for success nodes + ansible.builtin.command: | + {{ ansible_python_interpreter }} {{ update_dp_python_script }} {{ db_path }} {{ item }} + loop: "{{ success_nodes }}" + changed_when: true + no_log: false + when: item | length > 0 + + - name: Display success results + ansible.builtin.debug: + msg: "Update synclist success on nodes: {{ success_nodes | join(', ') }}" + when: success_nodes | length > 0 + + - name: Display failure results + ansible.builtin.debug: + msg: "Update synclist failed on nodes: {{ failed_nodes }}" + when: failed_nodes | length > 0 + +- name: No nodes need a synclist update + ansible.builtin.debug: + msg: "No nodes from the inventory need a synclist update." + when: node_names.stdout | length == 0 diff --git a/utils/roles/update_synclist/tasks/user_confirmation.yml b/utils/roles/update_synclist/tasks/user_confirmation.yml new file mode 100644 index 000000000..5e4bfad17 --- /dev/null +++ b/utils/roles/update_synclist/tasks/user_confirmation.yml @@ -0,0 +1,32 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- + +- name: Warning message for updating synclist + ansible.builtin.pause: + prompt: "{{ warning_msg }}" + seconds: "{{ warning_wait_time }}" + run_once: true + +- name: Confirmation required to proceed + ansible.builtin.pause: + prompt: "{{ confirmation_msg }}" + register: pause_result + when: + - not (skip_confirmation | default(false) | bool) + +- name: Fail if user does not confirm update synclist + ansible.builtin.fail: + msg: "{{ confirmation_fail_msg }}" + when: pause_result.user_input | default('yes') != 'yes' diff --git a/upgrade/roles/upgrade_omniadb/vars/main.yml b/utils/roles/update_synclist/vars/main.yml similarity index 53% rename from upgrade/roles/upgrade_omniadb/vars/main.yml rename to utils/roles/update_synclist/vars/main.yml index bf0762fea..8419c165b 100644 --- a/upgrade/roles/upgrade_omniadb/vars/main.yml +++ b/utils/roles/update_synclist/vars/main.yml @@ -14,7 +14,12 @@ --- # Usage: main.yml -provision_credentials_config_filename: "{{ role_path }}/../../../input/provision_config_credentials.yml" -provision_credentials_vault_path: "{{ role_path }}/../../../input/.provision_credential_vault_key" -provision_config_cred_syntax_fail_msg: "Failed. Syntax errors present in provision_credentials_config.yml. Fix errors and re-run playbook again." -path_to_discovery_commons: "{{ role_path }}/../../../discovery/roles/discovery_validations/common" +get_nodes_python_script: "{{ role_path }}/files/get_nodes_from_inventory.py" +update_dp_python_script: "{{ role_path }}/files/update_omniadb.py" +db_path: "{{ role_path }}/../../../discovery/roles/db_operations/files" + +# Usage: user_confirmation.yml +warning_msg: "[WARNING] This utility will update the synclist files for the inventory nodes whose status is failed." +warning_wait_time: 10 +confirmation_msg: "Are you sure you want to run the synclist update on the following nodes - {{ node_names.stdout }}? Type 'yes' to confirm." +confirmation_fail_msg: "Synclist confirmation failed" diff --git a/utils/roles/update_user_repo/tasks/check_os_versions.yml b/utils/roles/update_user_repo/tasks/check_os_versions.yml new file mode 100644 index 000000000..90d5b8197 --- /dev/null +++ b/utils/roles/update_user_repo/tasks/check_os_versions.yml @@ -0,0 +1,51 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- + +- name: Initialize facts + ansible.builtin.set_fact: + os_no_match: false + distro: "" + +- name: Load vars from software_config.json + ansible.builtin.include_vars: + file: "{{ software_config_json_file }}" + name: software_config + +- name: Set fact for cluster OS type + ansible.builtin.set_fact: + cluster_os_type: "{{ software_config.cluster_os_type }}" + +- name: Set fact for cluster OS version + ansible.builtin.set_fact: + cluster_os_version: "{{ software_config.cluster_os_version }}" + +- name: Set fact for Omnia Infrastructure Manager OS version + ansible.builtin.set_fact: + oim_os_version: "{{ ansible_distribution_version | lower }}" + +- name: Set flag if OS versions do not match + ansible.builtin.set_fact: + os_no_match: true + when: cluster_os_version != oim_os_version + +- name: Set distro to jammy (Ubuntu 22 OS) + ansible.builtin.set_fact: + distro: "jammy" + when: cluster_os_type == 'ubuntu' and cluster_os_version == '22.04' + +- name: Set distro to focal (Ubuntu 20 OS) + ansible.builtin.set_fact: + distro: "focal" + when: cluster_os_type == 'ubuntu' and cluster_os_version == '20.04' diff --git a/utils/roles/update_user_repo/tasks/create_software_repo_redhat.yml b/utils/roles/update_user_repo/tasks/create_software_repo_redhat.yml index 4651a8d52..6868382ba 100644 --- a/utils/roles/update_user_repo/tasks/create_software_repo_redhat.yml +++ b/utils/roles/update_user_repo/tasks/create_software_repo_redhat.yml @@ -20,7 +20,7 @@ - item.version | default("",true) | length>0 fail_msg: "{{ version_invalid_fail_msg }} {{ item.name }}" -- name: Check software repo exists in omnia repo on ControlPlane +- name: Check software repo exists in omnia repo on Omnia Infrastructure Manager ansible.builtin.stat: path: "{{ cluster_software_path }}/{{ item.name }}/{{ item.version }}" register: check_path_stat @@ -37,7 +37,7 @@ changed_when: false failed_when: check_repo_url.stderr!='' -- name: Create repo file if software version exists on CP +- name: Create repo file if software version exists on OIM ansible.builtin.template: src: "{{ repo_config_template_src }}" dest: "{{ repo_path }}/{{ item.name }}-{{ item.version }}-repo.repo" diff --git a/utils/roles/update_user_repo/tasks/create_software_repo_ubuntu.yml b/utils/roles/update_user_repo/tasks/create_software_repo_ubuntu.yml index 9a5aa3714..3e4938a57 100644 --- a/utils/roles/update_user_repo/tasks/create_software_repo_ubuntu.yml +++ b/utils/roles/update_user_repo/tasks/create_software_repo_ubuntu.yml @@ -20,7 +20,7 @@ - item.version | default("",true) | length>0 fail_msg: "{{ version_invalid_fail_msg }} {{ item.name }}" -- name: Check software repo exists in omnia repo on ControlPlane +- name: Check software repo exists in omnia repo on Omnia Infrastructure Manager ansible.builtin.stat: path: "{{ cluster_software_path }}/{{ item.name }}/{{ item.version }}" register: check_path_stat @@ -37,7 +37,7 @@ changed_when: false failed_when: check_repo_url.stderr!='' -- name: Create repo file if software version exists on CP +- name: Create repo file if software version exists on OIM ansible.builtin.template: src: "{{ repo_config_template_src }}" dest: "{{ repo_path }}/{{ item.name }}-{{ item.version }}.list" diff --git a/utils/roles/update_user_repo/tasks/main.yml b/utils/roles/update_user_repo/tasks/main.yml index 0f84640e9..e669ca3d0 100644 --- a/utils/roles/update_user_repo/tasks/main.yml +++ b/utils/roles/update_user_repo/tasks/main.yml @@ -23,6 +23,9 @@ - name: Validate the input files and values ansible.builtin.include_tasks:
validation.yml +- name: Validate repos configured during provisioning + ansible.builtin.include_tasks: validate_repo.yml + - name: Update repos on cluster for {{ cluster_os }} block: - name: Update user repos for nodes on {{ cluster_os }} diff --git a/utils/roles/update_user_repo/tasks/update_software_repo_redhat.yml b/utils/roles/update_user_repo/tasks/update_software_repo_redhat.yml index e8e8433c9..b20bdca30 100644 --- a/utils/roles/update_user_repo/tasks/update_software_repo_redhat.yml +++ b/utils/roles/update_user_repo/tasks/update_software_repo_redhat.yml @@ -66,7 +66,7 @@ - name: Configure omnia repositories ansible.builtin.template: - src: '"{{ omnia_repo_config_template_src }}"' + src: "{{ omnia_repo_config_template_src }}" dest: "{{ repo_path }}/omnia_repo_{{ item.0 + 1 }}{{ansible_date_time.iso8601_basic}}.repo" mode: "{{ file_permission }}" with_indexed_items: "{{ omnia_repo_url }}" @@ -76,9 +76,20 @@ ansible.builtin.command: dnf clean all changed_when: true -- name: Update the yum repos cache - ansible.builtin.command: dnf makecache - changed_when: true +- name: Execute update repos + block: + - name: Update yum repos cache + ansible.builtin.dnf: + update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" + + rescue: + - name: Update cache failure + ansible.builtin.fail: + msg: "{{ repo_update_failure_msg }}" - name: Run repolist command ansible.builtin.command: dnf repolist diff --git a/utils/roles/update_user_repo/tasks/update_software_repo_ubuntu.yml b/utils/roles/update_user_repo/tasks/update_software_repo_ubuntu.yml index fffe588b2..5a80dbf7d 100644 --- a/utils/roles/update_user_repo/tasks/update_software_repo_ubuntu.yml +++ b/utils/roles/update_user_repo/tasks/update_software_repo_ubuntu.yml @@ -13,6 +13,11 @@ # limitations under the License. 
--- +- name: Set default intel_config_status + ansible.builtin.set_fact: + intel_config_status: false + intelgaudi_config_status: false + - name: Update beegfs,amdgpu,rocm repo when repo_config=partial,always when: (software_config['repo_config']|lower == "partial") or (software_config['repo_config']|lower == "always") block: @@ -22,7 +27,33 @@ - name: Generate software repository configurations ansible.builtin.include_tasks: create_software_repo_ubuntu.yml loop: "{{ software_config.softwares + software_config.amdgpu | default([]) }}" - when: "'beegfs' in item.name or 'amdgpu' in item.name or 'rocm' in item.name" + when: "'beegfs' in item.name or 'amdgpu' in item.name or 'rocm' in item.name or 'intelgaudi' in item.name" + loop_control: + loop_var: item + + - name: Set intelgaudi config status + ansible.builtin.set_fact: + intelgaudi_config_status: true + intelgaudi_version: "{{ item.version }}" + loop: "{{ software_config.softwares | default([]) }}" + when: "'intelgaudi' in item.name" + loop_control: + loop_var: item + + - name: Set intel config status + ansible.builtin.set_fact: + intel_config_status: true + intel_version: "{{ intelgaudi_version }}" + loop: "{{ software_config.intelgaudi | default([]) }}" + when: "intelgaudi_config_status and 'intel' in item.name" + loop_control: + loop_var: item + + - name: Generate software repository configurations for intelgaudi + ansible.builtin.include_tasks: create_software_repo_ubuntu.yml + loop: + - { name: "intel", version: "{{ intel_version }}" } + when: intel_config_status loop_control: loop_var: item @@ -110,7 +141,7 @@ - name: Configure omnia repositories ansible.builtin.template: - src: '"{{ omnia_repo_config_template_src }}"' + src: "{{ omnia_repo_config_template_src }}" dest: "{{ tmp_omnia_repo_path }}/omnia_repo{{ item.0 + 1 }}{{ansible_date_time.iso8601_basic}}.list" mode: "{{ file_permission }}" with_indexed_items: "{{ omnia_repo_url }}" @@ -136,6 +167,10 @@ - name: Update packages ansible.builtin.apt: update_cache: true + register: update_repos + until: update_repos is not failed + retries: "{{ repo_retries }}" + delay: "{{ repo_delay }}" rescue: - name: Update cache failure diff --git a/utils/roles/update_user_repo/tasks/update_user_repo_redhat.yml b/utils/roles/update_user_repo/tasks/update_user_repo_redhat.yml index 49f6e0bea..b0e976c9d 100644 --- a/utils/roles/update_user_repo/tasks/update_user_repo_redhat.yml +++ b/utils/roles/update_user_repo/tasks/update_user_repo_redhat.yml @@ -52,7 +52,7 @@ - name: Configure user repositories ansible.builtin.template: - src: '"{{ usr_repo_config_template_src }}"' + src: "{{ usr_repo_config_template_src }}" dest: "{{ repo_path }}/a_user_repo{{ item.0 + 1 }}{{ansible_date_time.iso8601_basic}}.repo" mode: "{{ file_permission }}" with_indexed_items: "{{ user_repo_url }}" diff --git a/utils/roles/update_user_repo/tasks/update_user_repo_ubuntu.yml b/utils/roles/update_user_repo/tasks/update_user_repo_ubuntu.yml index cc067f3a7..5ba7bc5d7 100644 --- a/utils/roles/update_user_repo/tasks/update_user_repo_ubuntu.yml +++ b/utils/roles/update_user_repo/tasks/update_user_repo_ubuntu.yml @@ -18,6 +18,7 @@ beegfs_version: "{{ software_version_default }}" amdgpu_version: "{{ software_version_default }}" rocm_version: "{{ software_version_default }}" + intelgaudi_version: "{{ software_version_default }}" - name: Set facts for cluster ansible.builtin.set_fact: @@ -122,7 +123,7 @@ - name: Configure user repositories ansible.builtin.template: - src: '"{{ usr_repo_config_template_src }}"' + src: "{{ 
usr_repo_config_template_src }}" dest: "{{ tmp_user_repo_path }}/a_user_repo{{ item.0 + 1 }}{{ansible_date_time.iso8601_basic}}.list" mode: "{{ file_permission }}" with_indexed_items: "{{ user_repo_url | default([]) }}" @@ -142,3 +143,23 @@ src: "{{ tmp_user_repo_path }}/" dest: "{{ repo_path }}/" mode: "{{ file_permission }}" + +- name: Set fact for dist + ansible.builtin.set_fact: + dist: "{{ hostvars['127.0.0.1']['distro'] }}" + +- name: Configure deadsnakes PPA repository (Cross-OS) + when: hostvars['127.0.0.1']['os_no_match'] | bool + block: + - name: Add deadsnakes repo to apt sources list # noqa risky-file-permissions + ansible.builtin.lineinfile: + path: "{{ sources_repo_path }}" + line: "{{ deadsnake_repo }}" + create: true + state: present + become: true + + - name: Update apt package cache + ansible.builtin.apt: + update_cache: true + become: true diff --git a/utils/kernel_param_update/roles/kcmdline_update/tasks/validate_input.yml b/utils/roles/update_user_repo/tasks/validate_repo.yml similarity index 50% rename from utils/kernel_param_update/roles/kcmdline_update/tasks/validate_input.yml rename to utils/roles/update_user_repo/tasks/validate_repo.yml index e32b329a8..493bd6dc5 100644 --- a/utils/kernel_param_update/roles/kcmdline_update/tasks/validate_input.yml +++ b/utils/roles/update_user_repo/tasks/validate_repo.yml @@ -13,22 +13,24 @@ # limitations under the License. --- -- name: Inventory not provided +- name: Check omnia cluster repo exists + ansible.builtin.stat: + path: "{{ omnia_cluster_repo_path }}" + register: cluster_repo_path_check + +- name: Fail if omnia cluster repo does not exist ansible.builtin.fail: - msg: "{{ empty_inventory_fail_msg }}" + msg: "{{ omnia_cluster_repo_fail_msg }}" when: - - groups['all'] is defined - - (groups['all'] | length == 0) + - (software_config.repo_config == 'always' or software_config.repo_config == 'partial') + - not cluster_repo_path_check.stat.exists -- name: Include variables from kernel_param_update_config.yml - ansible.builtin.include_vars: - file: "{{ kcmdline_config_file }}" - name: kcmdline_vars +- name: Check pip.conf exists + ansible.builtin.stat: + path: "{{ omnia_pip_conf_path }}" + register: pip_conf_check -- name: Validate grub_commandline_kernel variable - ansible.builtin.assert: - that: - - kcmdline_vars.grub_commandline_kernel is defined - - kcmdline_vars.grub_commandline_kernel is string - - kcmdline_vars.grub_commandline_kernel | length > 0 - fail_msg: "{{ validation_fail_msg }}" +- name: Fail if pip.conf does not exist + ansible.builtin.fail: + msg: "{{ pip_conf_fail_msg }}" + when: not pip_conf_check.stat.exists diff --git a/utils/roles/update_user_repo/tasks/validation.yml b/utils/roles/update_user_repo/tasks/validation.yml index d05979add..6736b4121 100644 --- a/utils/roles/update_user_repo/tasks/validation.yml +++ b/utils/roles/update_user_repo/tasks/validation.yml @@ -31,7 +31,7 @@ - software_config.softwares is defined fail_msg: "{{ software_config_parameters_fail_msg }}" -- name: Assert the control_os_type is {{ os_type }} +- name: Assert the oim_os_type is {{ os_type }} ansible.builtin.assert: that: software_config.cluster_os_type == os_type fail_msg: "{{ cluster_os_type_fail_msg }}" diff --git a/utils/roles/update_user_repo/templates/repo_config_template.j2 b/utils/roles/update_user_repo/templates/repo_config_template.j2 index d9cc4e316..e300a8cb8 100644 --- a/utils/roles/update_user_repo/templates/repo_config_template.j2 +++ b/utils/roles/update_user_repo/templates/repo_config_template.j2 @@ -4,3 +4,6 @@
baseurl=http://{{ admin_nic_ip }}:80/install{{ omnia_repo_path }}/cluster/yum/{{ enabled=1 gpgcheck=0 skip_if_unavailable=True +{% if proxy_status %} +proxy=_none_ +{% endif %} diff --git a/utils/roles/update_user_repo/vars/redhat.yml b/utils/roles/update_user_repo/vars/redhat.yml index 91cbfef20..9dc6e71c9 100644 --- a/utils/roles/update_user_repo/vars/redhat.yml +++ b/utils/roles/update_user_repo/vars/redhat.yml @@ -25,6 +25,18 @@ usr_repo_config_template_src: "{{ role_path }}/templates/user_repo_config.j2" # Usage:update_software_repo_redhat.yml omnia_repo_config_template_src: "{{ role_path }}/templates/omnia_repo_config.j2" +repo_retries: 5 +repo_delay: 10 +repo_update_failure_msg: "The playbook failed due to an error while running dnf makecache. + Please check user_repo_url and omnia_repo_url_redhat in local_repo_config.yml and ensure they are correct and reachable" # Usage:update_user_repo_redhat.yml,update_software_repo_redhat.yml os_type: "rhel" + +# Usage: validate_repo.yml +omnia_cluster_repo_path: /etc/yum.repos.d/cluster-rpm-repo.repo +omnia_pip_conf_path: /etc/pip.conf +omnia_cluster_repo_fail_msg: "Failed. The playbook failed because the Omnia cluster repository file {{ omnia_cluster_repo_path }} is missing from the node. +This could be due to an issue during the OS installation. To resolve this, please reprovision the node and re-run the playbook." +pip_conf_fail_msg: "Failed. The playbook failed because the Omnia pip config file {{ omnia_pip_conf_path }} is missing from the node. +This could be due to an issue during the OS installation. To resolve this, please reprovision the node and re-run the playbook." diff --git a/utils/roles/update_user_repo/vars/rocky.yml b/utils/roles/update_user_repo/vars/rocky.yml index 029ac95e9..8e4ae93fe 100644 --- a/utils/roles/update_user_repo/vars/rocky.yml +++ b/utils/roles/update_user_repo/vars/rocky.yml @@ -25,6 +25,16 @@ usr_repo_config_template_src: "{{ role_path }}/templates/user_repo_config.j2" # Usage:update_software_repo_redhat.yml omnia_repo_config_template_src: "{{ role_path }}/templates/omnia_repo_config.j2" +repo_retries: 5 +repo_delay: 10 # Usage:update_user_repo_redhat.yml,update_software_repo_redhat.yml os_type: "rocky" + +# Usage: validate_repo.yml +omnia_cluster_repo_path: /etc/yum.repos.d/cluster-rpm-repo.repo +omnia_pip_conf_path: /etc/pip.conf +omnia_cluster_repo_fail_msg: "Failed. The playbook failed because the Omnia cluster repository file {{ omnia_cluster_repo_path }} is missing from the node. +This could be due to an issue during the OS installation. To resolve this, please reprovision the node and re-run the playbook." +pip_conf_fail_msg: "Failed. The playbook failed because the Omnia pip config file {{ omnia_pip_conf_path }} is missing from the node. +This could be due to an issue during the OS installation. To resolve this, please reprovision the node and re-run the playbook." 
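Reviewer note: the new `repo_retries`/`repo_delay` vars above back the block/rescue retry wrapped around the dnf cache refresh. A minimal, self-contained sketch of that pattern (hypothetical play and hosts, not part of this patch; values mirror the defaults defined above):

```yaml
# Hypothetical standalone play showing the retry-with-rescue cache
# refresh; repo_retries/repo_delay mirror the vars added above.
- hosts: all
  become: true
  vars:
    repo_retries: 5
    repo_delay: 10
  tasks:
    - name: Refresh dnf metadata with retries
      block:
        - name: Update yum repos cache
          ansible.builtin.dnf:
            update_cache: true
          register: update_repos
          until: update_repos is not failed
          retries: "{{ repo_retries }}"
          delay: "{{ repo_delay }}"
      rescue:
        - name: Fail with a pointer to the repo configuration
          ansible.builtin.fail:
            msg: "dnf makecache failed repeatedly; verify user_repo_url and omnia_repo_url_redhat in local_repo_config.yml."
```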
diff --git a/utils/roles/update_user_repo/vars/ubuntu.yml b/utils/roles/update_user_repo/vars/ubuntu.yml index 333b8049a..008ec8e96 100644 --- a/utils/roles/update_user_repo/vars/ubuntu.yml +++ b/utils/roles/update_user_repo/vars/ubuntu.yml @@ -15,6 +15,8 @@ # Usage: update_user_repo_ubuntu.yml,update_software_repo_ubuntu.yml,create_software_repo_ubuntu.yml repo_path: "/etc/apt/sources.list.d/" +sources_repo_path: "/etc/apt/sources.list.d/deadsnakes-ppa.list" +deadsnake_repo: "deb [trusted=yes] http://ppa.launchpad.net/deadsnakes/ppa/ubuntu {{ dist }} main" # Usage: create_software_repo_ubuntu.yml repo_config_template_src: "{{ role_path }}/templates/repo_config_template_ubuntu.j2" @@ -32,6 +34,16 @@ tmp_omnia_keyring_file_path: "{{ tmp_update_repo_file_path }}/omnia_repo_keyring tmp_omnia_repo_path: "{{ tmp_update_repo_file_path }}/omnia_repo_files" apt_update_failure_msg: "The playbook failed due to an error while running apt update. Please check user_repo_url and omnia_repo_url_ubuntu in local_repo_config.yml and ensure they are correct and reachable" +repo_retries: 5 +repo_delay: 10 # Usage:update_user_repo_ubuntu.yml,update_software_repo_ubuntu.yml os_type: "ubuntu" + +# Usage: validate_repo.yml +omnia_cluster_repo_path: /etc/apt/sources.list.d/cluster-deb.list +omnia_pip_conf_path: /etc/pip.conf +omnia_cluster_repo_fail_msg: "Failed. The playbook failed because the Omnia cluster repository file {{ omnia_cluster_repo_path }} is missing from the node. +This could be due to an issue during the OS installation. To resolve this, please reprovision the node and re-run the playbook." +pip_conf_fail_msg: "Failed. The playbook failed because the Omnia pip config file {{ omnia_pip_conf_path }} is missing from the node. +This could be due to an issue during the OS installation. To resolve this, please reprovision the node and re-run the playbook." 
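For context, once `dist` resolves (jammy for 22.04, focal for 20.04), `deadsnake_repo` renders to a single apt source line. A tiny hypothetical localhost play demonstrating the rendering:

```yaml
# Hypothetical localhost play showing how deadsnake_repo renders once
# dist is set (jammy for Ubuntu 22.04, focal for 20.04).
- hosts: localhost
  gather_facts: false
  vars:
    dist: "jammy"  # assumed; the role takes this from hostvars['127.0.0.1']['distro']
    deadsnake_repo: "deb [trusted=yes] http://ppa.launchpad.net/deadsnakes/ppa/ubuntu {{ dist }} main"
  tasks:
    - name: Show the rendered apt source line
      ansible.builtin.debug:
        msg: "{{ deadsnake_repo }}"
      # Prints: deb [trusted=yes] http://ppa.launchpad.net/deadsnakes/ppa/ubuntu jammy main
```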
diff --git a/server_spec_update/ansible.cfg b/utils/server_spec_update/ansible.cfg similarity index 72% rename from server_spec_update/ansible.cfg rename to utils/server_spec_update/ansible.cfg index 32c2fc8ef..5372a782f 100644 --- a/server_spec_update/ansible.cfg +++ b/utils/server_spec_update/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/server_spec_update/roles/initialize_hosts/tasks/initialize_facts_nodes.yml b/utils/server_spec_update/roles/initialize_hosts/tasks/initialize_facts_nodes.yml similarity index 94% rename from server_spec_update/roles/initialize_hosts/tasks/initialize_facts_nodes.yml rename to utils/server_spec_update/roles/initialize_hosts/tasks/initialize_facts_nodes.yml index ee491cff6..72946b775 100644 --- a/server_spec_update/roles/initialize_hosts/tasks/initialize_facts_nodes.yml +++ b/utils/server_spec_update/roles/initialize_hosts/tasks/initialize_facts_nodes.yml @@ -20,5 +20,5 @@ - name: Set the values for variable ansible.builtin.set_fact: - node_detail: "{{ inventory_hostname }}" + node_detail: "{{ ansible_host }}" categories: "{{ Categories }}" diff --git a/server_spec_update/roles/initialize_hosts/tasks/main.yml b/utils/server_spec_update/roles/initialize_hosts/tasks/main.yml similarity index 100% rename from server_spec_update/roles/initialize_hosts/tasks/main.yml rename to utils/server_spec_update/roles/initialize_hosts/tasks/main.yml diff --git a/server_spec_update/roles/add_nic_network/files/add_nic_xcat_network.py b/utils/server_spec_update/roles/network_update/files/add_nic_xcat_network.py similarity index 92% rename from server_spec_update/roles/add_nic_network/files/add_nic_xcat_network.py rename to utils/server_spec_update/roles/network_update/files/add_nic_xcat_network.py index 340758c52..387ffe4f9 100644 --- a/server_spec_update/roles/add_nic_network/files/add_nic_xcat_network.py +++ b/utils/server_spec_update/roles/network_update/files/add_nic_xcat_network.py @@ -13,7 +13,7 @@ # limitations under the License. 
import subprocess -import sys +import sys, os import yaml nic_info = {} @@ -21,8 +21,8 @@ sys.path.insert(0, cal_path) import calculate_ip_details -network_spec_path = sys.argv[2] -metadata_nic_info_path = sys.argv[3] +network_spec_path = os.path.abspath(sys.argv[2]) +metadata_nic_info_path = os.path.abspath(sys.argv[3]) with open(network_spec_path, "r") as file: data = yaml.safe_load(file) @@ -32,7 +32,7 @@ def run_command_nw_update(col, start_ip, end_ip, netmask_bits, nic_mode, mtu): netmask = details[0] subnet = details[1] nic_range = start_ip + '-' + end_ip - command = f"chdef -t network -o {col} net={subnet} mask={netmask} staticrange={start_ip}-{end_ip} mtu={mtu}" + command = f"/opt/xcat/bin/chdef -t network -o {col} net={subnet} mask={netmask} staticrange={start_ip}-{end_ip} mtu={mtu}" command_list = command.split() try: subprocess.run(command_list, capture_output=True) diff --git a/server_spec_update/roles/create_nicinfo_db/files/additional_nic_table.py b/utils/server_spec_update/roles/network_update/files/additional_nic_table.py similarity index 96% rename from server_spec_update/roles/create_nicinfo_db/files/additional_nic_table.py rename to utils/server_spec_update/roles/network_update/files/additional_nic_table.py index 6f25c269b..69ae16afe 100644 --- a/server_spec_update/roles/create_nicinfo_db/files/additional_nic_table.py +++ b/utils/server_spec_update/roles/network_update/files/additional_nic_table.py @@ -13,13 +13,13 @@ # limitations under the License. import yaml -import sys +import sys, os node_db_path = sys.argv[2] sys.path.insert(0, node_db_path) import omniadb_connection -network_spec_file_path = sys.argv[1] +network_spec_file_path = os.path.abspath(sys.argv[1]) with open(network_spec_file_path, "r") as file: data = yaml.safe_load(file) diff --git a/server_spec_update/roles/metadata_creation/files/nic_metadata_validation.py b/utils/server_spec_update/roles/network_update/files/nic_metadata_validation.py similarity index 70% rename from server_spec_update/roles/metadata_creation/files/nic_metadata_validation.py rename to utils/server_spec_update/roles/network_update/files/nic_metadata_validation.py index 16e817bfe..312f304c1 100644 --- a/server_spec_update/roles/metadata_creation/files/nic_metadata_validation.py +++ b/utils/server_spec_update/roles/network_update/files/nic_metadata_validation.py @@ -34,25 +34,26 @@ def fetch_nic_metadata_params(metadata_path): def validate_nic_metadata_params(network_data, md_data): """ Validates the network details in the NIC metadata file against the server specification data. - + Args: network_data (dict): A dictionary containing the network details in the NIC metadata file. md_data (dict): A dictionary containing the NIC metadata saved from the previous execution.
- + Returns: None """ - for net_key,net_value in network_data.items(): - if net_key not in ['admin_network', 'bmc_network']: - if net_key in md_data.keys(): - if('CIDR' in net_value.keys()): - if(net_value['CIDR'] != md_data['nic_metadata']['md_'+net_key+'_CIDR']): - sys.exit("md_"+net_key+"_CIDR"+" provided during previous execution is different from the value provided in current execution") - if('static_range' in net_value.keys()): - if(net_value['static_range'] != md_data['nic_metadata']['md_'+net_key+'_static_range']): - sys.exit("md_"+net_key+"_static_range"+" provided during previous execution is different from the value provided in current execution") - if(net_value['netmask_bits'] != md_data['nic_metadata']['md_'+net_key+'_netmask_bits']): - sys.exit("md_"+net_key+"_netmask_bits"+" provided during previous execution is different from the value provided in current execution") + if network_data: + for net_key,net_value in network_data.items(): + if net_key not in ['admin_network', 'bmc_network']: + if net_key in md_data.keys(): + if('CIDR' in net_value.keys()): + if(net_value['CIDR'] != md_data['nic_metadata']['md_'+net_key+'_CIDR']): + sys.exit("md_"+net_key+"_CIDR"+" provided during previous execution is different from the value provided in current execution") + if('static_range' in net_value.keys()): + if(net_value['static_range'] != md_data['nic_metadata']['md_'+net_key+'_static_range']): + sys.exit("md_"+net_key+"_static_range"+" provided during previous execution is different from the value provided in current execution") + if(net_value['netmask_bits'] != md_data['nic_metadata']['md_'+net_key+'_netmask_bits']): + sys.exit("md_"+net_key+"_netmask_bits"+" provided during previous execution is different from the value provided in current execution") def main(): """ @@ -62,7 +63,7 @@ def main(): fetches the metadata data from the NIC metadata file, validates the metadata data against the network data, and returns nothing. 
""" - nic_md_file_path = sys.argv[1] + nic_md_file_path = os.path.abspath(sys.argv[1]) network_string = os.environ.get('net_data') network_data = json.loads(network_string) md_data = fetch_nic_metadata_params(nic_md_file_path) diff --git a/server_spec_update/roles/metadata_update/files/update_nic_metadata.py b/utils/server_spec_update/roles/network_update/files/update_nic_metadata.py similarity index 72% rename from server_spec_update/roles/metadata_update/files/update_nic_metadata.py rename to utils/server_spec_update/roles/network_update/files/update_nic_metadata.py index 69d702894..89b6c1231 100644 --- a/server_spec_update/roles/metadata_update/files/update_nic_metadata.py +++ b/utils/server_spec_update/roles/network_update/files/update_nic_metadata.py @@ -29,13 +29,14 @@ def insert_nic_metadata_params(network_data, metadata_path): None """ nic_info = {'nic_metadata': {}} - for net_key,net_value in network_data.items(): - if(net_key not in ['admin_network', 'bmc_network']): - if('CIDR' in net_value.keys()): - nic_info['nic_metadata']['md_'+net_key+'_CIDR'] = net_value['CIDR'] - if('static_range' in net_value.keys()): - nic_info['nic_metadata']['md_'+net_key+'_static_range'] = net_value['static_range'] - nic_info['nic_metadata']['md_'+net_key+'_netmask_bits'] = net_value['netmask_bits'] + if network_data: + for net_key,net_value in network_data.items(): + if(net_key not in ['admin_network', 'bmc_network']): + if('CIDR' in net_value.keys()): + nic_info['nic_metadata']['md_'+net_key+'_CIDR'] = net_value['CIDR'] + if('static_range' in net_value.keys()): + nic_info['nic_metadata']['md_'+net_key+'_static_range'] = net_value['static_range'] + nic_info['nic_metadata']['md_'+net_key+'_netmask_bits'] = net_value['netmask_bits'] with open(metadata_path, 'w+') as file: yaml.dump(nic_info, file, default_flow_style=False) @@ -50,7 +51,7 @@ def main(): Returns: None """ - nic_md_file_path = sys.argv[1] + nic_md_file_path = os.path.abspath(sys.argv[1]) network_string = os.environ.get('net_data') network_data = json.loads(network_string) insert_nic_metadata_params(network_data, nic_md_file_path) diff --git a/server_spec_update/roles/create_nicinfo_db/tasks/add_nic_db.yml b/utils/server_spec_update/roles/network_update/tasks/add_nic_db.yml similarity index 100% rename from server_spec_update/roles/create_nicinfo_db/tasks/add_nic_db.yml rename to utils/server_spec_update/roles/network_update/tasks/add_nic_db.yml diff --git a/server_spec_update/roles/metadata_creation/tasks/create_nic_metadata.yml b/utils/server_spec_update/roles/network_update/tasks/create_nic_metadata.yml similarity index 100% rename from server_spec_update/roles/metadata_creation/tasks/create_nic_metadata.yml rename to utils/server_spec_update/roles/network_update/tasks/create_nic_metadata.yml diff --git a/utils/server_spec_update/roles/network_update/tasks/main.yml b/utils/server_spec_update/roles/network_update/tasks/main.yml new file mode 100644 index 000000000..bae6f3fdc --- /dev/null +++ b/utils/server_spec_update/roles/network_update/tasks/main.yml @@ -0,0 +1,38 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +- name: Update NIC details + when: add_network_status + block: + - name: Create Table for NIC Info + ansible.builtin.include_tasks: add_nic_db.yml + + - name: Create NIC Metadata File + ansible.builtin.include_tasks: create_nic_metadata.yml + + - name: Validate Input Parameters if Metadata Exists + when: metadata_status.stat.exists + ansible.builtin.include_tasks: validate_metadata_params.yml + + - name: Update Metadata + ansible.builtin.include_tasks: update_nic_metadata.yml + + - name: Create Files for Stanzas + ansible.builtin.file: + path: "{{ metadata_nicinfo_path }}" + state: touch + mode: "{{ file_perm }}" + + - name: Update Additional NIC Info in xCAT Networks Table + ansible.builtin.include_tasks: update_new_nic_network.yml diff --git a/server_spec_update/roles/add_nic_network/tasks/update_new_nic_network.yml b/utils/server_spec_update/roles/network_update/tasks/update_new_nic_network.yml similarity index 100% rename from server_spec_update/roles/add_nic_network/tasks/update_new_nic_network.yml rename to utils/server_spec_update/roles/network_update/tasks/update_new_nic_network.yml diff --git a/server_spec_update/roles/metadata_update/tasks/update_nic_metadata.yml b/utils/server_spec_update/roles/network_update/tasks/update_nic_metadata.yml similarity index 100% rename from server_spec_update/roles/metadata_update/tasks/update_nic_metadata.yml rename to utils/server_spec_update/roles/network_update/tasks/update_nic_metadata.yml diff --git a/server_spec_update/roles/metadata_creation/tasks/validate_metadata_params.yml b/utils/server_spec_update/roles/network_update/tasks/validate_metadata_params.yml similarity index 100% rename from server_spec_update/roles/metadata_creation/tasks/validate_metadata_params.yml rename to utils/server_spec_update/roles/network_update/tasks/validate_metadata_params.yml diff --git a/utils/server_spec_update/roles/network_update/vars/main.yml b/utils/server_spec_update/roles/network_update/vars/main.yml new file mode 100644 index 000000000..ee8e34f8c --- /dev/null +++ b/utils/server_spec_update/roles/network_update/vars/main.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
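For orientation, a minimal sketch of the /opt/omnia/.data/nic_metadata.yml stanza that these metadata tasks write and then compare against on re-runs; the network name data_network and the address values below are hypothetical, not taken from this patch:

nic_metadata:
  md_data_network_CIDR: "10.10.16.0/24"
  md_data_network_netmask_bits: "24"
  md_data_network_static_range: "10.10.16.11-10.10.16.200"

A mismatch between these stored values and the current network_spec.yml input is what makes nic_metadata_validation.py abort the run.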
+--- + +# Usage: update_new_nic_network.yml +python_version: "{{ ansible_python_interpreter }}" +update_nic_nw_path: "{{ role_path }}/files/add_nic_xcat_network.py" +nw_spec_path: "{{ role_path }}/../../../../input/network_spec.yml" +cal_path: "{{ role_path }}/../../../../discovery/roles/discovery_mechanism/mtms/files" +metadata_nicinfo_path: "/opt/omnia/.data/nic_metadata.yml" +file_perm: "0644" + +# Usage: add_nic_db.yml +add_nic_db_path: "{{ role_path }}/files/additional_nic_table.py" +network_spec_path: "{{ role_path }}/../../../../input/network_spec.yml" +node_db_path: "{{ role_path }}/../../../../discovery/roles/db_operations/files" + +# Usage: create_nic_metadata.yml +meta_path: "/opt/omnia/.data/nic_metadata.yml" +meta_dest: "/opt/omnia/.data/" +conf_file_mode: "0644" +mount_dir_perm: "0775" +meta_user: "root" +meta_group: "root" + +# Usage: validate_metadata_params.yml +validate_nic_metadata_py: "{{ role_path }}/files/nic_metadata_validation.py" + +# Usage: update_metadata.yml +update_nic_metadata_py: "{{ role_path }}/files/update_nic_metadata.py" diff --git a/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_redhat.yml b/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_redhat.yml similarity index 76% rename from utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_redhat.yml rename to utils/server_spec_update/roles/os_update/tasks/kcmdline_update_redhat.yml index 7cadd45b7..ff72589f1 100644 --- a/utils/kernel_param_update/roles/kcmdline_update/tasks/kcmdline_update_redhat.yml +++ b/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_redhat.yml @@ -13,12 +13,15 @@ # limitations under the License. --- -- name: Include variables from kernel_param_update_config.yml - ansible.builtin.include_vars: - file: "{{ kcmdline_config_file }}" - name: kcmdline_vars +- name: Take a backup of the grub file + ansible.builtin.copy: + src: "{{ grub_path }}" + dest: "{{ grub_path }}.bak" + mode: "{{ file_permission }}" + remote_src: true - name: Update kernel parameters - ansible.builtin.command: "sudo grubby --update-kernel=ALL --args='{{ kcmdline_vars.grub_commandline_kernel }}'" + ansible.builtin.command: "sudo grubby --update-kernel=ALL --args='{{ cmdline_value }}'" register: output changed_when: output.rc != 0 + when: cmdline_value != "" diff --git a/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_rocky.yml b/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_rocky.yml new file mode 100644 index 000000000..fbb60d4fd --- /dev/null +++ b/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_rocky.yml @@ -0,0 +1,4 @@ +--- +# Rocky follows the same grubby flow as RedHat; a bare filename is not a valid tasks file. +- name: Update kernel parameters using the RedHat flow + ansible.builtin.include_tasks: kcmdline_update_redhat.yml diff --git a/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_ubuntu.yml b/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_ubuntu.yml new file mode 100644 index 000000000..377fd0b03 --- /dev/null +++ b/utils/server_spec_update/roles/os_update/tasks/kcmdline_update_ubuntu.yml @@ -0,0 +1,42 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Take a backup of the grub file + ansible.builtin.copy: + src: "{{ grub_path }}" + dest: "{{ grub_path }}.bak" + mode: "{{ file_permission }}" + remote_src: true + +- name: Update kernel parameters on Ubuntu systems + when: cmdline_value != "" + block: + - name: Reading existing kernel parameters + ansible.builtin.shell: > + set -o pipefail && \ + cat "{{ grub_path }}" | grep '^GRUB_CMDLINE_LINUX=' | cut -d'"' -f2 + register: existing_grub_cmdline + changed_when: existing_grub_cmdline.rc != 0 + + - name: Adding grub_commandline_kernel to existing parameters + ansible.builtin.lineinfile: + path: "{{ grub_path }}" + regexp: '^GRUB_CMDLINE_LINUX=' + line: 'GRUB_CMDLINE_LINUX="{{ existing_grub_cmdline.stdout }} {{ cmdline_value }} "' + + - name: Update Grub configuration + ansible.builtin.command: sudo update-grub + register: output + changed_when: output.rc != 0 diff --git a/utils/server_spec_update/roles/os_update/tasks/main.yml b/utils/server_spec_update/roles/os_update/tasks/main.yml new file mode 100644 index 000000000..718f9c2d9 --- /dev/null +++ b/utils/server_spec_update/roles/os_update/tasks/main.yml @@ -0,0 +1,56 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
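As a worked example of the Ubuntu flow above (both the existing parameters and the cmdline_value are hypothetical): if /etc/default/grub currently contains GRUB_CMDLINE_LINUX="quiet splash" and cmdline_value is intel_iommu=on iommu=pt, the lineinfile task rewrites the line as follows before update-grub runs:

GRUB_CMDLINE_LINUX="quiet splash"
GRUB_CMDLINE_LINUX="quiet splash intel_iommu=on iommu=pt "

The trailing space comes from the line template itself; grub tolerates it. The node is then rebooted by reboot_nodes.yml so the new command line takes effect.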
+ +--- +- name: Getting the kernel parameters for each node + ansible.builtin.set_fact: + cmdline_value: >- + {{ + (item[hostvars[inventory_hostname]['Categories']] + | selectattr('os', 'defined') + | map(attribute='os') + | map('first') + | selectattr('kernel', 'defined') + | map(attribute='kernel') + | map('first') + | map(attribute='cmdline') + | list | length > 0) | ternary( + (item[hostvars[inventory_hostname]['Categories']] + | selectattr('os', 'defined') + | map(attribute='os') + | map('first') + | selectattr('kernel', 'defined') + | map(attribute='kernel') + | map('first') + | map(attribute='cmdline') + | list)[0], + '' + ) + }} + with_items: "{{ Categories }}" + when: hostvars[inventory_hostname]['Categories'] in item + +- name: Display warning if cmdline_value is empty + ansible.builtin.debug: + msg: "{{ warning_msg }}" + when: cmdline_value == "" or cmdline_value | length < 1 + +- name: Kernel parameters update + when: cmdline_value != "" + block: + - name: Adding kernel parameters for OS + ansible.builtin.include_tasks: "kcmdline_update_{{ ansible_distribution | lower }}.yml" + + - name: Reboot nodes + ansible.builtin.include_tasks: reboot_nodes.yml diff --git a/utils/kernel_param_update/roles/kcmdline_update/tasks/reboot_nodes.yml b/utils/server_spec_update/roles/os_update/tasks/reboot_nodes.yml similarity index 100% rename from utils/kernel_param_update/roles/kcmdline_update/tasks/reboot_nodes.yml rename to utils/server_spec_update/roles/os_update/tasks/reboot_nodes.yml diff --git a/utils/kernel_param_update/roles/kcmdline_update/vars/main.yml b/utils/server_spec_update/roles/os_update/vars/main.yml similarity index 67% rename from utils/kernel_param_update/roles/kcmdline_update/vars/main.yml rename to utils/server_spec_update/roles/os_update/vars/main.yml index 18ae43903..4e7ee2bcd 100644 --- a/utils/kernel_param_update/roles/kcmdline_update/vars/main.yml +++ b/utils/server_spec_update/roles/os_update/vars/main.yml @@ -12,16 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +# Usage: main.yml +warning_msg: "Warning: cmdline_value is empty for {{ inventory_hostname }}. No kernel command-line parameters will be set for this node." # Usage: kcmdline_update_redhat.yml kcmdline_update_rocky.yml kcmdline_update_ubuntu.yml -kcmdline_config_file: "{{ role_path }}../../../kernel_param_update_config.yml" grub_path: /etc/default/grub +file_permission: "0755" # Usage: reboot_nodes.yml reboot_fail_msg: "Failed. Nodes should be rebooted manually." -reboot_warning_msg: "[Warning] - Nodes will be rebooted" -warning_wait_time: 30 - -# Usage: validate_input.yml -validation_fail_msg: "grub_commandline_kernel variable is either undefined, not a string, or empty" -empty_inventory_fail_msg: "Failed. inventory not provided. Re-run playbook with inventory providing -i inventory." +reboot_warning_msg: "[WARNING] - NODES WILL BE REBOOTED. IF NODES ARE IN USE, YOU CAN ABORT THE TASK AND REBOOT MANUALLY LATER." +warning_wait_time: 60 diff --git a/utils/server_spec_update/roles/server_spec_validation/files/validate_inventory_file.py b/utils/server_spec_update/roles/server_spec_validation/files/validate_inventory_file.py new file mode 100644 index 000000000..5854d94f1 --- /dev/null +++ b/utils/server_spec_update/roles/server_spec_validation/files/validate_inventory_file.py @@ -0,0 +1,84 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import ast +import ipaddress +import json + +def validate_inventory(category_list, hostvars): + """ + Validates the inventory file by checking the validity of the host IP addresses and the presence of categories. + + Parameters: + category_list (list): A list of the category names defined in server_spec.yml. + hostvars (dict): A dictionary containing the host variables. + + Raises: + SystemExit: If the host IP is invalid or the categories are not provided in the inventory. + + Returns: + None + """ + # Validate hosts in inventory file + for host, host_data in hostvars.items(): + if 'ansible_host' in host_data.keys(): + host_ip = host_data['ansible_host'] + else: + host_ip = host_data['inventory_hostname'] + if len(host_ip.split('.')) != 4: + sys.exit(f"Failed, invalid host-ip in inventory: {host_ip}") + try: + # ip_address raises ValueError on bad input rather than returning a falsy value + ipaddress.ip_address(host_ip) + except ValueError: + sys.exit(f"Failed, invalid host-ip in inventory: {host_ip}") + + for host, host_data in hostvars.items(): + if 'Categories' not in host_data.keys(): + sys.exit(f"Failed, Categories not provided in inventory for host: {host}") + if len(host_data['Categories']) == 0: + sys.exit(f"Failed, Categories not provided in inventory for host: {host}") + + # Check if host is part of multiple groups + group_names = host_data.get('group_names', []) + if len(group_names) > 1: + sys.exit(f"Failed, host {host} is part of multiple groups: {group_names}. A host can only belong to one group.") + if group_names: + print(f"Host {host} belongs to group: {group_names[0]}") + + # Validate categories in inventory with server_spec + for host, host_data in hostvars.items(): + if 'Categories' in host_data.keys() and host_data['Categories'] not in category_list: + sys.exit(f"Failed, {host}: {host_data['Categories']} category in additional nic inventory not found in server_spec.yml.") + +def main(): + """ + Executes the main function of the program. + + This function reads the `category_list` and `host_data` environment variables. + It then calls the `validate_inventory` function with these arguments.
+ + Parameters: + None + + Returns: + None + """ + + category_list_str = os.environ.get('category_list') + hostvars_str = os.environ.get('host_data') + if not category_list_str or not hostvars_str: + sys.exit("Failed, invalid input") + # category_list is rendered by Ansible as a Python-style list literal, not JSON + category_list = ast.literal_eval(category_list_str) + hostvars = json.loads(hostvars_str) + validate_inventory(category_list, hostvars) + +if __name__ == "__main__": + main() diff --git a/server_spec_update/roles/nic_validation/files/validate_server_spec.py b/utils/server_spec_update/roles/server_spec_validation/files/validate_server_spec.py similarity index 51% rename from server_spec_update/roles/nic_validation/files/validate_server_spec.py rename to utils/server_spec_update/roles/server_spec_validation/files/validate_server_spec.py index 591c732bd..f2e1ef39e 100644 --- a/server_spec_update/roles/nic_validation/files/validate_server_spec.py +++ b/utils/server_spec_update/roles/server_spec_validation/files/validate_server_spec.py @@ -20,75 +20,86 @@ def fetch_server_spec_data(server_spec_file_path): """ Fetches server specification data from a YAML file and returns it as a dictionary. - + Args: server_spec_file_path (str): The path to the server specification YAML file. - + Returns: dict: A dictionary containing the server specification data. """ with open(server_spec_file_path, "r") as file: data = yaml.safe_load(file) + json_data = json.dumps(data) server_spec_data = json.loads(json_data) - category_data={} + category_data = {} for category in server_spec_data['Categories']: - for ctg_key , ctg_value in category.items(): - grp_dict={} - if(ctg_key in category_data.keys()): + for ctg_key, ctg_value in category.items(): + grp_dict = {} + if ctg_key in category_data.keys(): sys.exit("Duplicate group details found in server spec.") + for group in ctg_value: - for grp_key,grp_value in group.items(): - net_dict={} + for grp_key, grp_value in group.items(): + net_dict = {} + nicnetwork_set = set() # To track duplicate nicnetwork values for network in grp_value: - if(all(keys in net_dict.keys() for keys in dict(network).keys())): - sys.exit("Duplicate network details found in server spec.") - net_dict = net_dict | dict(network) + for net_key, net_val in network.items(): + if 'nicnetwork' in net_val: + if net_val['nicnetwork'] in nicnetwork_set: + sys.exit(f"Duplicate nicnetwork '{net_val['nicnetwork']}' found in group '{ctg_key}' in server spec.") + nicnetwork_set.add(net_val['nicnetwork']) + if all(keys in net_dict.keys() for keys in dict(network).keys()): + sys.exit("Duplicate network details found in server spec.") + net_dict.update(dict(network)) grp_dict[grp_key] = net_dict - category_data[ ctg_key ] = grp_dict + category_data[ctg_key] = grp_dict + + return category_data def validate_network_details(network_data, category_data): - """ - Validates the network details provided in the network_data and category_data dictionaries. - + """Validates the network details provided in the network_data and category_data dictionaries. Parameters: - network_data (dict): A dictionary containing network details. - category_data (dict): A dictionary containing category details. - Returns: - None - Raises: - SystemExit: If any validation fails, a SystemExit exception is raised with an appropriate error message.
""" for ctg_val in category_data.values(): - for grp_val in ctg_val.values(): - for net_val in grp_val.values(): - if(net_val.keys()): - if('nicnetwork' not in net_val.keys() or len(net_val['nicnetwork']) == 0): + for grp_key, grp_val in ctg_val.items(): + if grp_key == "os": + for ker_key, ker_val in grp_val.items(): + if ker_key == "kernel": + if ker_val is None: + sys.exit("Failed, cmdline variable is missing in server spec.") + if 'cmdline' not in ker_val[0] : + sys.exit("Failed, cmdline not defined") + if grp_key == "network": + for net_val in grp_val.values(): + if 'nicnetwork' not in net_val or not net_val['nicnetwork']: sys.exit("Failed, nicnetwork details missing in server spec.") - - if(net_val['nicnetwork'] not in network_data.keys()): + + if net_val['nicnetwork'] not in network_data.keys(): sys.exit("Invalid network name provided in server spec.") - if('nictypes' not in net_val.keys() or len(net_val['nictypes']) == 0): + if 'nictypes' not in net_val or not net_val['nictypes']: sys.exit("Failed, nictypes details missing in server spec.") - if(net_val['nictypes'] not in ['ethernet', 'infiniband', 'vlan']): + if net_val['nictypes'] not in ['ethernet', 'infiniband', 'vlan']: sys.exit("Invalid network type provided in server spec.") - - if(net_val['nictypes'] == 'vlan'): - if('nicdevices' not in net_val.keys() or len(net_val['nicdevices']) == 0): + + if net_val['nictypes'] == 'vlan': + if 'nicdevices' not in net_val or not net_val['nicdevices']: sys.exit("Nic device details missing in server spec.") - if(net_val['nictypes'] == 'vlan'): - if('VLAN' not in network_data[net_val['nicnetwork']].keys() - or len(network_data[net_val['nicnetwork']]['VLAN']) == 0): + if (net_val['nictypes'] == 'vlan'): + if ('VLAN' not in network_data[net_val['nicnetwork']].keys() or len(network_data[net_val['nicnetwork']]['VLAN']) == 0): sys.exit("VLAN ID not provided in network spec for VLAN network.") - + + elif grp_key == "os": + for ker_key, ker_val in grp_val.items(): + if ker_key == "kernel": + if ker_val is None: + sys.exit("Failed, cmdline variable is missing in server spec.") + if 'cmdline' not in ker_val[0] : + sys.exit("Failed, cmdline not defined") + def main(): """ This function is the main entry point of the program. It takes in a network specification file path as a command line argument and retrieves network data from an environment variable. It then fetches server specification data from the network specification file and validates the network details against the server specification data. 
@@ -99,11 +118,11 @@ def main(): Returns: - None """ - network_spec_file_path = sys.argv[1] + network_spec_file_path = os.path.abspath(sys.argv[1]) network_string = os.environ.get('net_data') network_data = json.loads(network_string) category_data = fetch_server_spec_data(network_spec_file_path) validate_network_details(network_data, category_data) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/server_spec_update/roles/nic_validation/tasks/include_network_spec.yml b/utils/server_spec_update/roles/server_spec_validation/tasks/include_network_spec.yml similarity index 100% rename from server_spec_update/roles/nic_validation/tasks/include_network_spec.yml rename to utils/server_spec_update/roles/server_spec_validation/tasks/include_network_spec.yml diff --git a/utils/server_spec_update/roles/server_spec_validation/tasks/include_server_spec.yml b/utils/server_spec_update/roles/server_spec_validation/tasks/include_server_spec.yml new file mode 100644 index 000000000..49f2b146d --- /dev/null +++ b/utils/server_spec_update/roles/server_spec_validation/tasks/include_server_spec.yml @@ -0,0 +1,36 @@ +--- + +- name: Initialize variables + ansible.builtin.set_fact: + add_kernel_param: false + +- name: Include server_spec.yml + block: + - name: Include server_spec file + ansible.builtin.include_vars: "{{ server_spec }}" + register: include_server_spec + no_log: true + tags: init + rescue: + - name: Failed to include server_spec.yml + ansible.builtin.fail: + msg: "{{ server_spec_syntax_fail_msg }} Error: {{ include_server_spec.message }}" + +- name: Parse server_spec data + ansible.builtin.set_fact: + category_data: "{{ category_data | default({}) | combine({item.key: item.value}) }}" + with_dict: "{{ Categories }}" + +- name: Set additional kernel parameter status + block: + - name: Set additional kernel parameter status + ansible.builtin.set_fact: + add_kernel_param: true + with_dict: "{{ category_data }}" + loop_control: + label: "{{ item.key }}" + when: item.value is defined and item.value | map(attribute='os') | select('defined') | list | length > 0 + rescue: + - name: Validating cmdline parameter + ansible.builtin.debug: + msg: "Kernel {{ item.key }} has invalid or missing cmdline parameter."
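For reference, a hypothetical input/server_spec.yml shape that is consistent with the parsing and validation logic above; every group, NIC, network, and kernel value here is illustrative only, not taken from this patch:

Categories:
  - grp1:
      - network:
          - nic1:
              nicnetwork: "data_network"
              nictypes: "ethernet"
      - os:
          - kernel:
              - cmdline: "intel_iommu=on iommu=pt"

Each category maps to a list of groups; network entries must name a network defined in network_spec.yml, and the os/kernel/cmdline chain is exactly what the set_fact extraction in the os_update role walks to collect kernel parameters.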
diff --git a/server_spec_update/roles/nic_validation/tasks/main.yml b/utils/server_spec_update/roles/server_spec_validation/tasks/main.yml similarity index 74% rename from server_spec_update/roles/nic_validation/tasks/main.yml rename to utils/server_spec_update/roles/server_spec_validation/tasks/main.yml index 36ccda059..4c24f11eb 100644 --- a/server_spec_update/roles/nic_validation/tasks/main.yml +++ b/utils/server_spec_update/roles/server_spec_validation/tasks/main.yml @@ -17,18 +17,27 @@ ansible.builtin.set_fact: inventory_status: false +- name: Inventory not provided + ansible.builtin.fail: + msg: "{{ empty_inventory_fail_msg }}" + when: + - groups['all'] is defined + - (groups['all'] | length == 0) + - name: Include network_spec file ansible.builtin.include_tasks: include_network_spec.yml +- name: Include server_spec file + ansible.builtin.include_tasks: include_server_spec.yml + - name: Validate Admin and BMC parameters ansible.builtin.include_tasks: validate_admin_bmc_nic.yml - name: Validate network spec input - ansible.builtin.include_tasks: "{{ role_path }}/../../../discovery/roles/discovery_validations/common/tasks/validate_network_spec.yml" + ansible.builtin.include_tasks: "{{ role_path }}/../../../../discovery/roles/discovery_validations/common/tasks/validate_network_spec.yml" - name: Include validate_server_spec file ansible.builtin.include_tasks: validate_server_spec.yml - when: add_network_status - name: Include validate inventory file ansible.builtin.include_tasks: validate_inventory.yml diff --git a/server_spec_update/roles/nic_validation/tasks/validate_admin_bmc_nic.yml b/utils/server_spec_update/roles/server_spec_validation/tasks/validate_admin_bmc_nic.yml similarity index 100% rename from server_spec_update/roles/nic_validation/tasks/validate_admin_bmc_nic.yml rename to utils/server_spec_update/roles/server_spec_validation/tasks/validate_admin_bmc_nic.yml diff --git a/server_spec_update/roles/nic_validation/tasks/validate_inventory.yml b/utils/server_spec_update/roles/server_spec_validation/tasks/validate_inventory.yml similarity index 52% rename from server_spec_update/roles/nic_validation/tasks/validate_inventory.yml rename to utils/server_spec_update/roles/server_spec_validation/tasks/validate_inventory.yml index 56908fb0a..f23ac07ca 100644 --- a/server_spec_update/roles/nic_validation/tasks/validate_inventory.yml +++ b/utils/server_spec_update/roles/server_spec_validation/tasks/validate_inventory.yml @@ -25,31 +25,22 @@ ansible.builtin.fail: msg: "{{ server_spec_syntax_fail_msg }} Error: {{ include_server_spec.message }}" -- name: Parse server_spec data +- name: Parse server_spec data for categories ansible.builtin.set_fact: - category_data: "{{ category_data | default({}) | combine({item.key: item.value}) }}" + category_list: "{{ category_list | default([]) + [item.key] }}" with_dict: "{{ Categories }}" -- name: Validate hosts in inventory file - ansible.builtin.assert: - that: - - item.key.split('.') | map('int') | list | length == 4 - - item.key | ansible.utils.ipaddr - fail_msg: "{{ inventory_ip_fail_msg }}{{ item.key }}" - with_dict: "{{ hostvars }}" - -- name: Validate categories provided from host - ansible.builtin.assert: - that: - - item.value.Categories is defined - - item.value.Categories | length > 0 - fail_msg: "{{ host_category_fail_msg }} {{ item.key }}" - with_dict: "{{ hostvars }}" - -- name: Validate categories in inventory with server_spec - ansible.builtin.assert: - that: - - item.value.Categories in category_data.keys() - fail_msg: "Failed, {{ 
item.key }} : {{ item.value.Categories }} {{ inventory_category_fail_msg }}" - with_dict: "{{ hostvars }}" - when: item.value.Categories is defined +- name: Validate inventory file + block: + - name: Validate the inventory file + ansible.builtin.command: | + {{ python_version }} {{ validate_inventory_py }} + register: script_output + changed_when: false + environment: + host_data: "{{ hostvars | to_json }}" + category_list: "{{ category_list }}" + rescue: + - name: Failed to validate inventory file + ansible.builtin.fail: + msg: "{{ inventory_fail_msg }} Error: {{ script_output.stderr }}" diff --git a/server_spec_update/roles/nic_validation/tasks/validate_server_spec.yml b/utils/server_spec_update/roles/server_spec_validation/tasks/validate_server_spec.yml similarity index 100% rename from server_spec_update/roles/nic_validation/tasks/validate_server_spec.yml rename to utils/server_spec_update/roles/server_spec_validation/tasks/validate_server_spec.yml diff --git a/server_spec_update/roles/nic_validation/vars/main.yml b/utils/server_spec_update/roles/server_spec_validation/vars/main.yml similarity index 72% rename from server_spec_update/roles/nic_validation/vars/main.yml rename to utils/server_spec_update/roles/server_spec_validation/vars/main.yml index 3639a94fc..679eb87ba 100644 --- a/server_spec_update/roles/nic_validation/vars/main.yml +++ b/utils/server_spec_update/roles/server_spec_validation/vars/main.yml @@ -13,20 +13,22 @@ # limitations under the License. --- +# Usage: main.yml +empty_inventory_fail_msg: "Failed. inventory not provided. Re-run playbook with inventory by providing -i inventory. +Inventory should contain groups mentioned in input/server_spec.yml. Refer examples/inventory/server_spec_inv for the inventory format." + # Usage: include_network_spec.yml -network_spec: "{{ role_path }}/../../../input/network_spec.yml" +network_spec: "{{ role_path }}/../../../../input/network_spec.yml" network_spec_syntax_fail_msg: "Failed. Syntax errors present in network_spec.yml. Fix errors and re-run playbook again." # Usage: validate_server_spec.yml -python_version: "python3.9" +python_version: "{{ ansible_python_interpreter }}" validate_server_spec_py: "{{ role_path }}/files/validate_server_spec.py" -server_spec: "{{ role_path }}/../../../input/server_spec.yml" +server_spec: "{{ role_path }}/../../../../input/server_spec.yml" # Usage: validate_server_spec.yml server_spec_syntax_fail_msg: "Failed. Syntax errors present in server_spec.yml. Fix errors and re-run playbook again." host_category_fail_msg: "Failed, Categories not provided in inventory for host: " -inventory_category_fail_msg: "category in additional nic inventory not found in server_spec.yml." -inventory_ip_fail_msg: "Failed, invalid host-ip in inventory: " # Usage: validate_network_spec.yml static_range_check_fail_msg: "Failed. static_range_check variable in network_spec should be within the netmask provided." @@ -37,11 +39,11 @@ netmask_bits_failure_msg: "Failed. admin and bmc netmask should be same." netmask_bits_success_msg: "Validated admin and bmc netmask bits" cidr_or_static_range_fail_msg: "Failed. network_spec should have either static_range or CIDR for the network." fail_msg_netmask_bits: "Failed. Invalid netmask_bits provided in network_spec file."
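The inventory format that validate_inventory_file.py expects is not shown in this patch; a sketch consistent with its checks (hosts named by IP, a Categories variable matching a server_spec category, one group per host) might look like the following, with all values hypothetical:

[grp1]
10.5.0.101 Categories=grp1
10.5.0.102 Categories=grp1

Hosts that appear in more than one group, or whose Categories value does not match a category key in server_spec.yml, make the script exit with a failure.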
-ip_range_netmask_script_script: "{{ role_path }}/../../../discovery/roles/discovery_validations/common/files/validate_ip_range_netmask.py" +ip_range_netmask_script_script: "{{ role_path }}/../../../../discovery/roles/discovery_validations/common/files/validate_ip_range_netmask.py" mtu_check_fail_msg: "Failed. MTU input variable in network_spec should be in proper integer format." -validate_cidr: "{{ role_path }}/../../../discovery/roles/discovery_validations/common/files/validate_cidr.py" +validate_cidr: "{{ role_path }}/../../../../discovery/roles/discovery_validations/common/files/validate_cidr.py" range_ip_check_fail_msg: "Failed. input ip range should be valid IP address (Eg. 192.168.1.1-198.168.1.255)." -validation_range_file: "{{ role_path }}/../../../discovery/roles/discovery_validations/common/files/validate_input_ranges.py" +validation_range_file: "{{ role_path }}/../../../../discovery/roles/discovery_validations/common/files/validate_input_ranges.py" fail_static_ip_range: "Failed, Network static overlaps with" fail_cidr_ip_range: "Failed, Cidr overlaps with" @@ -49,3 +51,7 @@ fail_cidr_ip_range: "Failed, Cidr overlaps with" meta_path: "/opt/omnia/.data/metadata.yml" fail_msg_metadata_missing: "Failed, discovery_provision.yml not executed hence metadata is missing. Execute discovery_provision.yml playbook and run again." fail_msg_invalid_metadata: "Failed. Invalid entries in metadata.yml. Execute discovery_provision.yml playbook and run again." + +# Usage: validate_inventory.yml +validate_inventory_py: "{{ role_path }}/files/validate_inventory_file.py" +inventory_fail_msg: "Failed, invalid inventory format: " diff --git a/server_spec_update/roles/update_db_details/files/calculate_ip_details.py b/utils/server_spec_update/roles/update_db_and_node_object/files/calculate_ip_details.py similarity index 100% rename from server_spec_update/roles/update_db_details/files/calculate_ip_details.py rename to utils/server_spec_update/roles/update_db_and_node_object/files/calculate_ip_details.py diff --git a/server_spec_update/roles/update_db_details/files/correlation_admin_add_nic.py b/utils/server_spec_update/roles/update_db_and_node_object/files/correlation_admin_add_nic.py similarity index 100% rename from server_spec_update/roles/update_db_details/files/correlation_admin_add_nic.py rename to utils/server_spec_update/roles/update_db_and_node_object/files/correlation_admin_add_nic.py diff --git a/server_spec_update/roles/update_db_details/files/insert_nicinfo_db.py b/utils/server_spec_update/roles/update_db_and_node_object/files/insert_nicinfo_db.py similarity index 75% rename from server_spec_update/roles/update_db_details/files/insert_nicinfo_db.py rename to utils/server_spec_update/roles/update_db_and_node_object/files/insert_nicinfo_db.py index a428ce3dc..e507e34e1 100644 --- a/server_spec_update/roles/update_db_details/files/insert_nicinfo_db.py +++ b/utils/server_spec_update/roles/update_db_and_node_object/files/insert_nicinfo_db.py @@ -15,12 +15,19 @@ import psycopg2 as pg from cryptography.fernet import Fernet +""" +This module contains functions for inserting NIC information into a database. 
+""" + +key_file_path = '/opt/omnia/.postgres/.postgres_pass.key' +pass_file_path = '/opt/omnia/.postgres/.encrypted_pwd' + def create_connection(): - with open('/opt/omnia/.postgres/.postgres_pass.key', 'rb') as passfile: + with open(key_file_path, 'rb') as passfile: key = passfile.read() fernet = Fernet(key) - with open('/opt/omnia/.postgres/.encrypted_pwd', 'rb') as datafile: + with open(pass_file_path, 'rb') as datafile: encrypted_file_data = datafile.read() decrypted_pwd = fernet.decrypt(encrypted_file_data).decode() # Create database connection @@ -37,16 +44,16 @@ def create_connection(): def check_presence_id(cursor, id): """ - Check presence of bmc ip in DB. - Parameters: - cursor: Pointer to omniadb DB. - id: id whose presence we need to check in DB. - Returns: - bool: that gives true or false if the bmc ip is present in DB. + Check presence of bmc ip in DB. + Parameters: + cursor: Pointer to omniadb DB. + id: id whose presence we need to check in DB. + Returns: + bool: that gives true or false if the bmc ip is present in DB. """ - query = f'''SELECT EXISTS(SELECT id FROM cluster.nicinfo WHERE id='{id}')''' - cursor.execute(query) + query = f'''SELECT EXISTS(SELECT id FROM cluster.nicinfo WHERE id=%s)''' + cursor.execute(query, (id,)) output = cursor.fetchone()[0] return output @@ -54,8 +61,8 @@ def check_presence_id(cursor, id): def insert_nic_info(ip, db_data): conn = create_connection() cursor = conn.cursor() - sql_query = f"SELECT id FROM cluster.nodeinfo where admin_ip='{ip}'" - cursor.execute(sql_query) + sql_query = "SELECT id FROM cluster.nodeinfo where admin_ip=%s" + cursor.execute(sql_query, (ip,)) id_no = cursor.fetchone() if id_no is not None: op = check_presence_id(cursor, id_no[0]) @@ -72,10 +79,11 @@ def insert_nic_info(ip, db_data): elif op: set_clause = ', '.join(f'{col} = COALESCE({col}, %({col})s)' if col != 'category' and col.endswith( 'ip') else f'{col} = %({col})s' for col in db_data.keys()) - query = f"UPDATE cluster.nicinfo SET {set_clause} WHERE id = {id_no[0]}" + query = f"UPDATE cluster.nicinfo SET {set_clause} WHERE id=%(id)s" + try: print("DB data=", db_data) - cursor.execute(query, db_data) + cursor.execute(query, {**db_data, 'id': id_no[0]}) except Exception as e: print(e) @@ -83,4 +91,4 @@ def insert_nic_info(ip, db_data): print(ip, " Not present in the DB. Please provide proper IP") cursor.close() - conn.close() + conn.close() \ No newline at end of file diff --git a/server_spec_update/roles/update_db_details/files/uncorrelated_add_ip.py b/utils/server_spec_update/roles/update_db_and_node_object/files/uncorrelated_add_ip.py similarity index 75% rename from server_spec_update/roles/update_db_details/files/uncorrelated_add_ip.py rename to utils/server_spec_update/roles/update_db_and_node_object/files/uncorrelated_add_ip.py index c05314ab1..e9d6d21ea 100644 --- a/server_spec_update/roles/update_db_details/files/uncorrelated_add_ip.py +++ b/utils/server_spec_update/roles/update_db_and_node_object/files/uncorrelated_add_ip.py @@ -14,7 +14,12 @@ import ipaddress import sys +from psycopg2.extensions import AsIs +""" + This module provides functionality for calculating and + validating the uncorrelated admin IP for a node. +""" def cal_nic_ip(cursor, col, ip, end_ip): end_ip = ipaddress.IPv4Address(end_ip) @@ -34,25 +39,23 @@ def cal_nic_ip(cursor, col, ip, end_ip): "We have reached the end of ranges. 
Please do a cleanup and provide a wider nic_range, if more nodes needs to be discovered.") return str(nic_ip) - - def check_presence_ip(cursor, col, ip): """ - Check presence of bmc ip in DB. - Parameters: - cursor: Pointer to omniadb DB. - col: the col name in the DB - ip: ip whose presence we need to check in DB. - Returns: - bool: that gives true or false if the bmc ip is present in DB. + Check presence of bmc ip in DB. + Parameters: + cursor: Pointer to omniadb DB. + col: the col name in the DB + ip: ip whose presence we need to check in DB. + Returns: + bool: that gives true or false if the bmc ip is present in DB. """ - - query = f'''SELECT EXISTS(SELECT {col}_ip FROM cluster.nicinfo WHERE {col}_ip='{ip}')''' - cursor.execute(query) + query = '''SELECT EXISTS(SELECT %s FROM cluster.nicinfo WHERE %s=%s)''' + cursor.execute(query, (AsIs(f"{col}_ip"), AsIs(f"{col}_ip"), str(ip))) output = cursor.fetchone()[0] return output + def cal_uncorrelated_add_ip(cursor, col, nic_mode, nic_range): """ Calculates the uncorrelated node admin ip, if correlation is false, or it is not possible. @@ -73,8 +76,8 @@ def cal_uncorrelated_add_ip(cursor, col, nic_mode, nic_range): rows_exist = cursor.fetchone()[0] if nic_mode == "static": if rows_exist: - sql = f'''select {col}_ip from cluster.nicinfo where {col}_ip is not NULL ORDER BY {col}_ip DESC LIMIT 1''' - cursor.execute(sql) + sql = f'''select %s from cluster.nicinfo where %s is not NULL ORDER BY %s DESC LIMIT 1''' + cursor.execute(sql, (AsIs(f"{col}_ip"), AsIs(f"{col}_ip"), AsIs(f"{col}_ip"))) last_nic_ip = cursor.fetchone() if last_nic_ip is None: return str(start_nic_ip) @@ -85,8 +88,8 @@ def cal_uncorrelated_add_ip(cursor, col, nic_mode, nic_range): return str(start_nic_ip) if nic_mode == "cidr": if rows_exist: - sql = f'''select {col}_ip from cluster.nicinfo where {col}_ip is not NULL ORDER BY {col}_ip DESC LIMIT 1''' - cursor.execute(sql) + sql = f'''select %s from cluster.nicinfo where %s is not NULL ORDER BY %s DESC LIMIT 1''' + cursor.execute(sql, (AsIs(f"{col}_ip"), AsIs(f"{col}_ip"), AsIs(f"{col}_ip"))) last_nic_ip = cursor.fetchone() if last_nic_ip is None: return str(start_nic_ip) @@ -94,4 +97,4 @@ def cal_uncorrelated_add_ip(cursor, col, nic_mode, nic_range): nic_ip = cal_nic_ip(cursor, col, last_nic_ip[0], str(end_nic_ip)) return str(nic_ip) else: - return str(start_nic_ip) + return str(start_nic_ip) \ No newline at end of file diff --git a/server_spec_update/roles/update_db_details/files/update_nicinfo_db.py b/utils/server_spec_update/roles/update_db_and_node_object/files/update_nicinfo_db.py similarity index 88% rename from server_spec_update/roles/update_db_details/files/update_nicinfo_db.py rename to utils/server_spec_update/roles/update_db_and_node_object/files/update_nicinfo_db.py index 098f75468..00820e2e3 100644 --- a/server_spec_update/roles/update_db_details/files/update_nicinfo_db.py +++ b/utils/server_spec_update/roles/update_db_and_node_object/files/update_nicinfo_db.py @@ -12,19 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys +import sys, os import yaml import ipaddress import uncorrelated_add_ip import correlation_admin_add_nic import insert_nicinfo_db -server_spec_file_path = sys.argv[1] +def validate_input(value): + """ + Validates the input value. + Raises: + ValueError: If the value is empty. 
+ """ + if value: + return value + + raise ValueError("Node details cannot be empty") + +server_spec_file_path = os.path.abspath(sys.argv[1]) category_nm = sys.argv[2] -metadata_path = sys.argv[3] +metadata_path = os.path.abspath(sys.argv[3]) admin_static_range = sys.argv[4] admin_nb = sys.argv[5] -node_detail = sys.argv[6] +node_detail = validate_input(sys.argv[6]) with open(server_spec_file_path, "r") as file: data = yaml.safe_load(file) @@ -77,7 +88,9 @@ def update_db_nicinfo(): db_data['category'] = cat_nm for col in value: for grp_key, grp_value in col.items(): - for network in grp_value: +# for network in grp_value: + if grp_key == 'Network' or grp_key == 'network': + for network in grp_value: for net_key, net_value in network.items(): nic_nw = net_value.get('nicnetwork') nic_nam = net_key diff --git a/server_spec_update/roles/update_node_object/files/update_node_objects.py b/utils/server_spec_update/roles/update_db_and_node_object/files/update_node_objects.py similarity index 74% rename from server_spec_update/roles/update_node_object/files/update_node_objects.py rename to utils/server_spec_update/roles/update_db_and_node_object/files/update_node_objects.py index 37548b226..6157dcdcd 100644 --- a/server_spec_update/roles/update_node_object/files/update_node_objects.py +++ b/utils/server_spec_update/roles/update_db_and_node_object/files/update_node_objects.py @@ -13,7 +13,7 @@ # limitations under the License. import subprocess -import sys +import sys, os import yaml import time @@ -21,7 +21,7 @@ sys.path.insert(0, db_path) import omniadb_connection -network_spec_file_path = sys.argv[1] +network_spec_file_path = os.path.abspath(sys.argv[1]) with open(network_spec_file_path, "r") as file: data = yaml.safe_load(file) @@ -69,30 +69,30 @@ def update_node_obj(): # updating node objects if network_nic is not None and network_ip is not None and network_type is not None: if network_type != "vlan": - command = ["chdef", node_name, f"nictypes.{network_nic}={network_type}"] + command = ["/opt/xcat/bin/chdef", node_name, f"nictypes.{network_nic}={network_type}"] subprocess.run(command) - command = ["chdef", node_name, f"nicips.{network_nic}={network_ip}"] + command = ["/opt/xcat/bin/chdef", node_name, f"nicips.{network_nic}={network_ip}"] subprocess.run(command) - command = ["chdef", node_name, f"nicnetworks.{network_nic}={network_name}"] + command = ["/opt/xcat/bin/chdef", node_name, f"nicnetworks.{network_nic}={network_name}"] subprocess.run(command) else: sql_query_network = f"SELECT {network_name}_device FROM cluster.nicinfo WHERE id = %s" cursor.execute(sql_query_network, (row[0],)) primary_nic = cursor.fetchone() - + if primary_nic[0]: - command = ["chdef", node_name, f"nictypes.{primary_nic[0]}=ethernet"] + command = ["/opt/xcat/bin/chdef", node_name, f"nictypes.{primary_nic[0]}=ethernet"] subprocess.run(command) - command = ["chdef", node_name, f"nicips.{network_nic}={network_ip}"] + command = ["/opt/xcat/bin/chdef", node_name, f"nicips.{network_nic}={network_ip}"] subprocess.run(command) - command = ["chdef", node_name, f"nicnetworks.{network_nic}={network_name}"] + command = ["/opt/xcat/bin/chdef", node_name, f"nicnetworks.{network_nic}={network_name}"] subprocess.run(command) - command = ["chdef", node_name, f"nictypes.{network_nic}={network_type}", f"nicdevices.{network_nic}={primary_nic[0]}"] + command = ["/opt/xcat/bin/chdef", node_name, f"nictypes.{network_nic}={network_type}", f"nicdevices.{network_nic}={primary_nic[0]}"] subprocess.run(command) - command = ["chdef", node_name, 
f"nichostnamesuffixes.{network_nic}=-{primary_nic[0]}"] + command = ["/opt/xcat/bin/chdef", node_name, f"nichostnamesuffixes.{network_nic}=-{primary_nic[0]}"] subprocess.run(command) - command = ["updatenode", node_name, "-P", "confignetwork,omnia_hostname"] + command = ["/opt/xcat/bin/updatenode", node_name, "-P", "confignetwork,omnia_hostname"] subprocess.run(command) time.sleep(1) diff --git a/server_spec_update/roles/update_db_details/tasks/main.yml b/utils/server_spec_update/roles/update_db_and_node_object/tasks/main.yml similarity index 86% rename from server_spec_update/roles/update_db_details/tasks/main.yml rename to utils/server_spec_update/roles/update_db_and_node_object/tasks/main.yml index d9d3d45ec..16da3144e 100644 --- a/server_spec_update/roles/update_db_details/tasks/main.yml +++ b/utils/server_spec_update/roles/update_db_and_node_object/tasks/main.yml @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ---- -- name: Update the DB +- name: Update DB and Node Object when: add_network_status block: - name: Update network db for nic info ansible.builtin.include_tasks: update_network_info.yml + + - name: Update node object + ansible.builtin.include_tasks: update_nodes.yml diff --git a/server_spec_update/roles/update_db_details/tasks/update_network_info.yml b/utils/server_spec_update/roles/update_db_and_node_object/tasks/update_network_info.yml similarity index 100% rename from server_spec_update/roles/update_db_details/tasks/update_network_info.yml rename to utils/server_spec_update/roles/update_db_and_node_object/tasks/update_network_info.yml diff --git a/server_spec_update/roles/update_node_object/tasks/update_nodes.yml b/utils/server_spec_update/roles/update_db_and_node_object/tasks/update_nodes.yml similarity index 94% rename from server_spec_update/roles/update_node_object/tasks/update_nodes.yml rename to utils/server_spec_update/roles/update_db_and_node_object/tasks/update_nodes.yml index c612771f3..e030b8ffc 100644 --- a/server_spec_update/roles/update_node_object/tasks/update_nodes.yml +++ b/utils/server_spec_update/roles/update_db_and_node_object/tasks/update_nodes.yml @@ -18,6 +18,6 @@ {{ python_version }} {{ update_node_objects_path }} {{ network_spec_file_path }} {{ omnia_db_path }} changed_when: false -- name: Successful execution of server_spec_update.yml +- name: Network configuration completed ansible.builtin.debug: msg: "{{ server_sepc_update_success_msg }}" diff --git a/server_spec_update/roles/update_db_details/vars/main.yml b/utils/server_spec_update/roles/update_db_and_node_object/vars/main.yml similarity index 58% rename from server_spec_update/roles/update_db_details/vars/main.yml rename to utils/server_spec_update/roles/update_db_and_node_object/vars/main.yml index a9d74a6c8..0c39ce56e 100644 --- a/server_spec_update/roles/update_db_details/vars/main.yml +++ b/utils/server_spec_update/roles/update_db_and_node_object/vars/main.yml @@ -14,12 +14,20 @@ --- # Usage: update_network_info.yml -python_version: "python3.9" +python_version: "{{ ansible_python_interpreter }}" update_network_db: "{{ role_path }}/files/update_nicinfo_db.py" -server_spec_path: "{{ role_path }}/../../../input/server_spec.yml" +server_spec_path: "{{ role_path }}/../../../../input/server_spec.yml" metadata_nicinfo_path: "/opt/omnia/.data/nic_metadata.yml" -node_db_path: "{{ role_path }}/../../../discovery/roles/db_operations/files" 
+node_db_path: "{{ role_path }}/../../../../discovery/roles/db_operations/files" update_network_db_fail_msg: "Failed. Unable to configure networks on the nodes. Please verify inputs in server_spec.yml & network_spec.yml, and re-run the playbook. Make sure no changes are made to the existing groups in the server_spec.yml and existing networks in the network_spec.yml. For any changes add new groups in server_spec.yml or new networks in network_spec.yml" + +# Usage: update_nodes.yml +network_spec_file_path: "{{ role_path }}/../../../../input/network_spec.yml" +update_node_objects_path: "{{ role_path }}/files/update_node_objects.py" +omnia_db_path: "{{ role_path }}/../../../../discovery/roles/db_operations/files" +server_sepc_update_success_msg: "Network changes executed. Verify the networks configured on the nodes. +Networks might not be configured if invalid NIC names are provided in the input file, server_spec.yml. +If vlan is not configured for a NIC ensure vlan name is provided in the format NIC.vlan_id(eth1.101) in server_spec.yml and re-run the playbook." diff --git a/server_spec_update/server_spec_update.yml b/utils/server_spec_update/server_spec_update.yml similarity index 63% rename from server_spec_update/server_spec_update.yml rename to utils/server_spec_update/server_spec_update.yml index 5e2e12669..1c946f7e8 100644 --- a/server_spec_update/server_spec_update.yml +++ b/utils/server_spec_update/server_spec_update.yml @@ -13,15 +13,18 @@ # limitations under the License. --- -- name: Additional nic update +- name: Servicetag Host mapping # noqa: role-name[path] + hosts: localhost + gather_facts: true + roles: + - ../roles/servicetag_host_mapping + +- name: Updating server specifications hosts: localhost connection: local roles: - - role: nic_validation - - role: create_nicinfo_db - - role: metadata_creation - - role: metadata_update - - role: add_nic_network + - role: server_spec_validation + - role: network_update - name: Additional nic update hosts: all @@ -31,5 +34,13 @@ - name: Additional nic update hosts: localhost roles: - - role: update_db_details - - role: update_node_object + - role: update_db_and_node_object + +- name: Apply OS-specific kernel parameters + hosts: all + become: true + connection: ssh + vars_files: + - "{{ playbook_dir }}/../../input/server_spec.yml" + roles: + - os_update diff --git a/utils/servicetag_host_mapping.yml b/utils/servicetag_host_mapping.yml index a361721ce..e9fac7789 100644 --- a/utils/servicetag_host_mapping.yml +++ b/utils/servicetag_host_mapping.yml @@ -13,6 +13,10 @@ # limitations under the License. 
--- +- name: Check if virtual environment is active + ansible.builtin.import_playbook: ../utils/check_venv.yml + when: not ( hostvars['127.0.0.1']['check_venv_executed'] | default(false) | bool ) + - name: Host Mapping hosts: localhost connection: local diff --git a/utils/software_update/ansible.cfg b/utils/software_update/ansible.cfg index 1d9b5f807..751f1a104 100644 --- a/utils/software_update/ansible.cfg +++ b/utils/software_update/ansible.cfg @@ -4,6 +4,7 @@ host_key_checking = false forks = 5 timeout = 180 executable = /bin/bash +collections_path = $VIRTUAL_ENV [persistent_connection] command_timeout = 180 @@ -11,4 +12,4 @@ connect_timeout = 180 [ssh_connection] retries = 3 -ssh_args = -o ControlMaster=auto -o ControlPersist=180 \ No newline at end of file +ssh_args = -o ControlMaster=auto -o ControlPersist=180 diff --git a/utils/software_update/roles/software_update/tasks/include_software_config.yml b/utils/software_update/roles/software_update/tasks/include_software_config.yml index 63f888bea..7848f363d 100644 --- a/utils/software_update/roles/software_update/tasks/include_software_config.yml +++ b/utils/software_update/roles/software_update/tasks/include_software_config.yml @@ -18,6 +18,7 @@ beegfs_version: "omnia_default" amdgpu_version: "omnia_default" rocm_version: "omnia_default" + intelgaudi_version: "omnia_default" - name: Load software_config.json as software_config block: diff --git a/utils/software_update/software_update_config.yml b/utils/software_update/software_update_config.yml index c4f98d715..9bf701325 100644 --- a/utils/software_update/software_update_config.yml +++ b/utils/software_update/software_update_config.yml @@ -13,18 +13,19 @@ # limitations under the License. --- -#*********************************************************************** -# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. +# *********************************************************************** +# DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. -#*********************************************************************** +# *********************************************************************** # Mandatory, when package_list is not provided # This variable contains the list of software groups mentioned in software_config.json # Example: # softwares_list: # - custom -# In above case, user is required to create custom.json under input/config/{{ cluster_os_type }}/{{ cluster_os_version }}/custom.json, Eg: input/config/ubuntu/22.04/custom.json file. -# This json should contain the list of packages, either .deb(Ubuntu) or .rpm(RHEL/Rocky) that should be installed on remote nodes. +# In the above case, the user is required to create custom.json +# under input/config/{{ cluster_os_type }}/{{ cluster_os_version }}/custom.json, Eg: input/config/ubuntu/22.04/custom.json file. +# This json should contain the list of packages, either .deb(Ubuntu) or .rpm(RHEL/Rocky) that should be installed on remote nodes.
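A sketch of what such a custom.json could look like; the key layout and the package fields below are assumptions for illustration, not taken from this patch, so check the shipped config examples for the exact schema:

{
  "custom": {
    "cluster": [
      { "package": "jq", "type": "deb" },
      { "package": "htop", "type": "deb" }
    ]
  }
}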
softwares_list: # Mandatory, when softwares_list is not provided diff --git a/utils/test/test_thirdparty.yml b/utils/test/test_thirdparty.yml index 8e9da2496..4e3fa341c 100644 --- a/utils/test/test_thirdparty.yml +++ b/utils/test/test_thirdparty.yml @@ -15,7 +15,7 @@ - name: Validate pdsh, pdsh-rcmd-ssh, clustershell installation after execution of omnia.yml hosts: localhost vars_files: - - {{ playbook_dir }}/test_vars/test_thirdparty_vars.yml + - "{{ playbook_dir }}/test_vars/test_thirdparty_vars.yml" tasks: - name: Validate pdsh installation block: diff --git a/utils/timescaledb_utility/roles/timescaledb_utility/files/dump_data_from_db.py b/utils/timescaledb_utility/roles/timescaledb_utility/files/dump_data_from_db.py index 3973847bb..6a6fee86c 100644 --- a/utils/timescaledb_utility/roles/timescaledb_utility/files/dump_data_from_db.py +++ b/utils/timescaledb_utility/roles/timescaledb_utility/files/dump_data_from_db.py @@ -13,73 +13,138 @@ # limitations under the License. #!/usr/bin/env python3 +""" +This module performs the task of dumping data from the database to a CSV file. +It securely connects to the database, validates input parameters, fetches valid +column names from the database, and retrieves data based on column names and timestamps. -''' - This module contains tasks required for database update - The query should be created along with timestamp before updating - the database. -''' +- username and password are fetched from telemetry_config.yml +- host (localhost), port (from kubectl get svc) and dbname are passed in from the role vars through Ansible +- column_name, column_value, start_time and stop_time come from timescaledb_utility_config.yml +""" import sys +import re import psycopg2 -import pandas +import pandas as pd from psycopg2.extensions import AsIs +import argparse -dbuser = sys.argv[1] -dbpwd = sys.argv[2] -dbhost = sys.argv[3] -dbport = sys.argv[4] -dbtelemetry = sys.argv[5] -column_name = sys.argv[6] -column_value = sys.argv[7] -start_time = sys.argv[8] -stop_time = sys.argv[9] -filename= sys.argv[10] +# Patterns for validation - 'YYYY-MM-DD HH:MM:SS+TZ' +TIMESTAMP_PATTERN = r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\+\d{2}:\d{2})?$' + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Dump data from the database to a CSV file.") + parser.add_argument("user", type=str, help="Username for the database") + parser.add_argument("password", type=str, help="Password for the database") + parser.add_argument("host", type=str, help="Hostname for the database") + parser.add_argument("port", type=str, help="Port number for the database") + parser.add_argument("dbname", type=str, help="Name of the database") + parser.add_argument("column_name", type=str, help="Name of the column to filter on") + parser.add_argument("column_value", type=str, help="Value of the column to filter on") + parser.add_argument("start_time", type=str, help="Start timestamp for the data range") + parser.add_argument("stop_time", type=str, help="Stop timestamp for the data range") + parser.add_argument("filename", type=str, help="Name of the output CSV file") + args = parser.parse_args() + return args + + +def validate_inputs(value, obj): + if value.strip(): + return value + else: + raise ValueError(f"{obj} value cannot be empty") + +def validate_timestamp(timestamp): + """Validates the timestamp format or checks for 'None'.""" + if timestamp != "None" and not re.fullmatch(TIMESTAMP_PATTERN, timestamp): + raise ValueError("Invalid timestamp format.
Use 'YYYY-MM-DD HH:MM:SS+TZ' format.") + return timestamp + +def validate_column_value(column_value): + """ + Validate the column value to prevent SQL injection. + Allows all characters for flexibility as per PostgreSQL text field constraints. + """ + if column_value: + return column_value + else: + raise ValueError("Invalid column value") def db_connect(): - ''' - This module creates Database Connection - ''' - conn = None - connection_string = f"postgres://{dbuser}:{dbpwd}@{dbhost}:{dbport}/{dbtelemetry}".format( - dbuser = dbuser, dbpwd = dbpwd, dbhost = dbhost, dbport = dbport, dbtelemetry = dbtelemetry) + """Creates a secure database connection.""" try: - conn = psycopg2.connect(connection_string) + conn = psycopg2.connect( + user=user, + password=password, + host=host, + port=port, + dbname=dbname + ) if conn is not None: conn.autocommit = True except Exception as ex: - sys.exit('Failed to connect to timescaledb') + sys.exit(f"Failed to connect to timescaledb: {ex}") return conn +def fetch_valid_columns(): + """Fetches valid column names from the database.""" + query = "SELECT column_name FROM information_schema.columns WHERE table_name='metrics'" + conn = db_connect() + try: + with conn.cursor() as cursor: + cursor.execute(query) + return [row[0] for row in cursor.fetchall()] + except Exception as ex: + raise ValueError(f"Failed to fetch valid columns from the database: {ex}") + finally: + # close the connection whether the query succeeded or failed + conn.close() + +def validate_column_name(column_name, valid_columns): + """Validates the column name against available columns in the database.""" + if column_name != "None" and column_name not in valid_columns: + raise ValueError(f"Invalid column name '{column_name}'. Available columns: {', '.join(valid_columns)}") + return column_name + def get_data_by_timerange_and_column(conn): + """ + Retrieves data from the database based on the given time range and column name/value.
+ """ + query = "SELECT * FROM omnia_telemetry.metrics WHERE true" + params = [] + + if start_time != "None" and stop_time != "None": + query += " AND time BETWEEN %s AND %s" + params.extend([start_time, stop_time]) + + if column_name != "None" and column_value != "None": + query += " AND %s = %s" + params.extend([AsIs(column_name), column_value]) + try: - # Query the database to retrieve the data based on the time range, column name, and column value - sql_query_case1 = ''' - SELECT * FROM omnia_telemetry.metrics - WHERE time BETWEEN %s AND %s AND %s = %s - ''' - sql_query_case2 = ''' - SELECT * FROM omnia_telemetry.metrics WHERE %s = %s - ''' - sql_query_case3 = ''' - SELECT * FROM omnia_telemetry.metrics - WHERE time BETWEEN %s AND %s - ''' - sql_query_case4 = ''' - SELECT * FROM omnia_telemetry.metrics - ''' - if column_name == 'None' and column_value == 'None' and start_time == 'None' and stop_time == 'None': - dataframe = pandas.read_sql(sql_query_case4, conn) - elif column_name == 'None' and column_value == 'None' and start_time != 'None' and stop_time != 'None': - dataframe = pandas.read_sql(sql_query_case3, conn, params=(start_time, stop_time)) - elif column_name != 'None' and column_value != 'None' and start_time == 'None' and stop_time == 'None': - dataframe = pandas.read_sql(sql_query_case2, conn, params=(AsIs(column_name), column_value)) - else: - print(start_time + stop_time) - dataframe = pandas.read_sql(sql_query_case1, conn, params= (start_time, stop_time, AsIs(column_name), column_value)) - return dataframe + return pd.read_sql(query, conn, params=params) except Exception as ex: - sys.exit('Failed to fetch data from timescaledb.'+ str(ex)) + sys.exit(f"Failed to fetch data from the database: {ex}") + + +args = parse_arguments() + +try: + user = validate_inputs(args.user, 'user') + password = validate_inputs(args.password, 'password') + host = validate_inputs(args.host, 'host') + port = validate_inputs(args.port, 'port') + dbname = validate_inputs(args.dbname, 'dbname') + + column_name = validate_column_name(args.column_name, fetch_valid_columns()) + column_value = validate_column_value(args.column_value) + start_time = validate_timestamp(args.start_time) + stop_time = validate_timestamp(args.stop_time) + filename = args.filename +except Exception as ex: + sys.exit(f"Failed to parse arguments: {ex}") def main(): ''' @@ -88,7 +152,7 @@ def main(): db_conn = db_connect() if db_conn is not None: dataframe = get_data_by_timerange_and_column(db_conn) - dataframe.to_csv(filename) + dataframe.to_csv(filename, index=False) db_conn.close() if __name__ == '__main__': diff --git a/utils/timescaledb_utility/roles/timescaledb_utility/tasks/initiate_timescaledb_python_utility.yml b/utils/timescaledb_utility/roles/timescaledb_utility/tasks/initiate_timescaledb_python_utility.yml index 76d3eeef1..93758898b 100644 --- a/utils/timescaledb_utility/roles/timescaledb_utility/tasks/initiate_timescaledb_python_utility.yml +++ b/utils/timescaledb_utility/roles/timescaledb_utility/tasks/initiate_timescaledb_python_utility.yml @@ -36,7 +36,7 @@ - name: Invoke python script ansible.builtin.command: | - {{ python_version }} {{ db_schema_utility }} {{ timescaledb_user }} {{ timescaledb_password }} - {{ timescale_svc_ip }} {{ timescale_svc_port.stdout }} {{ timescaledb_name }} - {{ metric_name }} {{ metric_value }} {{ start_timestamp | quote }} {{ stop_timestamp | quote }} {{ filename }} + {{ python_version }} {{ db_schema_utility }} {{ timescaledb_user }} {{ timescaledb_password }} + {{ 
timescale_svc_ip }} {{ timescale_svc_port.stdout }} {{ timescaledb_name }} + {{ metric_name }} {{ metric_value | quote }} {{ start_timestamp | quote }} {{ stop_timestamp | quote }} {{ filename | quote }} changed_when: false diff --git a/utils/timescaledb_utility/roles/timescaledb_utility/tasks/install_packages.yml b/utils/timescaledb_utility/roles/timescaledb_utility/tasks/install_packages.yml index 9afbbb2de..9b97db08c 100644 --- a/utils/timescaledb_utility/roles/timescaledb_utility/tasks/install_packages.yml +++ b/utils/timescaledb_utility/roles/timescaledb_utility/tasks/install_packages.yml @@ -12,8 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. --- +- name: Saving distribution and version of OS + ansible.builtin.set_fact: + oim_os: "{{ ansible_distribution | lower }}" + oim_os_version: "{{ ansible_distribution_version | lower }}" -- name: Install devel packages +- name: Install development packages + when: + - oim_os in oim_os_redhat or + oim_os in oim_os_rocky ansible.builtin.package: name: "{{ item }}" state: present diff --git a/utils/timescaledb_utility/roles/timescaledb_utility/vars/main.yml b/utils/timescaledb_utility/roles/timescaledb_utility/vars/main.yml index 8faa6e0c5..198e917a4 100644 --- a/utils/timescaledb_utility/roles/timescaledb_utility/vars/main.yml +++ b/utils/timescaledb_utility/roles/timescaledb_utility/vars/main.yml @@ -26,9 +26,13 @@ telemetry_vault_filename: "{{ role_path }}/../../../../input/.telemetry_vault_ke telemetry_config_syntax_fail_msg: "Failed.Syntax errors present in telemetry_config.yml.Fix errors and re-run playbook again." timescaledb_credentials_fail_msg: "Please provide timescaledb credentials in telemetry_config.yml" +python_version: "{{ ansible_python_interpreter.split('/')[-1] }}" # Usage: install_packages.yml +oim_os_redhat: "redhat" +oim_os_rocky: "rocky" + devel_packages: - - python39-devel + - "{{ python_version }}-devel" - postgresql-devel python_package: - psycopg2-binary @@ -40,5 +44,4 @@ timescaledb_service_failure_msg: "TimescaleDB is not running. Run telemetry.yml/ timescaledb_k8s_name: timescaledb namespace: telemetry-and-visualizations timescaledb_name: telemetry_metrics -python_version: python3.9 db_schema_utility: "{{ role_path }}/files/dump_data_from_db.py" diff --git a/utils/timescaledb_utility/timescaledb_utility_config.yml b/utils/timescaledb_utility/timescaledb_utility_config.yml index a017c290e..c3b1e05e7 100644 --- a/utils/timescaledb_utility/timescaledb_utility_config.yml +++ b/utils/timescaledb_utility/timescaledb_utility_config.yml @@ -30,4 +30,4 @@ stop_time: "" # File where data collected from timescaleDB should be dumped # Default value: "/root/telemetry_data.csv" -filename: "/root/telemetry_data.csv" \ No newline at end of file +filename: "/root/telemetry_data.csv" diff --git a/upgrade/roles/backup_omniadb/tasks/install_packages.yml b/utils/update_synclist.yml similarity index 57% rename from upgrade/roles/backup_omniadb/tasks/install_packages.yml rename to utils/update_synclist.yml index 28da7de10..193eb609b 100644 --- a/upgrade/roles/backup_omniadb/tasks/install_packages.yml +++ b/utils/update_synclist.yml @@ -12,21 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
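A note on the dump_data_from_db.py rewrite above: it passes the column *name* through psycopg2's AsIs while the column *value* stays an ordinary bound parameter. The sketch below illustrates why that split matters; it is not part of the patch, and the connection settings and the 'unit' column are assumed placeholders.

    # Minimal psycopg2 sketch: AsIs injects an identifier verbatim, while a plain
    # parameter is quoted as a SQL literal. Connection values and the 'unit'
    # column are illustrative assumptions, not taken from this patch.
    import psycopg2
    from psycopg2.extensions import AsIs

    conn = psycopg2.connect(user="postgres", password="secret",
                            host="localhost", port="5432", dbname="telemetry_metrics")
    with conn.cursor() as cur:
        # Renders as: SELECT * FROM omnia_telemetry.metrics WHERE unit = 'percent'
        # (identifier left unquoted, value safely quoted)
        cur.execute("SELECT * FROM omnia_telemetry.metrics WHERE %s = %s",
                    (AsIs("unit"), "percent"))
        rows = cur.fetchall()
    conn.close()

Because AsIs bypasses quoting entirely, the script only accepts column names that fetch_valid_columns() has confirmed against information_schema, which is what keeps the dynamic query safe.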
---
-- name: Install python3-devel
-  ansible.builtin.yum:
-    name: python39-devel
-    state: present
-# - name: Install psycopg2 package
-#   ansible.builtin.pip:
-#     name: psycopg2
-#     state: present
+- name: Check if virtual environment is active
+  ansible.builtin.import_playbook: check_venv.yml
+  when: not ( check_venv_executed | default(false) | bool )

-- name: Install psycopg2 module
-  ansible.builtin.command: "{{ python_version }} -m pip install psycopg2-binary"
-  changed_when: false
+- name: Update synclist on provisioned nodes
+  hosts: localhost
+  connection: local
+  pre_tasks:
+    - name: Fail if no inventory is provided
+      ansible.builtin.fail:
+        msg: "Inventory is not provided or is empty"
+      when:
+        - groups['all'] is not defined or (groups['all'] | length == 0)

-- name: Install postgresql package
-  community.general.ansible_galaxy_install:
-    type: collection
-    name: community.postgresql
+  roles:
+    - servicetag_host_mapping
+    - update_synclist
+    - inventory_tagging
diff --git a/utils/update_user_repo.yml b/utils/update_user_repo.yml
index ec73f4f8d..9ef708627 100644
--- a/utils/update_user_repo.yml
+++ b/utils/update_user_repo.yml
@@ -13,6 +13,20 @@
 # limitations under the License.
 ---
+- name: Check if virtual environment is active
+  ansible.builtin.import_playbook: check_venv.yml
+  when: not ( check_venv_executed | default(false) | bool )
+
+- name: Validate whether cross-os is present
+  hosts: localhost
+  connection: local
+  gather_facts: true
+  tasks:
+    - name: Validate cluster and Omnia Infrastructure Manager OS versions
+      ansible.builtin.include_role:
+        name: update_user_repo
+        tasks_from: check_os_versions.yml
+
 - name: Update repo and registry
   hosts: slurm_control_node, slurm_node, kube_control_plane, kube_node, auth_server, login, etcd
   roles:
diff --git a/utils/kernel_param_update/ansible.cfg b/utils/verify_intel_gaudi/ansible.cfg
similarity index 76%
rename from utils/kernel_param_update/ansible.cfg
rename to utils/verify_intel_gaudi/ansible.cfg
index 076548f60..a38bd642f 100644
--- a/utils/kernel_param_update/ansible.cfg
+++ b/utils/verify_intel_gaudi/ansible.cfg
@@ -1,9 +1,10 @@
 [defaults]
-log_path = /var/log/omnia/kernel_param_update.log
+log_path = /var/log/omnia/verify_intel_gaudi.log
 host_key_checking = false
 forks = 5
 timeout = 180
 executable = /bin/bash
+collections_path = $VIRTUAL_ENV
 display_skipped_hosts = false

 [persistent_connection]
diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hccl_gaudi2_validation.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hccl_gaudi2_validation.yml
new file mode 100644
index 000000000..22ba99605
--- /dev/null
+++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hccl_gaudi2_validation.yml
@@ -0,0 +1,169 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
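The hccl_demo task file introduced next gates each collective on bandwidth figures scraped out of stdout with regex_search. Re-expressed in plain Python, the gate for the Gaudi2 all_reduce case works roughly as below; the sample output line is an assumed format, not captured from real hardware.

    # Python equivalent of the Jinja2 failed_when expression used in the tasks below.
    # The stdout sample is illustrative, shown only to make the regex concrete.
    import re

    stdout = "[BENCHMARK] NW Bandwidth   : 131.25 GB/s"
    nw = float(re.search(r'.NW Bandwidth.*:(.*)GB\/s', stdout).group(1))
    if nw <= 127:
        raise AssertionError("all_reduce NW bandwidth below the Gaudi2 threshold")

Note that Jinja2's regex_search with a '\\1' replacement returns a list of captures, hence the '| first | float' chain in the tasks themselves.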
+--- + +- name: Checking for the right amount of HPU devices + ansible.builtin.shell: | + set -o pipefail + lspci | grep "{{ verify_intel_gaudi_device_pattern }}" | wc -l + args: + executable: /bin/bash + register: hpu_dev + failed_when: ( hpu_dev.stdout != "8" ) + changed_when: false + +- name: Setting python version used for the test runs + ansible.builtin.set_fact: + pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}" + +- name: Create test folder + ansible.builtin.tempfile: + state: directory + suffix: omnia_gaudi_hccl_test + register: test_folder + +- name: Include vars from local_repo_access.yml file + ansible.builtin.include_vars: "{{ local_repo_access_dest_path }}" + delegate_to: localhost + delegate_facts: true + +- name: Get hccl_demo git repo tarball + ansible.builtin.get_url: + url: "{{ hostvars['localhost']['offline_git_path'] }}/hccl_demo.tar.gz" + dest: "{{ test_folder.path }}" + mode: "{{ verify_intel_gaudi_habana_tests['targz_permission'] }}" + +- name: Untar hccl_demo git repo + ansible.builtin.unarchive: + src: "{{ test_folder.path }}/hccl_demo.tar.gz" + dest: "{{ test_folder.path }}" + remote_src: true + +- name: Set hccl_demo untar folder + ansible.builtin.set_fact: + hccl_demo_untar_folder: "{{ test_folder.path }}/hccl_demo" + +- name: Build hccl_demo + ansible.builtin.shell: | + set -o pipefail + make clean + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + changed_when: true + +- name: Run hccl_demo all_reduce single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test all_reduce --nranks 8 --loop 1000 --node_id 0 --size 256m --ranks_per_node 8 + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: hccl_demo_all_reduce_single_node_test_result + failed_when: > + (hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 127) or + (hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 73) + changed_when: true + +- name: Run hccl_demo all_gather single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + 
DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test all_gather --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8 + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: hccl_demo_all_gather_single_node_test_result + failed_when: > + (hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 127) or + (hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 18) + changed_when: true + +- name: Run hccl_demo reduce_scatter single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test reduce_scatter --nranks 8 --loop 1000 --node_id 0 --size 64m + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: hccl_demo_reduce_scatter_single_node_test_result + failed_when: > + (hccl_demo_reduce_scatter_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 126) or + (hccl_demo_reduce_scatter_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 142) + changed_when: true + +- name: Run hccl_demo all2all single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test all2all --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8 + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: 
hccl_demo_all2all_single_node_test_result + failed_when: > + (hccl_demo_all2all_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 126) or + (hccl_demo_all2all_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 142) + changed_when: true + +- name: Remove hccl_demo directory + ansible.builtin.file: + state: absent + path: "{{ test_folder.path }}" diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hccl_gaudi3_validation.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hccl_gaudi3_validation.yml new file mode 100644 index 000000000..64ffccd5e --- /dev/null +++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hccl_gaudi3_validation.yml @@ -0,0 +1,169 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Checking for the right amount of HPU devices + ansible.builtin.shell: | + set -o pipefail + lspci | grep "{{ verify_intel_gaudi_device_pattern }}" | wc -l + args: + executable: /bin/bash + register: hpu_dev + failed_when: ( hpu_dev.stdout != "8" ) + changed_when: false + +- name: Setting python version used for the test runs + ansible.builtin.set_fact: + pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}" + +- name: Create test folder + ansible.builtin.tempfile: + state: directory + suffix: omnia_gaudi_hccl_test + register: test_folder + +- name: Include vars from local_repo_access.yml file + ansible.builtin.include_vars: "{{ local_repo_access_dest_path }}" + delegate_to: localhost + delegate_facts: true + +- name: Get hccl_demo git repo tarball + ansible.builtin.get_url: + url: "{{ hostvars['localhost']['offline_git_path'] }}/hccl_demo.tar.gz" + dest: "{{ test_folder.path }}" + mode: "{{ verify_intel_gaudi_habana_tests['targz_permission'] }}" + +- name: Untar hccl_demo git repo + ansible.builtin.unarchive: + src: "{{ test_folder.path }}/hccl_demo.tar.gz" + dest: "{{ test_folder.path }}" + remote_src: true + +- name: Set hccl_demo untar folder + ansible.builtin.set_fact: + hccl_demo_untar_folder: "{{ test_folder.path }}/hccl_demo" + +- name: Build hccl_demo + ansible.builtin.shell: | + set -o pipefail + make clean + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + changed_when: true + +- name: Run hccl_demo all_reduce single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ 
verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test all_reduce --nranks 8 --loop 1000 --node_id 0 --size 256m --ranks_per_node 8 + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: hccl_demo_all_reduce_single_node_test_result + failed_when: > + (hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 255) or + (hccl_demo_all_reduce_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 146) + changed_when: true + +- name: Run hccl_demo all_gather single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test all_gather --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8 + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: hccl_demo_all_gather_single_node_test_result + failed_when: > + (hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 254) or + (hccl_demo_all_gather_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 36) + changed_when: true + +- name: Run hccl_demo reduce_scatter single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test reduce_scatter --nranks 8 --loop 1000 --node_id 0 --size 64m + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: hccl_demo_reduce_scatter_single_node_test_result + failed_when: > + (hccl_demo_reduce_scatter_single_node_test_result.stdout | 
regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 253) or + (hccl_demo_reduce_scatter_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 285) + changed_when: true + +- name: Run hccl_demo all2all single node test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HCCL_COMM_ID: "{{ verify_intel_gaudi_habana_extra['hccl_comm_id'] }}" + ansible.builtin.shell: | + set -o pipefail + python3 run_hccl_demo.py -clean --test all2all --nranks 8 --loop 1000 --node_id 0 --size 4m --ranks_per_node 8 + args: + executable: /bin/bash + chdir: "{{ hccl_demo_untar_folder }}" + register: hccl_demo_all2all_single_node_test_result + failed_when: > + (hccl_demo_all2all_single_node_test_result.stdout | regex_search('.NW Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 253) or + (hccl_demo_all2all_single_node_test_result.stdout | regex_search('.Algo Bandwidth.*:(.*)GB\/s','\\1') | first | float <= 285) + changed_when: true + +- name: Remove hccl_demo directory + ansible.builtin.file: + state: absent + path: "{{ test_folder.path }}" diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi2_validation.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi2_validation.yml new file mode 100644 index 000000000..ccbb8929a --- /dev/null +++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi2_validation.yml @@ -0,0 +1,287 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
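The hl_qual task file that follows repeats a near-identical environment: stanza on every test. If this ever gets refactored, Ansible allows the environment to be hoisted once onto a surrounding block so each test inherits it; a sketch of that pattern, with a hypothetical hlqual_env dictionary, is shown here.

    # Sketch only: 'hlqual_env' is a hypothetical fact that would collect the
    # repeated variables (HABANA_LOGS, GC_KERNEL_PATH, ...) in one place.
    - name: Run hl_qual Gaudi2 qualification suite
      environment: "{{ hlqual_env }}"
      block:
        - name: Run hl_qual hardware sanity check test
          ansible.builtin.shell: |
            set -o pipefail
            ./hl_qual -gaudi2 -c all -rmod parallel -f2 -l extreme -t 60 -dis_mon
          args:
            executable: /bin/bash
            chdir: /opt/habanalabs/qual/gaudi2/bin
          register: sanity_test_result
          failed_when: "'FAILED' in sanity_test_result.stdout"
          changed_when: true

The per-test env blocks below are kept as authored; the hoisted form is only a possible cleanup.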
+---
+
+- name: Setting python version used for the test runs
+  ansible.builtin.set_fact:
+    pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}"
+
+- name: Recursively set permissions on habana_logs directory
+  ansible.builtin.file:
+    path: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    state: directory
+    recurse: true
+    mode: "{{ verify_intel_gaudi_habana_tests['habana_logs_permission'] }}"
+
+- name: Run hl_qual hardware sanity check test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}"
+    HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}"
+    HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}"
+    DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -gaudi2 -c all -rmod parallel -f2 -l extreme -t 60 -dis_mon
+  args:
+    executable: /bin/bash
+    chdir: /opt/habanalabs/qual/gaudi2/bin
+  register: sanity_test_result
+  failed_when: "'FAILED' in sanity_test_result.stdout"
+  changed_when: true
+
+- name: Run hl_qual memory bandwidth test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -c all -rmod parallel -mb -memOnly -gaudi2 -dis_mon
+  args:
+    executable: /bin/bash
+    chdir: /opt/habanalabs/qual/gaudi2/bin
+  register: memory_bandwidth_test_result
+  failed_when: "'FAILED' in memory_bandwidth_test_result.stdout"
+  changed_when: true
+
+- name: Run hl_qual pci bandwidth test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -c all -rmod serial -mb -b -pciOnly -gaudi2 -dis_mon
+  args:
+    executable: /bin/bash
+    chdir: /opt/habanalabs/qual/gaudi2/bin
+  register: pci_bandwidth_test_result
+  failed_when: "'FAILED' in pci_bandwidth_test_result.stdout"
+  changed_when: true
+
+- name: Run hl_qual serdes base test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -gaudi2 -c all -rmod parallel -i 50 -nic_base -test_type pairs -dis_mon
+  args:
+    executable:
/bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: serdes_base_test_result + failed_when: "'FAILED' in serdes_base_test_result.stdout" + changed_when: true + +- name: Run hl_qual serdes base allreduce test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi2 -c all -rmod parallel -i 50 -ep 100 -nic_base -test_type allreduce -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: serdes_base_allreduce_test_result + failed_when: "'FAILED' in serdes_base_allreduce_test_result.stdout" + changed_when: true + +- name: Unload habanalabs kernel module + community.general.modprobe: + name: habanalabs + state: absent + +- name: Load habanalabs kernel module with timeout_locked param + community.general.modprobe: + name: habanalabs + state: present + params: 'timeout_locked=0' + +- name: Bring DOWN all Gaudi2 NICs + ansible.builtin.shell: | + set -o pipefail + ./manage_network_ifs.sh --down + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + changed_when: true + +- name: Bring UP all Gaudi2 NICs + ansible.builtin.shell: | + set -o pipefail + ./manage_network_ifs.sh --up + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + changed_when: true + +- name: Retry until HPUs NICs are ready + ansible.builtin.shell: | + set -o pipefail + ./manage_network_ifs.sh --status | grep down | wc -l + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: result + until: (result.stdout == "0") + retries: 5 + delay: 5 + changed_when: false + failed_when: false + +- name: Run hl_qual HBM DMA stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi2 -c all -rmod parallel -i 1 -hbm_dma_stress -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: hbm_dma_stress_test_result + failed_when: "'FAILED' in hbm_dma_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual HBM TPC stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + 
HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi2 -c all -rmod parallel -i 1 -hbm_tpc_stress -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: hbm_tpc_stress_test_result + failed_when: "'FAILED' in hbm_tpc_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual power stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi2 -c all -rmod parallel -s -t 120 -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: power_stress_test_result + failed_when: "'FAILED' in power_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual EDP stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi2 -c all -rmod parallel -t 40 -e -Tw 3 -Ts 1 -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: edp_stress_test_result + failed_when: "'FAILED' in edp_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual e2e concurrency test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ 
verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi2 -c all -rmod parallel -t 30 -dis_mon -e2e_concurrency -disable_ports 8,22,23 + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: e2e_concurrency_test_result + failed_when: "'FAILED' in e2e_concurrency_test_result.stdout" + changed_when: true + +- name: Run hl_qual SER test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi2 -c all -rmod parallel -dis_mon -ser + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi2/bin + register: ser_test_result + failed_when: "'FAILED' in ser_test_result.stdout" + changed_when: true diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi3_validation.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi3_validation.yml new file mode 100644 index 000000000..accf6a6c3 --- /dev/null +++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi3_validation.yml @@ -0,0 +1,287 @@ +# Copyright 2024 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
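The Gaudi3 qualification file that follows mirrors the Gaudi2 one line for line, with only the -gaudiN flag and the /opt/habanalabs/qual/<device>/bin path changing. A possible consolidation, sketched with a hypothetical merged task file and gaudi_version variable, would be:

    # Sketch only: 'hlqual_validation.yml' and 'gaudi_version' are hypothetical.
    - name: Verify hl-qual on Gaudi nodes
      ansible.builtin.include_tasks: hlqual_validation.yml
      vars:
        gaudi_version: "{{ 'gaudi3' if node_has_gaudi3 else 'gaudi2' }}"

    # Inside the merged file, each test would then use:
    #   ./hl_qual -{{ gaudi_version }} -c all ...
    #   chdir: /opt/habanalabs/qual/{{ gaudi_version }}/bin

The duplicated form below is retained as authored.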
+---
+
+- name: Setting python version used for the test runs
+  ansible.builtin.set_fact:
+    pver: "{{ '.'.join(ansible_python_version.split('.')[0:2]) }}"
+
+- name: Recursively set permissions on habana_logs directory
+  ansible.builtin.file:
+    path: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    state: directory
+    recurse: true
+    mode: "{{ verify_intel_gaudi_habana_tests['habana_logs_permission'] }}"
+
+- name: Run hl_qual hardware sanity check test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}"
+    HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}"
+    HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}"
+    DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -gaudi3 -c all -rmod parallel -f2 -l extreme -t 60 -dis_mon
+  args:
+    executable: /bin/bash
+    chdir: /opt/habanalabs/qual/gaudi3/bin
+  register: sanity_test_result
+  failed_when: "'FAILED' in sanity_test_result.stdout"
+  changed_when: true
+
+- name: Run hl_qual memory bandwidth test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -c all -rmod parallel -mb -memOnly -gaudi3 -dis_mon
+  args:
+    executable: /bin/bash
+    chdir: /opt/habanalabs/qual/gaudi3/bin
+  register: memory_bandwidth_test_result
+  failed_when: "'FAILED' in memory_bandwidth_test_result.stdout"
+  changed_when: true
+
+- name: Run hl_qual pci bandwidth test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -c all -rmod serial -mb -b -pciOnly -gaudi3 -dis_mon
+  args:
+    executable: /bin/bash
+    chdir: /opt/habanalabs/qual/gaudi3/bin
+  register: pci_bandwidth_test_result
+  failed_when: "'FAILED' in pci_bandwidth_test_result.stdout"
+  changed_when: true
+
+- name: Run hl_qual serdes base test
+  environment:
+    __python_cmd: "python{{ pver }}"
+    LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}"
+    ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}"
+    HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}"
+    HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    ./hl_qual -gaudi3 -c all -rmod parallel -i 50 -nic_base -test_type pairs -dis_mon
+  args:
+    executable:
/bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: serdes_base_test_result + failed_when: "'FAILED' in serdes_base_test_result.stdout" + changed_when: true + +- name: Run hl_qual serdes base allreduce test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi3 -c all -rmod parallel -i 50 -ep 100 -nic_base -test_type allreduce -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: serdes_base_allreduce_test_result + failed_when: "'FAILED' in serdes_base_allreduce_test_result.stdout" + changed_when: true + +- name: Unload habanalabs kernel module + community.general.modprobe: + name: habanalabs + state: absent + +- name: Load habanalabs kernel module with timeout_locked param + community.general.modprobe: + name: habanalabs + state: present + params: 'timeout_locked=0' + +- name: Bring DOWN all Gaudi3 NICs + ansible.builtin.shell: | + set -o pipefail + ./manage_network_ifs.sh --down + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + changed_when: true + +- name: Bring UP all Gaudi3 NICs + ansible.builtin.shell: | + set -o pipefail + ./manage_network_ifs.sh --up + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + changed_when: true + +- name: Retry until HPUs NICs are ready + ansible.builtin.shell: | + set -o pipefail + ./manage_network_ifs.sh --status | grep down | wc -l + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: result + until: (result.stdout == "0") + retries: 5 + delay: 5 + changed_when: false + failed_when: false + +- name: Run hl_qual HBM DMA stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi3 -c all -rmod parallel -i 1 -hbm_dma_stress -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: hbm_dma_stress_test_result + failed_when: "'FAILED' in hbm_dma_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual HBM TPC stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + 
HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi3 -c all -rmod parallel -i 1 -hbm_tpc_stress -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: hbm_tpc_stress_test_result + failed_when: "'FAILED' in hbm_tpc_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual power stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi3 -c all -rmod parallel -s -t 120 -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: power_stress_test_result + failed_when: "'FAILED' in power_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual EDP stress test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi3 -c all -rmod parallel -t 40 -e -Tw 3 -Ts 1 -dis_mon + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: edp_stress_test_result + failed_when: "'FAILED' in edp_stress_test_result.stdout" + changed_when: true + +- name: Run hl_qual e2e concurrency test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ 
verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi3 -c all -rmod parallel -t 30 -dis_mon -e2e_concurrency -disable_ports 8,22,23 + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: e2e_concurrency_test_result + failed_when: "'FAILED' in e2e_concurrency_test_result.stdout" + changed_when: true + +- name: Run hl_qual SER test + environment: + __python_cmd: "python{{ pver }}" + LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" + ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" + HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" + GC_KERNEL_PATH: "{{ verify_intel_gaudi_habana_tests['gc_kernel_path'] }}" + HABANA_SCAL_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habana_scal_bin_path'] }}" + HABANA_PLUGINS_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['habana_plugins_lib_path'] }}" + DATA_LOADER_AEON_LIB_PATH: "{{ verify_intel_gaudi_habana_tests['data_loader_aeon_lib_path'] }}" + RDMA_CORE_ROOT: "{{ verify_intel_gaudi_habana_tests['rdma_core_root'] }}" + RDMA_CORE_LIB: "{{ verify_intel_gaudi_habana_tests['rdma_core_lib'] }}" + HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" + ansible.builtin.shell: | + set -o pipefail + ./hl_qual -gaudi3 -c all -rmod parallel -dis_mon -ser + args: + executable: /bin/bash + chdir: /opt/habanalabs/qual/gaudi3/bin + register: ser_test_result + failed_when: "'FAILED' in ser_test_result.stdout" + changed_when: true diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/include_omnia_config.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/include_omnia_config.yml new file mode 100644 index 000000000..cab9377ab --- /dev/null +++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/include_omnia_config.yml @@ -0,0 +1,63 @@ +# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
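The include_omnia_config task file that follows round-trips the vault state of omnia_config.yml: it detects the vault header, decrypts so the vars can be included, and re-encrypts afterwards (generating a key file on first use). The manual equivalent, with illustrative paths, is:

    # '$ANSIBLE_VAULT;1.1;AES256' is the first line of a vault-encrypted file,
    # which is what the "'$ANSIBLE_VAULT;' in config_content.stdout" check matches.
    ansible-vault decrypt input/omnia_config.yml --vault-password-file input/.omnia_vault_key
    ansible-vault encrypt input/omnia_config.yml --vault-password-file input/.omnia_vault_key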
+---
+
+- name: Check if omnia config file is encrypted
+  ansible.builtin.command: cat {{ omnia_config_filename }}
+  changed_when: false
+  register: config_content
+  no_log: true
+
+- name: Decrypt omnia_config.yml
+  ansible.builtin.command: >-
+    ansible-vault decrypt {{ omnia_config_filename }}
+    --vault-password-file {{ omnia_vault_path }}
+  changed_when: false
+  when: "'$ANSIBLE_VAULT;' in config_content.stdout"
+
+- name: Include omnia_config.yml
+  block:
+    - name: Include omnia_config.yml
+      ansible.builtin.include_vars: "{{ omnia_config_filename }}"
+      register: include_omnia_config
+      no_log: true
+      tags: init
+  rescue:
+    - name: Failed to include omnia_config.yml
+      ansible.builtin.fail:
+        msg: "{{ omnia_config_syntax_fail_msg }} Error: {{ include_omnia_config.message }}"
+
+- name: Create ansible vault key
+  ansible.builtin.set_fact:
+    omnia_vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}"
+  when: ansible_vault_search_key not in config_content.stdout
+
+- name: Save vault key to omnia_vault_path
+  ansible.builtin.lineinfile:
+    path: "{{ omnia_vault_path }}"
+    line: "{{ omnia_vault_key }}"
+    mode: "{{ omnia_config_file_permission }}"
+    owner: root
+    create: true
+  when: ansible_vault_search_key not in config_content.stdout
+
+- name: Encrypt omnia config file
+  ansible.builtin.command: >-
+    ansible-vault encrypt {{ omnia_config_filename }} --vault-password-file {{ omnia_vault_path }}
+  changed_when: false
+
+- name: Update omnia_config.yml permission
+  ansible.builtin.file:
+    path: "{{ omnia_config_filename }}"
+    mode: "{{ omnia_config_file_permission }}"
diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/main.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/main.yml
new file mode 100644
index 000000000..88d34028d
--- /dev/null
+++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/main.yml
@@ -0,0 +1,70 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
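The role's main task file, next, decides Gaudi2 vs Gaudi3 purely from PCI IDs: lspci -n -d takes a vendor:device:class filter, so with the IDs defined in vars/main.yml the Gaudi3 probe effectively runs as below. The sample output line is illustrative, not captured from real hardware.

    # Resolved form of the detection command for Gaudi3 (vendor 1da3, device 1060,
    # class 1200); a non-empty stdout marks the node as having that accelerator.
    lspci -n -d 1da3:1060:1200
    # e.g. "4d:00.0 1200: 1da3:1060"  (one line per HPU)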
+---
+
+- name: Initialise node accelerator status
+  ansible.builtin.set_fact:
+    node_has_gaudi2: false
+    node_has_gaudi3: false
+
+- name: Check if the node has Gaudi3
+  ansible.builtin.shell: |
+    set -o pipefail
+    lspci -n -d {{ gaudi3_pci_vendor_device_class }}
+  register: lspci_output
+  changed_when: false
+  failed_when: false
+  args:
+    executable: /bin/bash
+
+- name: Update node accelerator status
+  ansible.builtin.set_fact:
+    node_has_gaudi3: true
+  when: lspci_output.stdout | length > 0
+
+- name: Check if the node has Gaudi2
+  ansible.builtin.shell: |
+    set -o pipefail
+    lspci -n -d {{ gaudi2_pci_vendor_device_class }}
+  register: lspci_output
+  changed_when: false
+  failed_when: false
+  args:
+    executable: /bin/bash
+  when: not node_has_gaudi3
+
+- name: Update node accelerator status
+  ansible.builtin.set_fact:
+    node_has_gaudi2: true
+  when: not node_has_gaudi3 and lspci_output.stdout | length > 0
+
+- name: Set run_intel_gaudi_tests parameter
+  ansible.builtin.set_fact:
+    run_intel_gaudi_tests: "{{ hostvars['localhost']['run_intel_gaudi_tests'] }}"
+
+- name: Verify hl-qual on Gaudi3 nodes
+  ansible.builtin.include_tasks: hlqual_gaudi3_validation.yml
+  when: node_has_gaudi3 and run_intel_gaudi_tests
+
+- name: Verify hl-qual on Gaudi2 nodes
+  ansible.builtin.include_tasks: hlqual_gaudi2_validation.yml
+  when: node_has_gaudi2 and run_intel_gaudi_tests
+
+- name: Verify hccl on Gaudi2 nodes
+  ansible.builtin.include_tasks: hccl_gaudi2_validation.yml
+  when: node_has_gaudi2 and run_intel_gaudi_tests
+
+- name: Verify hccl on Gaudi3 nodes
+  ansible.builtin.include_tasks: hccl_gaudi3_validation.yml
+  when: node_has_gaudi3 and run_intel_gaudi_tests
diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/vars/main.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/vars/main.yml
new file mode 100644
index 000000000..41a2cc2be
--- /dev/null
+++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/vars/main.yml
@@ -0,0 +1,43 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+local_repo_access_dest_path: "/opt/omnia/offline/local_repo_access.yml"
+
+omnia_config_filename: "{{ role_path }}/../../../../input/omnia_config.yml"
+omnia_vault_path: "{{ role_path }}/../../../../input/.omnia_vault_key"
+ansible_vault_search_key: "$ANSIBLE_VAULT;"
+omnia_config_syntax_fail_msg: "Failed. Syntax errors present in omnia_config.yml. Fix the errors and re-run the playbook."
+omnia_config_file_permission: '0644'
+
+verify_intel_gaudi_device_pattern: "Processing accelerators: Habana Labs Ltd."
diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/vars/main.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/vars/main.yml
new file mode 100644
index 000000000..41a2cc2be
--- /dev/null
+++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/vars/main.yml
@@ -0,0 +1,43 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+local_repo_access_dest_path: "/opt/omnia/offline/local_repo_access.yml"
+
+omnia_config_filename: "{{ role_path }}/../../../../input/omnia_config.yml"
+omnia_vault_path: "{{ role_path }}/../../../../input/.omnia_vault_key"
+ansible_vault_search_key: "$ANSIBLE_VAULT;"
+omnia_config_syntax_fail_msg: "Failed. Syntax errors present in omnia_config.yml. Fix the errors and re-run the playbook."
+omnia_config_file_permission: '0644'
+
+verify_intel_gaudi_device_pattern: "Processing accelerators: Habana Labs Ltd."
+gaudi2_pci_vendor_device_class: "1da3:1020:1200"
+gaudi3_pci_vendor_device_class: "1da3:1060:1200"
+
+verify_intel_gaudi_habana_tests:
+  log_level_all: "4"
+  enable_console: "true"
+  habana_logs: "/var/log/habana_logs"
+  habana_logs_permission: "0777"
+  gc_kernel_path: "/usr/lib/habanalabs/libtpc_kernels.so"
+  habana_scal_bin_path: "/opt/habanalabs/engines_fw"
+  habana_plugins_lib_path: "/opt/habanalabs/habana_plugins"
+  data_loader_aeon_lib_path: "/usr/lib/habanalabs/libaeon.so"
+  rdma_core_root: "/opt/habanalabs/rdma-core/src"
+  rdma_core_lib: "/opt/habanalabs/rdma-core/src/build/lib"
+  habanalabs_hlthunk_tests_bin_path: "/opt/habanalabs/src/hl-thunk/tests"
+  targz_permission: "644"
+
+verify_intel_gaudi_habana_extra:
+  hccl_comm_id: "127.0.0.1:5555"
diff --git a/utils/verify_intel_gaudi/verify_intel_gaudi_installation.yml b/utils/verify_intel_gaudi/verify_intel_gaudi_installation.yml
new file mode 100644
index 000000000..e34b8d5fc
--- /dev/null
+++ b/utils/verify_intel_gaudi/verify_intel_gaudi_installation.yml
@@ -0,0 +1,42 @@
+# Copyright 2024 Intel Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# This playbook needs to be called during the default Omnia provisioning
+# of newly installed Gaudi devices. It makes sure the devices function properly
+# and helps with early detection of missing HW/SW requirements.
+# It is recommended to run extensive tests when installing a new Gaudi node;
+# this can be controlled using the variable run_intel_gaudi_tests: true.
+
+- name: Check if virtual environment is active
+  ansible.builtin.import_playbook: ../check_venv.yml
+  when: not ( check_venv_executed | default(false) | bool )
+
+- name: Include omnia config file
+  hosts: localhost
+  connection: local
+  gather_facts: true
+  any_errors_fatal: true
+  tasks:
+    - name: Include omnia config file
+      ansible.builtin.include_role:
+        name: verify_intel_gaudi
+        tasks_from: include_omnia_config.yml
+
+- name: Verify Intel Gaudi installation
+  hosts: kube_node
+  gather_facts: true
+  any_errors_fatal: true
+  roles:
+    - verify_intel_gaudi
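A sketch of a typical invocation of this playbook from the utils/verify_intel_gaudi directory (the inventory path is deployment-specific; run_intel_gaudi_tests is normally set in input/omnia_config.yml, which the first play loads, but can also be passed as an extra var):

    cd utils/verify_intel_gaudi
    ansible-playbook verify_intel_gaudi_installation.yml -i <inventory> -e run_intel_gaudi_tests=true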
diff --git a/utils/vllm_build/ansible.cfg b/utils/vllm_build/ansible.cfg
index eedfccc6d..d742f55f7 100644
--- a/utils/vllm_build/ansible.cfg
+++ b/utils/vllm_build/ansible.cfg
@@ -4,6 +4,7 @@ host_key_checking = false
 forks = 5
 timeout = 180
 executable = /bin/bash
+collections_path = $VIRTUAL_ENV

 [persistent_connection]
 command_timeout = 180
@@ -11,4 +12,4 @@ connect_timeout = 180

 [ssh_connection]
 retries = 3
-ssh_args = -o ControlMaster=auto -o ControlPersist=180
\ No newline at end of file
+ssh_args = -o ControlMaster=auto -o ControlPersist=180
diff --git a/utils/vllm_build/roles/vllm_build/tasks/configure_docker_proxy.yml b/utils/vllm_build/roles/vllm_build/tasks/configure_docker_proxy.yml
new file mode 100644
index 000000000..deda42816
--- /dev/null
+++ b/utils/vllm_build/roles/vllm_build/tasks/configure_docker_proxy.yml
@@ -0,0 +1,52 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Create docker service directory
+  ansible.builtin.file:
+    path: "{{ docker_service_dest }}"
+    state: directory
+    mode: "{{ dir_mode }}"
+
+- name: Copy http-proxy.conf to docker service directory
+  ansible.builtin.template:
+    src: "{{ docker_http_proxy_conf_src }}"
+    dest: "{{ docker_service_dest }}/http-proxy.conf"
+    mode: "{{ file_mode }}"
+
+- name: Create .docker directory if it doesn't exist
+  ansible.builtin.file:
+    path: "{{ docker_auth_folder }}"
+    state: directory
+    mode: "{{ docker_dir_mode }}"
+
+- name: Copy docker config.json
+  ansible.builtin.template:
+    src: "{{ docker_config_src }}"
+    dest: "{{ docker_config_dest }}"
+    mode: "{{ docker_file_mode }}"
+
+- name: Reload systemd daemon
+  ansible.builtin.systemd:
+    daemon_reload: true
+
+- name: Restart docker service
+  ansible.builtin.service:
+    name: docker
+    state: restarted
+    enabled: true
+  register: docker_result
+  until: docker_result is succeeded
+  retries: "{{ package_retry }}"
+  delay: "{{ delay_time }}"
diff --git a/utils/vllm_build/roles/vllm_build/tasks/docker_login.yml b/utils/vllm_build/roles/vllm_build/tasks/docker_login.yml
new file mode 100644
index 000000000..fc448b76b
--- /dev/null
+++ b/utils/vllm_build/roles/vllm_build/tasks/docker_login.yml
@@ -0,0 +1,29 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Docker login
+  ansible.builtin.command: nerdctl login -u {{ docker_username }} -p {{ docker_password }}
+  changed_when: true
+  register: docker_login_output
+  retries: "{{ image_retries }}"
+  delay: "{{ delay_time }}"
+  until: docker_login_output.rc == 0
+  failed_when: false
+  no_log: true
+
+- name: Docker login check
+  ansible.builtin.fail:
+    msg: "{{ docker_login_fail_msg }} Error: {{ docker_login_output.stderr }}"
+  when: docker_login_output.rc != 0
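After configure_docker_proxy.yml has run, the systemd drop-in can be sanity-checked on the node; a minimal check, assuming docker is systemd-managed:

    systemctl show docker --property Environment
    # should list the HTTP_PROXY/HTTPS_PROXY/NO_PROXY values rendered into http-proxy.conf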
diff --git a/utils/vllm_build/roles/vllm_build/tasks/enable_buildkit.yml b/utils/vllm_build/roles/vllm_build/tasks/enable_buildkit.yml
index 392511da2..35c2ecedb 100644
--- a/utils/vllm_build/roles/vllm_build/tasks/enable_buildkit.yml
+++ b/utils/vllm_build/roles/vllm_build/tasks/enable_buildkit.yml
@@ -13,57 +13,71 @@
 # limitations under the License.
 ---

-- name: Create buildkit directory
-  ansible.builtin.file:
-    path: "{{ buildkit_dir }}"
-    state: directory
-    mode: "{{ dir_mode }}"
+- name: Check if buildkitd container exists
+  ansible.builtin.command: nerdctl ps -f name=buildkitd
+  register: buildkit_container_check
+  changed_when: false
+  failed_when: false

-- name: Clone buildkit repo
-  ansible.builtin.git:
-    repo: "{{ builldkit_repo }}"
-    dest: "{{ buildkit_dir }}"
-    single_branch: true
-    version: "{{ buildkit_version }}"
+- name: Enable buildkit
+  when: "'buildkit' not in buildkit_container_check.stdout"
+  environment:
+    http_proxy: "{{ proxy[0].http_proxy | default('', true) }}"
+    https_proxy: "{{ proxy[0].https_proxy | default('', true) }}"
+    HTTP_PROXY: "{{ proxy[0].http_proxy | default('', true) }}"
+    HTTPS_PROXY: "{{ proxy[0].https_proxy | default('', true) }}"
+  block:
+    - name: Create buildkit directory
+      ansible.builtin.file:
+        path: "{{ buildkit_dir }}"
+        state: directory
+        mode: "{{ dir_mode }}"

-- name: Execute make install
-  ansible.builtin.shell:
-    chdir: "{{ buildkit_dir }}"
-    cmd: "make && make install"
-  register: images_output
-  until: images_output is success
-  retries: "{{ image_retries }}"
-  delay: "{{ buildkit_delay }}"
-  changed_when: false
+    - name: Clone buildkit repo
+      ansible.builtin.git:
+        repo: "{{ builldkit_repo }}"
+        dest: "{{ buildkit_dir }}"
+        single_branch: true
+        version: "{{ buildkit_version }}"

-- name: Execute make images
-  ansible.builtin.shell:
-    chdir: "{{ buildkit_dir }}"
-    cmd: "make && make install"
-  register: install_output
-  until: install_output is success
-  retries: "{{ image_retries }}"
-  delay: "{{ buildkit_delay }}"
-  changed_when: false
+    - name: Execute make install
+      ansible.builtin.shell:
+        chdir: "{{ buildkit_dir }}"
+        cmd: "make && make install"
+      register: install_output
+      until: install_output is success
+      retries: "{{ image_retries }}"
+      delay: "{{ delay_time }}"
+      changed_when: false

-- name: Get container ID for buildkitd
-  ansible.builtin.shell: >
-    set -o pipefail
-    && nerdctl ps -q -a -f name=buildkitd | head -n 1
-  register: buildkit_container_id
-  changed_when: false
+    - name: Execute make images
+      ansible.builtin.shell:
+        chdir: "{{ buildkit_dir }}"
+        cmd: "make images"
+      register: images_output
+      until: images_output is success
+      retries: "{{ image_retries }}"
+      delay: "{{ delay_time }}"
+      changed_when: false

-- name: Stop buildkitd container
-  ansible.builtin.command: nerdctl stop {{ buildkit_container_id.stdout }}
-  when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != ''
-  changed_when: false
+    - name: Get container ID for buildkitd
+      ansible.builtin.shell: >
+        set -o pipefail
+        && nerdctl ps -q -a -f name=buildkitd | head -n 1
+      register: buildkit_container_id
+      changed_when: false

-- name: Remove buildkitd container
-  ansible.builtin.command: nerdctl rm {{ buildkit_container_id.stdout }}
-  when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != ''
-  changed_when: false
+    - name: Stop buildkitd container
+      ansible.builtin.command: nerdctl stop {{ buildkit_container_id.stdout }}
+      when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != ''
+      changed_when: false
+
+    - name: Remove buildkitd container
+      ansible.builtin.command: nerdctl rm {{ buildkit_container_id.stdout }}
+      when: buildkit_container_id.stdout is defined and buildkit_container_id.stdout != ''
+      changed_when: false

-- name: Run BuildKit container
-  ansible.builtin.command: >
-    nerdctl run -d --name buildkitd --privileged
-    moby/buildkit:latest
-  changed_when: true
+    - name: Run BuildKit container
+      ansible.builtin.command: >
+        nerdctl run -d -e http_proxy -e HTTP_PROXY -e https_proxy -e HTTPS_PROXY -e no_proxy --name buildkitd --privileged moby/buildkit:latest
+      changed_when: true
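With the buildkitd container up, builds are pointed at it through BUILDKIT_HOST, exactly as vllm_source_build.yml does later in this patch. A quick manual smoke test (the image tag is arbitrary):

    nerdctl ps -f name=buildkitd   # confirm the daemon container is running
    BUILDKIT_HOST=nerdctl-container://buildkitd nerdctl build -t buildkit-smoke-test .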
diff --git a/utils/vllm_build/roles/vllm_build/tasks/fetch_oim_details.yml b/utils/vllm_build/roles/vllm_build/tasks/fetch_oim_details.yml
new file mode 100644
index 000000000..0bc9b592c
--- /dev/null
+++ b/utils/vllm_build/roles/vllm_build/tasks/fetch_oim_details.yml
@@ -0,0 +1,35 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Gather all IP addresses
+  ansible.builtin.command: ip -4 addr show
+  register: ip_output
+  changed_when: false
+
+- name: Read Omnia Infrastructure Manager hostname
+  ansible.builtin.command: hostname
+  changed_when: false
+  register: hostname_output
+
+- name: Read Omnia Infrastructure Manager domain name
+  ansible.builtin.command: hostname -d
+  changed_when: false
+  register: domain_name_output
+
+- name: Set Omnia Infrastructure Manager details
+  ansible.builtin.set_fact:
+    oim_hostname: "{{ hostname_output.stdout }}"
+    oim_domain_name: "{{ domain_name_output.stdout }}"
+    oim_ip_addresses: "{{ ip_output.stdout | regex_findall('inet\\s([0-9.]+)') }}"
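The regex_findall filter above pulls every IPv4 address out of the raw `ip -4 addr show` output, one capture group per inet line. A self-contained sketch with hypothetical sample data:

    - name: Illustrate IPv4 extraction (example data only)
      ansible.builtin.set_fact:
        example_ips: "{{ 'inet 10.5.0.1/16 inet 127.0.0.1/8' | regex_findall('inet\\s([0-9.]+)') }}"
      # example_ips -> ['10.5.0.1', '127.0.0.1']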
diff --git a/utils/vllm_build/roles/vllm_build/tasks/main.yml b/utils/vllm_build/roles/vllm_build/tasks/main.yml
index 2f8bd916d..4a0d278a2 100644
--- a/utils/vllm_build/roles/vllm_build/tasks/main.yml
+++ b/utils/vllm_build/roles/vllm_build/tasks/main.yml
@@ -13,6 +13,19 @@
 # limitations under the License.
 ---

+- name: Set oim_os
+  ansible.builtin.set_fact:
+    oim_os: "{{ ansible_distribution | lower }}"
+
+- name: Fetch oim details
+  ansible.builtin.include_tasks: fetch_oim_details.yml
+
+- name: Validate site_config.yml
+  ansible.builtin.include_tasks: validate_site_config.yml
+
+- name: Validate provision_config_credentials.yml
+  ansible.builtin.include_tasks: validate_provision_config_credentials.yml
+
 - name: Run prerequisite
   ansible.builtin.include_tasks: prereq_{{ ansible_distribution | lower }}.yml
diff --git a/utils/vllm_build/roles/vllm_build/tasks/prereq_redhat.yml b/utils/vllm_build/roles/vllm_build/tasks/prereq_redhat.yml
index 3a185b3b1..b1c9addf9 100644
--- a/utils/vllm_build/roles/vllm_build/tasks/prereq_redhat.yml
+++ b/utils/vllm_build/roles/vllm_build/tasks/prereq_redhat.yml
@@ -28,4 +28,12 @@
   register: docker_result
   until: docker_result is succeeded
   retries: "{{ package_retry }}"
-  delay: "{{ buildkit_delay }}"
+  delay: "{{ delay_time }}"
+
+- name: Configure proxy environment variables for docker
+  ansible.builtin.include_tasks: configure_docker_proxy.yml
+  when: proxy_status
+
+- name: Login to docker when credentials are given
+  ansible.builtin.include_tasks: docker_login.yml
+  when: docker_login
diff --git a/utils/vllm_build/roles/vllm_build/tasks/prereq_ubuntu.yml b/utils/vllm_build/roles/vllm_build/tasks/prereq_ubuntu.yml
index acb92258d..6d62ad97d 100644
--- a/utils/vllm_build/roles/vllm_build/tasks/prereq_ubuntu.yml
+++ b/utils/vllm_build/roles/vllm_build/tasks/prereq_ubuntu.yml
@@ -31,9 +31,19 @@
   ansible.builtin.set_fact:
     os_release: "{{ ansible_distribution_release }}"

-- name: Clean apt cache
-  ansible.builtin.apt:
-    autoclean: true
+- name: Try cleaning apt cache in Ubuntu
+  block:
+    - name: Clean apt cache
+      ansible.builtin.apt:
+        autoclean: true
+      register: clean_apt_cache
+      until: clean_apt_cache is not failed
+      retries: "{{ repo_retries }}"
+      delay: "{{ repo_delay }}"
+  rescue:
+    - name: Failed to clean apt cache
+      ansible.builtin.fail:
+        msg: "{{ clean_apt_cache_fail_msg }}"

 - name: Configure Docker Repository
   ansible.builtin.template:
@@ -48,9 +58,19 @@
     mode: "{{ file_mode }}"
   changed_when: false

-- name: Update apt cache
-  ansible.builtin.apt:
-    update_cache: true
+- name: Try updating repos in Ubuntu
+  block:
+    - name: Update apt cache
+      ansible.builtin.apt:
+        update_cache: true
+      register: update_repos
+      until: update_repos is not failed
+      retries: "{{ repo_retries }}"
+      delay: "{{ repo_delay }}"
+  rescue:
+    - name: Failed to update repos
+      ansible.builtin.fail:
+        msg: "{{ docker_update_repos_fail_msg }}"

 - name: Get epoch number of docker-ce/docker-ce-cli
   ansible.builtin.shell: "set -o pipefail | apt-cache show docker-ce | grep 'Version: 5:24.0.4' | awk '{print $2}'"
@@ -74,4 +94,12 @@
   register: docker_result
   until: docker_result is succeeded
   retries: "{{ package_retry }}"
-  delay: "{{ buildkit_delay }}"
+  delay: "{{ delay_time }}"
+
+- name: Configure proxy environment variables for docker
+  ansible.builtin.include_tasks: configure_docker_proxy.yml
+  when: proxy_status
+
+- name: Login to docker when credentials are given
+  ansible.builtin.include_tasks: docker_login.yml
+  when: docker_login
diff --git a/utils/vllm_build/roles/vllm_build/tasks/validate_provision_config_credentials.yml b/utils/vllm_build/roles/vllm_build/tasks/validate_provision_config_credentials.yml
new file mode 100644
index 000000000..ebde47525
--- /dev/null
+++ b/utils/vllm_build/roles/vllm_build/tasks/validate_provision_config_credentials.yml
@@ -0,0 +1,84 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Check if provision_config_credentials.yml file is encrypted
+  ansible.builtin.command: cat {{ provision_config_credentials_filename }}
+  changed_when: false
+  register: provision_config_content
+  no_log: true
+
+- name: Decrypt provision_config_credentials.yml
+  ansible.builtin.command: >-
+    ansible-vault decrypt {{ provision_config_credentials_filename }}
+    --vault-password-file {{ provision_credentials_vault_path }}
+  changed_when: false
+  when: ansible_vault_search_key in provision_config_content.stdout
+
+- name: Include provision_config_credentials.yml
+  block:
+    - name: Include provision_config_credentials.yml
+      ansible.builtin.include_vars: "{{ provision_config_credentials_filename }}"
+      register: include_provision_config
+      no_log: true
+  rescue:
+    - name: Failed to include provision_config_credentials.yml
+      ansible.builtin.fail:
+        msg: "{{ provision_config_credentials_syntax_fail_msg }} Error: {{ include_provision_config.message }}"
+
+- name: Create ansible vault key
+  ansible.builtin.set_fact:
+    provision_vault_key: "{{ lookup('password', '/dev/null chars=ascii_letters') }}"
+  when: ansible_vault_search_key not in provision_config_content.stdout
+
+- name: Save vault key to provision_credentials_vault_path
+  ansible.builtin.lineinfile:
+    path: "{{ provision_credentials_vault_path }}"
+    line: "{{ provision_vault_key }}"
+    mode: "{{ conf_file_mode }}"
+    owner: root
+    create: true
+  when: ansible_vault_search_key not in provision_config_content.stdout
+
+- name: Set default docker_login value
+  ansible.builtin.set_fact:
+    docker_login: false
+
+- name: Assert docker_username and docker_password
+  ansible.builtin.assert:
+    that: docker_password | length > 1
+    fail_msg: "{{ docker_password_fail_msg }}"
+  when: docker_username | length > 1
+
+- name: Set docker_login to true
+  ansible.builtin.set_fact:
+    docker_login: true
+  when: docker_username | length > 1
+
+- name: Warning - docker_username and docker_password credentials not provided
+  ansible.builtin.pause:
+    seconds: "{{ warning_wait_time }}"
+    prompt: "{{ warning_msg_docker_username_password_incomplete }}"
+  when: docker_login is false
+
+- name: Encrypt provision_config_credentials.yml
+  ansible.builtin.command: >-
+    ansible-vault encrypt {{ provision_config_credentials_filename }}
+    --vault-password-file {{ provision_credentials_vault_path }}
+  changed_when: false
+
+- name: Update provision_config_credentials.yml permission
+  ansible.builtin.file:
+    path: "{{ provision_config_credentials_filename }}"
+    mode: "{{ conf_file_mode }}"
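For reference, the gate above reads two optional fields from input/provision_config_credentials.yml; a registry login is attempted only when both are present (field names per this file, values illustrative):

    # input/provision_config_credentials.yml (excerpt)
    docker_username: "registry_user"    # optional; if set, docker_password becomes mandatory
    docker_password: "registry_token"   # checked by the assert above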
diff --git a/utils/vllm_build/roles/vllm_build/tasks/validate_site_config.yml b/utils/vllm_build/roles/vllm_build/tasks/validate_site_config.yml
new file mode 100644
index 000000000..236f82263
--- /dev/null
+++ b/utils/vllm_build/roles/vllm_build/tasks/validate_site_config.yml
@@ -0,0 +1,104 @@
+# Copyright 2024 Dell Inc. or its subsidiaries. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+- name: Initialize variables
+  ansible.builtin.set_fact:
+    http_proxy_input_status: false
+    https_proxy_input_status: false
+    no_proxy_input_status: false
+    proxy_status: false
+
+- name: Include site_config.yml
+  ansible.builtin.include_vars: "{{ site_config_file }}"
+
+- name: Validate http_proxy variable provided
+  ansible.builtin.set_fact:
+    http_proxy_input_status: true
+  when:
+    - proxy[0].http_proxy is defined
+    - proxy[0].http_proxy | default("", true) | length > 1
+
+- name: Validate https_proxy variable provided
+  ansible.builtin.set_fact:
+    https_proxy_input_status: true
+  when:
+    - proxy[0].https_proxy is defined
+    - proxy[0].https_proxy | default("", true) | length > 1
+
+- name: Validate no_proxy variable provided
+  ansible.builtin.set_fact:
+    no_proxy_input_status: true
+  when:
+    - proxy[0].no_proxy is defined
+    - proxy[0].no_proxy | default("", true) | length > 1
+
+- name: Validate both http_proxy and https_proxy input provided
+  ansible.builtin.fail:
+    msg: "{{ invalid_proxy_failure_msg }}"
+  when:
+    - (not https_proxy_input_status and http_proxy_input_status) or
+      (not http_proxy_input_status and https_proxy_input_status)
+
+- name: Validate proxy
+  when:
+    - http_proxy_input_status
+    - https_proxy_input_status
+  block:
+    - name: Validate http_proxy, https_proxy and no_proxy configured as environment variables
+      ansible.builtin.assert:
+        that:
+          - lookup('env', 'http_proxy') | length > 1
+          - lookup('env', 'https_proxy') | length > 1
+          - lookup('env', 'no_proxy') | length > 1
+          - lookup('env', 'http_proxy') == proxy[0].http_proxy
+          - lookup('env', 'https_proxy') == proxy[0].https_proxy
+          - oim_hostname in lookup('env', 'no_proxy')
+        fail_msg: "{{ proxy_env_fail_msg }}"
+
+    - name: Try updating repos in Ubuntu
+      when: oim_os in oim_os_ubuntu
+      block:
+        - name: Update repos in Ubuntu
+          ansible.builtin.apt:
+            update_cache: true
+          register: update_repos
+          until: update_repos is not failed
+          retries: "{{ repo_retries }}"
+          delay: "{{ repo_delay }}"
+      rescue:
+        - name: Failed to update repos
+          ansible.builtin.fail:
+            msg: "{{ update_repos_fail_msg }}"
+
+    - name: Try updating repos in RHEL/Rocky
+      when:
+        - oim_os in oim_os_redhat or
+          oim_os in oim_os_rocky
+      block:
+        - name: Update repos in RHEL/Rocky
+          ansible.builtin.dnf:
+            update_cache: true
+          register: update_repos
+          until: update_repos is not failed
+          retries: "{{ repo_retries }}"
+          delay: "{{ repo_delay }}"
+      rescue:
+        - name: Failed to update repos
+          ansible.builtin.fail:
+            msg: "{{ update_repos_fail_msg }}"
+
+    - name: Set proxy_status to true
+      ansible.builtin.set_fact:
+        proxy_status: true
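The assertions above require the same proxy settings to already be exported in the Omnia Infrastructure Manager shell before the playbook runs; a sketch of the expected environment, with placeholder endpoints:

    export http_proxy=http://<proxy-host>:3128
    export https_proxy=http://<proxy-host>:3128
    export no_proxy=localhost,<oim-hostname>,<admin-network-ip>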
diff --git a/utils/vllm_build/roles/vllm_build/tasks/vllm_local_repo_push.yml b/utils/vllm_build/roles/vllm_build/tasks/vllm_local_repo_push.yml
index ec453ccd5..9c80e0ae7 100644
--- a/utils/vllm_build/roles/vllm_build/tasks/vllm_local_repo_push.yml
+++ b/utils/vllm_build/roles/vllm_build/tasks/vllm_local_repo_push.yml
@@ -13,7 +13,7 @@
 # limitations under the License.
 ---

-- name: Read hostname of control plane
+- name: Read hostname of Omnia Infrastructure Manager
   ansible.builtin.command: hostname
   changed_when: false
   register: hostname_result
diff --git a/utils/vllm_build/roles/vllm_build/tasks/vllm_source_build.yml b/utils/vllm_build/roles/vllm_build/tasks/vllm_source_build.yml
index b8086e7df..afee370da 100644
--- a/utils/vllm_build/roles/vllm_build/tasks/vllm_source_build.yml
+++ b/utils/vllm_build/roles/vllm_build/tasks/vllm_source_build.yml
@@ -31,9 +31,13 @@
 - name: Build vLLM latest container
   environment:
     BUILDKIT_HOST: "nerdctl-container://buildkitd"
-  ansible.builtin.command: nerdctl build -f "{{ vllm_dir }}/Dockerfile.rocm" -t vllm-rocm:latest "{{ vllm_dir }}"
+    http_proxy: "{{ proxy[0].http_proxy | default('', true) }}"
+    https_proxy: "{{ proxy[0].https_proxy | default('', true) }}"
+    HTTP_PROXY: "{{ proxy[0].http_proxy | default('', true) }}"
+    HTTPS_PROXY: "{{ proxy[0].https_proxy | default('', true) }}"
+  ansible.builtin.command: "nerdctl build -f {{ vllm_dir }}/Dockerfile.rocm --build-arg HTTP_PROXY={{ proxy[0].http_proxy | default('', true) }} --build-arg HTTPS_PROXY={{ proxy[0].https_proxy | default('', true) }} -t vllm-rocm:latest {{ vllm_dir }}" # noqa: yaml[line-length]
   register: container_result
   until: container_result is succeeded
   retries: "{{ package_retry }}"
-  delay: "{{ buildkit_delay }}"
+  delay: "{{ delay_time }}"
   changed_when: false
diff --git a/utils/vllm_build/roles/vllm_build/templates/docker_http_proxy_conf.j2 b/utils/vllm_build/roles/vllm_build/templates/docker_http_proxy_conf.j2
new file mode 100644
index 000000000..0e5392d68
--- /dev/null
+++ b/utils/vllm_build/roles/vllm_build/templates/docker_http_proxy_conf.j2
@@ -0,0 +1,4 @@
+[Service]
+Environment="HTTP_PROXY={{ proxy[0].http_proxy | default('', true) }}"
+Environment="HTTPS_PROXY={{ proxy[0].https_proxy | default('', true) }}"
+Environment="NO_PROXY=localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }}{% if no_proxy_input_status %},{{ proxy[0].no_proxy }}{% endif %}"
diff --git a/utils/vllm_build/roles/vllm_build/templates/docker_json.j2 b/utils/vllm_build/roles/vllm_build/templates/docker_json.j2
new file mode 100644
index 000000000..cb28fec4c
--- /dev/null
+++ b/utils/vllm_build/roles/vllm_build/templates/docker_json.j2
@@ -0,0 +1,9 @@
+{
+  "proxies": {
+    "default": {
+      "httpProxy": "{{ proxy[0].http_proxy | default('', true) }}",
+      "httpsProxy": "{{ proxy[0].https_proxy | default('', true) }}",
+      "noProxy": "localhost,{{ oim_hostname }},*.{{ oim_domain_name }},{{ oim_ip_addresses | join(',') }}{% if no_proxy_input_status %},{{ proxy[0].no_proxy }}{% endif %}"
+    }
+  }
+}
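For illustration, with a proxy of http://proxy.example.com:3128, an OIM host named oim01 in domain example.com, and one admin-network IP, the docker_json.j2 template above would render roughly as follows (all values hypothetical):

    {
      "proxies": {
        "default": {
          "httpProxy": "http://proxy.example.com:3128",
          "httpsProxy": "http://proxy.example.com:3128",
          "noProxy": "localhost,oim01,*.example.com,10.5.255.254"
        }
      }
    }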
diff --git a/utils/vllm_build/roles/vllm_build/vars/main.yml b/utils/vllm_build/roles/vllm_build/vars/main.yml
index f2b6f2845..4cc35af2f 100644
--- a/utils/vllm_build/roles/vllm_build/vars/main.yml
+++ b/utils/vllm_build/roles/vllm_build/vars/main.yml
@@ -13,6 +13,11 @@
 # limitations under the License.
 ---

+# Usage: main.yml
+local_repo_access_path: "/opt/omnia/offline/local_repo_access.yml"
+include_local_repo_access_msg: "Failed to include {{ local_repo_access_path }}.
+Please run discovery_provision.yml before running the playbook vllm_build.yml."
+
 # Usage prereq.yml
 sources_list_dest: /etc/apt/sources.list.d
 docker_repo_temp: templates/docker_repo.j2
@@ -36,6 +41,9 @@ cert_packages:
 gpg_path: /etc/apt/keyrings/docker.asc
 docker_gpg_url: https://download.docker.com/linux/ubuntu/gpg
 package_retry: 10
+docker_update_repos_fail_msg: "Failed to update the docker repositories. Please ensure that the docker repositories are accessible
+from the Omnia Infrastructure Manager and re-run the playbook."
+clean_apt_cache_fail_msg: "Failed to clean the apt cache. Please ensure there are no lock files present and try running the playbook again."

 # Usage: main.yml
 vllm_version: "v0.3.2"
@@ -49,7 +57,44 @@ buildkit_version: "v0.13.1"
 builldkit_repo: "https://github.com/moby/buildkit.git"
 buildkit_dir: "/buildkit"
 image_retries: 5
-buildkit_delay: 10
+delay_time: 10

 # Usage: vllm_local_repo_push.yml
 nerdctl_registry_port: "5001"
+
+# Usage: configure_docker_proxy.yml
+docker_file_mode: "0600"
+docker_config_src: "{{ role_path }}/templates/docker_json.j2"
+docker_config_dest: "/root/.docker/config.json"
+docker_service_dest: "/etc/systemd/system/docker.service.d"
+docker_http_proxy_conf_src: "{{ role_path }}/templates/docker_http_proxy_conf.j2"
+docker_auth_folder: "/root/.docker/"
+docker_dir_mode: "700"
+
+# Usage: docker_login.yml
+docker_login_fail_msg: "Docker login failed. Please ensure the docker login credentials in the input/provision_config_credentials.yml are valid.
+If they are, this error can occur due to a pull limit issue or multiple requests. Please try running the playbook again after waiting for a while."
+
+# Usage: validate_provision_config_credentials.yml
+docker_password_fail_msg: "docker_password must be provided when docker_username is defined."
+warning_wait_time: 30
+warning_msg_docker_username_password_incomplete: "[WARNING] Docker credentials not provided in provision_config_credentials.yml.
+Proceeding without docker credentials."
+provision_config_credentials_filename: "{{ role_path }}/../../../../input/provision_config_credentials.yml"
+provision_credentials_vault_path: "{{ role_path }}/../../../../input/.provision_credential_vault_key"
+ansible_vault_search_key: "$ANSIBLE_VAULT;"
+provision_config_credentials_syntax_fail_msg: "Failed. Syntax errors present in provision_config_credentials.yml. Fix the errors and re-run the playbook."
+conf_file_mode: "0644"
+
+# Usage: validate_site_config.yml
+site_config_file: "{{ role_path }}/../../../../input/site_config.yml"
+invalid_proxy_failure_msg: "Failed. Both http_proxy and https_proxy should be set for the proxy variable provided in site_config.yml."
+proxy_env_fail_msg: "Failed. The values for http_proxy {{ proxy[0].http_proxy }} and https_proxy {{ proxy[0].https_proxy }} in the
+proxy variable of the site_config.yml should be set as environment variables http_proxy and https_proxy in the Omnia Infrastructure Manager.
+The no_proxy environment variable should include the Omnia Infrastructure Manager hostname and the admin network IP address."
+update_repos_fail_msg: "Failed to update repos. Verify the proxy configuration in the Omnia Infrastructure Manager for accessing the internet."
+oim_os_redhat: "redhat"
+oim_os_rocky: "rocky"
+oim_os_ubuntu: "ubuntu"
+repo_retries: 5
+repo_delay: 10
diff --git a/utils/vllm_build/vllm_k8_config.yml b/utils/vllm_build/vllm_k8_config.yml
index d204a5f63..50972745b 100644
--- a/utils/vllm_build/vllm_k8_config.yml
+++ b/utils/vllm_build/vllm_k8_config.yml
@@ -5,32 +5,31 @@ metadata:
 spec:
   predictor:
     containers:
-    - args:
-      - "--port"
-      - "8080"
-      - "--model"
-      - "meta-llama/Meta-Llama-3-70B-Instruct"
-      command:
-      - "python3"
-      - "-m"
-      - "vllm.entrypoints.api_server"
-      env:
-      - name: HUGGING_FACE_HUB_TOKEN
-        value: "xxxxxxxxxxxxxxxxxxxxx"
-      - name: HTTP_PROXY
-        value: "http://10.20.0.1:3128"
-      - name: HTTPS_PROXY
-        value: "http://10.20.0.1:3128"
-      image: vllm-rocm:latest
-      imagePullPolicy: IfNotPresent
-      name: vllm-container
-      resources:
-        limits:
-          cpu: "4"
-          memory: 600Gi
-          amd.com/gpu: "4"
-        requests:
-          cpu: "1"
-          memory: 200Gi
-          amd.com/gpu: "4"
-
+      - args:
+          - "--port"
+          - "8080"
+          - "--model"
+          - "meta-llama/Meta-Llama-3-70B-Instruct"
+        command:
+          - "python3"
+          - "-m"
+          - "vllm.entrypoints.api_server"
+        env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            value: "xxxxxxxxxxxxxxxxxxxxx"
+          - name: HTTP_PROXY
+            value: "http://:3128"
+          - name: HTTPS_PROXY
+            value: "http://:3128"
+        image: vllm-rocm:latest
+        imagePullPolicy: IfNotPresent
+        name: vllm-container
+        resources:
+          limits:
+            cpu: "4"
+            memory: 600Gi
+            amd.com/gpu: "4"
+          requests:
+            cpu: "1"
+            memory: 200Gi
+            amd.com/gpu: "4"
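Once this InferenceService is applied and the vllm-rocm image is serving, the vllm.entrypoints.api_server invoked above answers plain HTTP generation requests on port 8080. A smoke-test sketch, assuming kubectl access to the predictor pod (the pod name is a placeholder):

    kubectl port-forward <vllm-predictor-pod> 8080:8080 &
    curl http://localhost:8080/generate \
      -H "Content-Type: application/json" \
      -d '{"prompt": "San Francisco is a", "max_tokens": 32}'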