Skip to content

Commit

Permalink
don't fail if RHOAI is not installed
Browse files Browse the repository at this point in the history
  • Loading branch information
bdattoma committed Oct 25, 2024
1 parent 8b5622d commit 58710b2
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
9 changes: 8 additions & 1 deletion ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,13 @@ function wait_until_driver_image_is_built() {
}

function create_acceleratorprofile() {
echo "Creating AMD Accelerator Profile"
# Guard: skip Accelerator Profile creation (successfully) when RHOAI is absent.
# With --ignore-not-found, a missing namespace yields empty output instead of
# an error, so an EMPTY result is what signals "not installed".
rhoai_ns=$(oc get namespace redhat-ods-applications --ignore-not-found -oname)
# The variable must be quoted and tested with -z: an unquoted `[ -n $var ]`
# collapses to `[ -n ]` when the value is empty, which is always true, and the
# previous -n test was also inverted (it skipped when the namespace existed).
if [ -z "$rhoai_ns" ]; then
  echo "redhat-ods-applications namespace not found. Is RHOAI Installed? AMD Accelerator Profile creation SKIPPED."
  return 0
fi
echo "Creating an Accelerator Profile for Dashboard"
oc apply -f - <<EOF
apiVersion: dashboard.opendatahub.io/v1
Expand Down Expand Up @@ -192,5 +199,5 @@ fi
echo "Configuration of AMD GPU node and Operators completed"
# the message appears in the logs, but the pod may get deleted before the next iteration of our code checks the logs again,
# hence it would fail to reach the pod. It happened to me
# wait_while 1200 monitor_logs "$name" openshift-amd-gpu docker-build "Successfully pushed image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu"
wait_while 1200 monitor_logs "$name" openshift-amd-gpu docker-build "Successfully pushed image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu"
create_acceleratorprofile
26 changes: 16 additions & 10 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,13 @@ function rerun_accelerator_migration() {
# 1. Delete the migration configmap
# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again
# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938

echo "Creating NVIDIA Accelerator Profile via RHOAI Dashboard deployment rollout"
# Guard: skip the migration rerun (successfully) when the migration configmap
# does not exist. With --ignore-not-found, a missing configmap yields empty
# output instead of an error, so an EMPTY result is what signals "not found".
configmap=$(oc get configmap migration-gpu-status --ignore-not-found -n redhat-ods-applications -oname)
# The variable must be quoted and tested with -z: an unquoted `[ -n $var ]`
# collapses to `[ -n ]` when the value is empty, which is always true, and the
# previous -n test was also inverted (it skipped when the configmap existed).
if [ -z "$configmap" ]; then
  echo "migration-gpu-status not found. Is RHOAI Installed? NVIDIA Accelerator Profile creation SKIPPED."
  return 0
fi
echo "Deleting configmap migration-gpu-status"
if ! oc delete configmap migration-gpu-status -n redhat-ods-applications;
then
Expand All @@ -81,13 +87,13 @@ function rerun_accelerator_migration() {
oc describe AcceleratorProfiles -n redhat-ods-applications
}

wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
wait_until_pod_ready_status "nvidia-dcgm-exporter"
wait_until_pod_ready_status "gpu-feature-discovery"
wait_until_pod_ready_status "nvidia-operator-validator"
# wait_until_pod_ready_status "gpu-operator"
# oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
# oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
# oc apply -f clusterpolicy.json
# wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
# wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
# wait_until_pod_ready_status "nvidia-dcgm-exporter"
# wait_until_pod_ready_status "gpu-feature-discovery"
# wait_until_pod_ready_status "nvidia-operator-validator"
rerun_accelerator_migration

0 comments on commit 58710b2

Please sign in to comment.