From 47e21bb8620c1c1725d0d41c81af8009f2d27eb7 Mon Sep 17 00:00:00 2001 From: "Jeroen M. Galjaard" Date: Wed, 19 Jul 2023 16:21:10 +0200 Subject: [PATCH] Add vulcano dependency to installation to enable GANG scheduling from the get-go --- terraform/terraform-dependencies/main.tf | 39 ++++++++++++++++--- terraform/terraform-dependencies/variables.tf | 24 +++++++++++- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/terraform/terraform-dependencies/main.tf b/terraform/terraform-dependencies/main.tf index bc38cab3..99479b5c 100644 --- a/terraform/terraform-dependencies/main.tf +++ b/terraform/terraform-dependencies/main.tf @@ -1,15 +1,42 @@ - data "google_client_config" "default" {} -# Retrieve kustomize templates -data "kustomization_build" "training_operator" { - path = "github.com/kubeflow/manifests.git/apps/training-operator/upstream/overlays/standalone?ref=${var.kubeflow_version}" +# Install the Volcano gang scheduler through Helm, using the chart's default values. +resource "helm_release" "vulcano_scheduler" { + + name = var.vulcano_scheduler_information.release_name + repository = var.vulcano_scheduler_repo_url + chart = var.vulcano_scheduler_information.chart_name + version = var.vulcano_scheduler_information.version + + namespace = var.vulcano_scheduler_information.namespace + create_namespace = true +} + +# Use the upstream training-operator manifests as a kustomize base and patch them to add gang-scheduling support. +# The result is an overlay (patched version) of the original training operator, which is what gets deployed. +data "kustomization_overlay" "training_operator" { + resources = [ + "github.com/kubeflow/manifests.git/apps/training-operator/upstream/overlays/standalone?ref=${var.kubeflow_version}" + ] + + # Apply the Volcano patch to the training-operator Deployment in the kubeflow namespace. + patches { + path = "patches/training-operator-patch.yaml" + target { + kind = "Deployment" + namespace = "kubeflow" + name = "training-operator" + } + } } # Deploy resources one-by-one. 
resource "kustomization_resource" "training_operator" { - for_each = data.kustomization_build.training_operator.ids - manifest = data.kustomization_build.training_operator.manifests[each.value] + # The Volcano scheduler release (helm_release.vulcano_scheduler) must be installed before the training operator. + # See also the patch that we apply to the training operator through kustomize. + depends_on = [helm_release.vulcano_scheduler] + for_each = data.kustomization_overlay.training_operator.ids + manifest = data.kustomization_overlay.training_operator.manifests[each.value] } # Create NFS resource diff --git a/terraform/terraform-dependencies/variables.tf b/terraform/terraform-dependencies/variables.tf index c0905001..c062ed75 100644 --- a/terraform/terraform-dependencies/variables.tf +++ b/terraform/terraform-dependencies/variables.tf @@ -29,7 +29,7 @@ variable "project_zone" { variable "description" { type = string - default = "Managed by terraform FLTK testbed deployment" + default = "Managed by terraform FLTK testbed deployment." } variable "account_id" { @@ -72,3 +72,25 @@ variable "nfs_provisioner_repo_url" { type = string default = "https://charts.helm.sh/stable" } + +variable "vulcano_scheduler_information" { + type = object({ + release_name = string + chart_name = string + namespace = string + version = string + }) + default = { + release_name = "vulcano" + chart_name = "volcano" + namespace = "vulcano-system" + version = "v1.7.0" + } +} + +variable "vulcano_scheduler_repo_url" { + description = "Helm repository URL (a chart index, not the GitHub project page) that serves the Volcano scheduler chart." + type = string + default = "https://volcano-sh.github.io/helm-charts" +} +