Skip to content

Commit

Permalink
Add vulcano dependency to installation to enable GANG scheduling from…
Browse files Browse the repository at this point in the history
… the get-go
  • Loading branch information
JMGaljaard committed Sep 25, 2023
1 parent 7817a14 commit 47e21bb
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 7 deletions.
39 changes: 33 additions & 6 deletions terraform/terraform-dependencies/main.tf
Original file line number Diff line number Diff line change
@@ -1,15 +1,42 @@

# Look up the configuration of the Google provider currently in use.
# NOTE(review): presumably consumed by other providers for authentication - confirm against the rest of the module.
data "google_client_config" "default" {
}

# Retrieve kustomize templates
data "kustomization_build" "training_operator" {
path = "github.com/kubeflow/manifests.git/apps/training-operator/upstream/overlays/standalone?ref=${var.kubeflow_version}"
# Install the Volcano gang-scheduler through its Helm chart.
# No `values`/`set` overrides are supplied, so the chart's defaults apply.
resource "helm_release" "vulcano_scheduler" {
  # Where the chart comes from and which release of it to install.
  repository = var.vulcano_scheduler_repo_url
  chart      = var.vulcano_scheduler_information.chart_name
  version    = var.vulcano_scheduler_information.version

  # How the release is named and where it lands in the cluster.
  name             = var.vulcano_scheduler_information.release_name
  namespace        = var.vulcano_scheduler_information.namespace
  create_namespace = true
}

# Build a kustomize overlay on top of the upstream standalone training-operator
# manifests (pinned to var.kubeflow_version) and patch its Deployment so that
# the operator supports gang scheduling.
data "kustomization_overlay" "training_operator" {
  # Upstream manifests that serve as the base of the overlay.
  resources = [
    "github.com/kubeflow/manifests.git/apps/training-operator/upstream/overlays/standalone?ref=${var.kubeflow_version}"
  ]

  # Patch enabling the Volcano gang scheduler; only the training-operator
  # Deployment in the kubeflow namespace is targeted.
  patches {
    path = "patches/training-operator-patch.yaml"

    target {
      name      = "training-operator"
      namespace = "kubeflow"
      kind      = "Deployment"
    }
  }
}

# Deploy the patched training-operator manifests one resource at a time.
resource "kustomization_resource" "training_operator" {
  # One instance per manifest id produced by the patched overlay. Each argument
  # may be set only once per block, so the overlay is the single source here.
  for_each = data.kustomization_overlay.training_operator.ids
  manifest = data.kustomization_overlay.training_operator.manifests[each.value]

  # The scheduler must be installed before the training operator: the patch
  # applied through the kustomize overlay makes the operator rely on it.
  depends_on = [helm_release.vulcano_scheduler]
}

# Create NFS resource
Expand Down
24 changes: 23 additions & 1 deletion terraform/terraform-dependencies/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ variable "project_zone" {

# Free-form description text; defaults to a marker identifying resources as
# managed by this Terraform deployment.
variable "description" {
  type    = string
  default = "Managed by terraform FLTK testbed deployment."
}

variable "account_id" {
Expand Down Expand Up @@ -72,3 +72,25 @@ variable "nfs_provisioner_repo_url" {
type = string
default = "https://charts.helm.sh/stable"
}

# Helm release settings for the Volcano gang scheduler.
# The variable keeps its historical "vulcano" spelling so existing references
# (var.vulcano_scheduler_information.*) continue to resolve.
variable "vulcano_scheduler_information" {
  description = "Helm release settings (release name, chart name, namespace, chart version) for the Volcano scheduler."
  type = object({
    release_name = string
    chart_name   = string
    namespace    = string
    version      = string
  })
  default = {
    release_name = "vulcano"
    # The chart published by the Volcano project is named "volcano"; a chart
    # called "vulcano" does not exist and the Helm install would fail.
    chart_name = "volcano"
    namespace  = "vulcano-system"
    version    = "v1.7.0"
  }
}

# Helm chart repository that hosts the Volcano scheduler chart.
variable "vulcano_scheduler_repo_url" {
  description = "Helm repository URL hosting the chart for the Volcano scheduler."
  type        = string
  # Must point at the published Helm repository (which serves index.yaml),
  # not at the GitHub source repository of the charts.
  default = "https://volcano-sh.github.io/helm-charts"
}

0 comments on commit 47e21bb

Please sign in to comment.