diff --git a/terraform/terraform-dependencies-local/main.tf b/terraform/terraform-dependencies-local/main.tf
new file mode 100644
index 00000000..e7dbd5d2
--- /dev/null
+++ b/terraform/terraform-dependencies-local/main.tf
@@ -0,0 +1,57 @@
+# Add the Volcano gang-scheduler plugin using all default chart values.
+resource "helm_release" "volcano_scheduler" {
+  name       = var.vulcano_scheduler_information.release_name
+  repository = var.vulcano_scheduler_repo_url
+  chart      = var.vulcano_scheduler_information.chart_name
+  version    = var.vulcano_scheduler_information.version
+
+  namespace        = var.vulcano_scheduler_information.namespace
+  create_namespace = true
+}
+
+# Treat the upstream training-operator as a kustomize base and patch it to enable gang scheduling.
+# The data source renders the patched overlay; the manifests are applied by the resource further below.
+data "kustomization_overlay" "training_operator" {
+  resources = [
+    "github.com/kubeflow/manifests.git/apps/training-operator/upstream/overlays/standalone?ref=${var.kubeflow_version}"
+  ]
+
+  # Apply the Volcano patch to the training-operator Deployment in the overlay.
+  patches {
+    path = "patches/training-operator-patch.yaml"
+    target {
+      kind      = "Deployment"
+      namespace = "kubeflow"
+      name      = "training-operator"
+    }
+  }
+}
+
+# Deploy the rendered manifests one-by-one (one Terraform resource per manifest id).
+resource "kustomization_resource" "training_operator" {
+  # The Volcano scheduler must be installed before the training operator, because the
+  # kustomize patch starts the operator with --gang-scheduler-name=volcano.
+  depends_on = [helm_release.volcano_scheduler]
+  for_each   = data.kustomization_overlay.training_operator.ids
+  manifest   = data.kustomization_overlay.training_operator.manifests[each.value]
+}
+
+# Create the NFS provisioner release; chart values are rendered from values.nfs.yaml.tpl.
+resource "helm_release" "nfs_client_provisioner" {
+  name       = var.nfs_provider_information.release_name
+  repository = var.nfs_provisioner_repo_url
+  chart      = var.nfs_provider_information.chart_name
+
+  namespace        = var.nfs_provider_information.namespace
+  create_namespace = true
+
+  values = [
+    templatefile("${path.module}/values.nfs.yaml.tpl", {
+      nfs_server_path  = var.nfs_provider_information.server_path
+      image_repository = var.nfs_provider_information.image_repository
+      image_tag        = var.nfs_provider_information.image_tag
+      pull_policy      = var.nfs_provider_information.pull_policy
+      nfs_size         = var.nfs_provider_information.storage_size
+    })
+  ]
+}
diff --git a/terraform/terraform-dependencies-local/patches/training-operator-patch.yaml b/terraform/terraform-dependencies-local/patches/training-operator-patch.yaml
new file mode 100644
index 00000000..3a3f82ee
--- /dev/null
+++ b/terraform/terraform-dependencies-local/patches/training-operator-patch.yaml
@@ -0,0 +1,12 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: training-operator
+spec:
+  template:
+    spec:
+      containers:
+        - name: training-operator
+          command:
+            - /manager
+            - --gang-scheduler-name=volcano
diff --git a/terraform/terraform-dependencies-local/providers.tf b/terraform/terraform-dependencies-local/providers.tf
new file mode 100644
index 00000000..33b6cac3
--- /dev/null
+++ b/terraform/terraform-dependencies-local/providers.tf
@@ -0,0 +1,18 @@
+# All providers below (kustomization, i.e. Kustomize, kubectl, kubernetes, helm) read the same kubeconfig file.
+provider "kustomization" {
+  kubeconfig_path = var.kubernetes_config_path
+}
+
+provider "kubectl" {
+  config_path = var.kubernetes_config_path
+}
+
+provider "kubernetes" {
+  config_path = var.kubernetes_config_path
+}
+
+provider "helm" {
+  kubernetes {
+    config_path = var.kubernetes_config_path
+  }
+}
diff --git a/terraform/terraform-dependencies-local/values.nfs.yaml.tpl b/terraform/terraform-dependencies-local/values.nfs.yaml.tpl
new file mode 100644
index 00000000..32f53bcc
--- /dev/null
+++ b/terraform/terraform-dependencies-local/values.nfs.yaml.tpl
@@ -0,0 +1,98 @@
+# Default values for nfs-provisioner.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+replicaCount: 1
+
+# imagePullSecrets:
+
+image:
+  repository: ${image_repository}
+  tag: ${image_tag}
+  pullPolicy: ${pull_policy}
+
+
+# For a list of available arguments
+# Please see https://github.com/kubernetes-incubator/external-storage/blob/master/nfs/docs/deployment.md#arguments
+extraArgs: {}
+  # device-based-fsids: false
+
+service:
+  type: ClusterIP
+
+  nfsPort: 2049
+  nlockmgrPort: 32803
+  mountdPort: 20048
+  rquotadPort: 875
+  rpcbindPort: 111
+  statdPort: 662
+  # nfsNodePort:
+  # nlockmgrNodePort:
+  # mountdNodePort:
+  # rquotadNodePort:
+  # rpcbindNodePort:
+  # statdNodePort:
+
+  externalIPs: []
+
+persistence:
+  enabled: true
+
+  ## Persistent Volume Storage Class
+  ## If defined, storageClassName: <storageClass>
+  ## If set to "-", storageClassName: "", which disables dynamic provisioning
+  ## If undefined (the default) or set to null, no storageClassName spec is
+  ##   set, choosing the default provisioner. (gp2 on AWS, standard on
+  ##   GKE, AWS & OpenStack)
+  ##
+  storageClassName: "standard"
+  accessMode: ReadWriteOnce
+  size: ${nfs_size}
+
+## For creating the StorageClass automatically:
+storageClass:
+  create: true
+
+  ## Set a provisioner name. If unset, a name will be generated.
+  # provisionerName:
+
+  ## Set StorageClass as the default StorageClass
+  ## Ignored if storageClass.create is false
+  defaultClass: false
+
+  ## Set a StorageClass name
+  ## Ignored if storageClass.create is false
+  name: nfs
+
+  # set to null to prevent expansion
+  allowVolumeExpansion: true
+  ## StorageClass parameters
+  parameters: {}
+
+  mountOptions:
+    - vers=3
+
+  ## ReclaimPolicy field of the class, which can be either Delete or Retain
+  reclaimPolicy: Delete
+
+## For RBAC support:
+rbac:
+  create: true
+
+  ## Ignored if rbac.create is true
+  ##
+  serviceAccountName: default
+
+resources: {}
+  # limits:
+  #   cpu: 100m
+  #   memory: 128Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 128Mi
+
+nodeSelector: {}
+
+tolerations: []
+
+affinity: {}
\ No newline at end of file
diff --git a/terraform/terraform-dependencies-local/variables.tf b/terraform/terraform-dependencies-local/variables.tf
new file mode 100644
index 00000000..60236f62
--- /dev/null
+++ b/terraform/terraform-dependencies-local/variables.tf
@@ -0,0 +1,61 @@
+variable "kubernetes_config_path" {
+  description = "Path of Kubernetes configuration file (change for non-default kubectl setup)"
+  default     = "~/.kube/config"
+}
+
+variable "kubeflow_version" {
+  type        = string
+  description = "Kubeflow (PyTorch) Training Operator to install."
+  default     = "v1.5.0"
+}
+
+variable "nfs_provider_information" {
+  type = object({
+    release_name     = string
+    chart_name       = string
+    namespace        = string
+    server_path      = string
+    image_repository = string
+    image_tag        = string
+    pull_policy      = string
+    storage_size     = string
+  })
+  default = {
+    release_name     = "nfs-server"
+    chart_name       = "nfs-server-provisioner"
+    namespace        = "test"
+    server_path      = "/mnt/kubernetes"
+    image_repository = "k8s.gcr.io/sig-storage/nfs-provisioner"
+    image_tag        = "v3.0.0"
+    pull_policy      = "IfNotPresent"
+    storage_size     = "50Gi"
+  }
+}
+
+variable "nfs_provisioner_repo_url" {
+  description = "Repository URL to locate the utilized helm charts"
+  type        = string
+  default     = "https://charts.helm.sh/stable"
+}
+
+variable "vulcano_scheduler_information" {
+  type = object({
+    release_name = string
+    chart_name   = string
+    namespace    = string
+    version      = string
+  })
+  default = {
+    release_name = "volcano"
+    chart_name   = "volcano"
+    namespace    = "volcano-system"
+    version      = "v1.8.0"
+  }
+}
+
+variable "vulcano_scheduler_repo_url" {
+  description = "Repository URL to locate the utilized helm charts for Vulcano Scheduler Plugin."
+  type        = string
+  default     = "https://volcano-sh.github.io/helm-charts"
+}
+
diff --git a/terraform/terraform-dependencies-local/versions.tf b/terraform/terraform-dependencies-local/versions.tf
new file mode 100644
index 00000000..a9398cf8
--- /dev/null
+++ b/terraform/terraform-dependencies-local/versions.tf
@@ -0,0 +1,23 @@
+terraform {
+  required_providers {
+    kustomization = {
+      source  = "kbst/kustomization"
+      version = ">= 0.7.0"
+    }
+
+    kubectl = {
+      source  = "gavinbunney/kubectl"
+      version = ">= 1.13.1"
+    }
+
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = ">= 1.13.1"
+    }
+
+    helm = {
+      source = "hashicorp/helm"
+    }
+  }
+  required_version = "~> 1.1"
+}