From ee1b691acdb2d605ffe77935aa45c57ac047ed15 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 27 Jan 2025 10:49:50 +0000 Subject: [PATCH] KEP-2170: Deploy JobSet in `kubeflow-system` namespace (#2388) * KEP-2170: Deploy JobSet in kubeflow-system namespace Signed-off-by: Andrey Velichkevich * Remove namespace from base Signed-off-by: Andrey Velichkevich * Remove label from namespace Signed-off-by: Andrey Velichkevich * Create third-party dir for JobSet Signed-off-by: Andrey Velichkevich * Bump JobSet to v0.7.3 Signed-off-by: Andrey Velichkevich * Drop namespace from JobSet config Signed-off-by: Andrey Velichkevich --------- Signed-off-by: Andrey Velichkevich --- manifests/v2/base/manager/kustomization.yaml | 2 -- manifests/v2/base/rbac/kustomization.yaml | 2 -- .../kustomization.yaml | 2 +- .../torch_distributed.yaml} | 0 manifests/v2/base/webhook/kustomization.yaml | 2 -- .../kustomization.yaml | 11 ++++++++-- .../{only-manager => manager}/namespace.yaml | 0 .../kustomization.yaml | 2 +- .../v2/overlays/standalone/kustomization.yaml | 19 ----------------- .../jobset/jobset_manager_config.yaml | 2 ++ .../v2/third-party/jobset/kustomization.yaml | 18 ++++++++++++++++ .../jobset/patches/jobset_config_patch.yaml | 21 +++++++++++++++++++ .../patches/jobset_remove_namespace.yaml} | 4 +++- 13 files changed, 55 insertions(+), 30 deletions(-) rename manifests/v2/base/runtimes/{pre-training => pretraining}/kustomization.yaml (73%) rename manifests/v2/base/runtimes/{pre-training/torch-distributed.yaml => pretraining/torch_distributed.yaml} (100%) rename manifests/v2/overlays/{only-manager => manager}/kustomization.yaml (58%) rename manifests/v2/overlays/{only-manager => manager}/namespace.yaml (100%) rename manifests/v2/overlays/{only-runtimes => runtimes}/kustomization.yaml (66%) delete mode 100644 manifests/v2/overlays/standalone/kustomization.yaml create mode 100644 manifests/v2/third-party/jobset/jobset_manager_config.yaml create mode 100644 manifests/v2/third-party/jobset/kustomization.yaml create mode 100644 manifests/v2/third-party/jobset/patches/jobset_config_patch.yaml rename manifests/v2/{overlays/standalone/namespace.yaml => third-party/jobset/patches/jobset_remove_namespace.yaml} (50%) diff --git a/manifests/v2/base/manager/kustomization.yaml b/manifests/v2/base/manager/kustomization.yaml index a62e9473d9..7394a6d059 100644 --- a/manifests/v2/base/manager/kustomization.yaml +++ b/manifests/v2/base/manager/kustomization.yaml @@ -1,4 +1,2 @@ resources: - manager.yaml -# TODO (andreyvelich): Move it to overlays once we copy the JobSet manifests. -namespace: kubeflow-system diff --git a/manifests/v2/base/rbac/kustomization.yaml b/manifests/v2/base/rbac/kustomization.yaml index e9fca6afba..25a37bf74f 100644 --- a/manifests/v2/base/rbac/kustomization.yaml +++ b/manifests/v2/base/rbac/kustomization.yaml @@ -2,5 +2,3 @@ resources: - role.yaml - role_binding.yaml - service_account.yaml -# TODO (andreyvelich): Move it to overlays once we copy the JobSet manifests. -namespace: kubeflow-system diff --git a/manifests/v2/base/runtimes/pre-training/kustomization.yaml b/manifests/v2/base/runtimes/pretraining/kustomization.yaml similarity index 73% rename from manifests/v2/base/runtimes/pre-training/kustomization.yaml rename to manifests/v2/base/runtimes/pretraining/kustomization.yaml index 1fb6985131..6facf87216 100644 --- a/manifests/v2/base/runtimes/pre-training/kustomization.yaml +++ b/manifests/v2/base/runtimes/pretraining/kustomization.yaml @@ -1,4 +1,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - torch-distributed.yaml + - torch_distributed.yaml diff --git a/manifests/v2/base/runtimes/pre-training/torch-distributed.yaml b/manifests/v2/base/runtimes/pretraining/torch_distributed.yaml similarity index 100% rename from manifests/v2/base/runtimes/pre-training/torch-distributed.yaml rename to manifests/v2/base/runtimes/pretraining/torch_distributed.yaml diff --git a/manifests/v2/base/webhook/kustomization.yaml b/manifests/v2/base/webhook/kustomization.yaml index 1ea670ceef..5723808d02 100644 --- a/manifests/v2/base/webhook/kustomization.yaml +++ b/manifests/v2/base/webhook/kustomization.yaml @@ -10,5 +10,3 @@ patches: kind: ValidatingWebhookConfiguration configurations: - kustomizeconfig.yaml -# TODO (andreyvelich): Move it to overlays once we copy the JobSet manifests. -namespace: kubeflow-system diff --git a/manifests/v2/overlays/only-manager/kustomization.yaml b/manifests/v2/overlays/manager/kustomization.yaml similarity index 58% rename from manifests/v2/overlays/only-manager/kustomization.yaml rename to manifests/v2/overlays/manager/kustomization.yaml index b6f81239d8..8e3b9873df 100644 --- a/manifests/v2/overlays/only-manager/kustomization.yaml +++ b/manifests/v2/overlays/manager/kustomization.yaml @@ -1,16 +1,23 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization + +# Namespace where all resources are deployed. +namespace: kubeflow-system + resources: - namespace.yaml - ../../base/crds - ../../base/manager - ../../base/rbac - ../../base/webhook - # TODO (andreyvelich): JobSet should support kubeflow-system namespace. - - https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml + - ../../third-party/jobset # Comment this line if JobSet is installed on the Kubernetes cluster. + +# Update the Kubeflow Training manager image tag. images: - name: kubeflow/training-operator-v2 newTag: latest + +# Secret for the Kubeflow Training webhook. secretGenerator: - name: training-operator-v2-webhook-cert namespace: kubeflow-system diff --git a/manifests/v2/overlays/only-manager/namespace.yaml b/manifests/v2/overlays/manager/namespace.yaml similarity index 100% rename from manifests/v2/overlays/only-manager/namespace.yaml rename to manifests/v2/overlays/manager/namespace.yaml diff --git a/manifests/v2/overlays/only-runtimes/kustomization.yaml b/manifests/v2/overlays/runtimes/kustomization.yaml similarity index 66% rename from manifests/v2/overlays/only-runtimes/kustomization.yaml rename to manifests/v2/overlays/runtimes/kustomization.yaml index 41fb29b783..970726d8c8 100644 --- a/manifests/v2/overlays/only-runtimes/kustomization.yaml +++ b/manifests/v2/overlays/runtimes/kustomization.yaml @@ -1,4 +1,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - ../../base/runtimes/pre-training + - ../../base/runtimes/pretraining diff --git a/manifests/v2/overlays/standalone/kustomization.yaml b/manifests/v2/overlays/standalone/kustomization.yaml deleted file mode 100644 index 2a59e17ed4..0000000000 --- a/manifests/v2/overlays/standalone/kustomization.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: - - namespace.yaml - - ../../base/crds - - ../../base/manager - - ../../base/rbac - - ../../base/webhook - - ../../base/runtimes/pre-training - # TODO (andreyvelich): JobSet should support kubeflow-system namespace. - - https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml -images: - - name: kubeflow/training-operator-v2 - newTag: latest -secretGenerator: - - name: training-operator-v2-webhook-cert - namespace: kubeflow-system - options: - disableNameSuffixHash: true diff --git a/manifests/v2/third-party/jobset/jobset_manager_config.yaml b/manifests/v2/third-party/jobset/jobset_manager_config.yaml new file mode 100644 index 0000000000..ac9957d47d --- /dev/null +++ b/manifests/v2/third-party/jobset/jobset_manager_config.yaml @@ -0,0 +1,2 @@ +apiVersion: config.jobset.x-k8s.io/v1alpha1 +kind: Configuration diff --git a/manifests/v2/third-party/jobset/kustomization.yaml b/manifests/v2/third-party/jobset/kustomization.yaml new file mode 100644 index 0000000000..139fa7300d --- /dev/null +++ b/manifests/v2/third-party/jobset/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - https://github.com/kubernetes-sigs/jobset/releases/download/v0.7.3/manifests.yaml + +# Config for the JobSet manager. +configMapGenerator: + - name: jobset-manager-config + files: + - jobset_manager_config.yaml + options: + disableNameSuffixHash: true + +# Add required patches. +patchesStrategicMerge: + - patches/jobset_remove_namespace.yaml # Remove namespace from the JobSet release manifests. + - patches/jobset_config_patch.yaml # Add custom manager config to the JobSet. diff --git a/manifests/v2/third-party/jobset/patches/jobset_config_patch.yaml b/manifests/v2/third-party/jobset/patches/jobset_config_patch.yaml new file mode 100644 index 0000000000..c8e5fdfef6 --- /dev/null +++ b/manifests/v2/third-party/jobset/patches/jobset_config_patch.yaml @@ -0,0 +1,21 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jobset-controller-manager + namespace: jobset-system +spec: + template: + spec: + containers: + - name: manager + args: + - "--config=/jobset_manager_config.yaml" + volumeMounts: + - name: jobset-manager-config + mountPath: /jobset_manager_config.yaml + subPath: jobset_manager_config.yaml + readOnly: true + volumes: + - name: jobset-manager-config + configMap: + name: jobset-manager-config diff --git a/manifests/v2/overlays/standalone/namespace.yaml b/manifests/v2/third-party/jobset/patches/jobset_remove_namespace.yaml similarity index 50% rename from manifests/v2/overlays/standalone/namespace.yaml rename to manifests/v2/third-party/jobset/patches/jobset_remove_namespace.yaml index 6bfc4968bd..1e7a99903e 100644 --- a/manifests/v2/overlays/standalone/namespace.yaml +++ b/manifests/v2/third-party/jobset/patches/jobset_remove_namespace.yaml @@ -1,4 +1,6 @@ +--- +$patch: delete apiVersion: v1 kind: Namespace metadata: - name: kubeflow-system + name: jobset-system