From 88dbb1b983f6330909ea407b6622fffe017487f8 Mon Sep 17 00:00:00 2001 From: Blake Devcich <89158881+bdevcich@users.noreply.github.com> Date: Thu, 16 Nov 2023 12:22:46 -0600 Subject: [PATCH] Added ability to use non-default profiles (#140) * Added no-xattr profile to nnf-dm-config config map to run `dcp --xattrs none` * Having another profile in there makes it easier for int-test * DM controller now looks retrieves the profile specified by dm.spec.profile Copy Offload API does not yet support this Signed-off-by: Blake Devcich --- config/dm_config/nnf-dm-config.yaml | 56 +++++++++-------- go.mod | 2 +- go.sum | 4 +- .../controller/datamovement_controller.go | 15 ++--- .../datamovement_controller_test.go | 61 ++++++++++++++++++- .../api/v1alpha1/nnf_datamovement_types.go | 9 +++ .../nnf.cray.hpe.com_nnfdatamovements.yaml | 6 ++ vendor/modules.txt | 2 +- 8 files changed, 114 insertions(+), 41 deletions(-) diff --git a/config/dm_config/nnf-dm-config.yaml b/config/dm_config/nnf-dm-config.yaml index 50debc43..ab84e2b0 100644 --- a/config/dm_config/nnf-dm-config.yaml +++ b/config/dm_config/nnf-dm-config.yaml @@ -1,38 +1,44 @@ # Each profile is capable of providing different configurations for data movement. -# Note: For now, only the default profile is supported. profiles: - # Default profile that is used for all data movement activity. default: - # The number of slots specified in the MPI hostfile. A value less than 1 disables the use - # of slots in the hostfile. - slots: 8 + # The number of slots specified in the MPI hostfile. A value less than 1 disables the use + # of slots in the hostfile. + slots: 8 + + # The number of max_slots specified in the MPI hostfile. A value less than 1 disables the use + # of max_slots in the hostfile. + maxSlots: 0 - # The number of max_slots specified in the MPI hostfile. A value less than 1 disables the use - # of max_slots in the hostfile. - maxSlots: 0 + # The full command to execute data movement. $VARS are replaced by the nnf software. Available + # $VARS: + # HOSTFILE: hostfile that is created and used for mpirun. Contains a list of hosts and the + # slots/max_slots for each host. This hostfile is created at `/tmp//hostfile` + # UID: User ID that is inherited from the Workflow + # GID: Group ID that is inherited from the Workflow + # SRC: source for the data movement + # DEST destination for the data movement + # default: command: ulimit -n 2048 && mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST + command: ulimit -n 2048 && mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST - # The full command to execute data movement. $VARS are replaced by the nnf software. Available - # $VARS: - # HOSTFILE: hostfile that is created and used for mpirun. Contains a list of hosts and the - # slots/max_slots for each host. This hostfile is created at `/tmp//hostfile` - # UID: User ID that is inherited from the Workflow - # GID: Group ID that is inherited from the Workflow - # SRC: source for the data movement - # DEST destination for the data movement - # default: command: ulimit -n 2048 && mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST - command: ulimit -n 2048 && mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --uid $UID --gid $GID $SRC $DEST + # If true, enable the command's stdout to be saved in the log when the command completes + # successfully. On failure, the output is always logged. + logStdout: false - # If true, enable the command's stdout to be saved in the log when the command completes - # successfully. On failure, the output is always logged. - logStdout: false + # Similar to logStdout, store the command's stdout in Status.Message when the command + # completes successfully. On failure, the output is always stored. + storeStdout: false - # Similar to logStdout, store the command's stdout in Status.Message when the command - # completes successfully. On failure, the output is always stored. - storeStdout: false + # Same as default profile but tell dcp not to copy xattrs + no-xattr: + slots: 8 + maxSlots: 0 + command: ulimit -n 2048 && mpirun --allow-run-as-root --hostfile $HOSTFILE dcp --progress 1 --xattrs none --uid $UID --gid $GID $SRC $DEST + logStdout: false + storeStdout: false # NnfDataMovement resources have the ability to collect and store the progress percentage and the # last few lines of output in the CommandStatus field. This number is used for the interval to collect # the progress data. `dcp --progress N` must be included in the data movement command in order for # progress to be collected. A value less than 1 disables this functionality. -progressIntervalSeconds: 5 \ No newline at end of file +progressIntervalSeconds: 5 diff --git a/go.mod b/go.mod index 2542bb1e..acd3b3e0 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.19 require ( github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20231031201943-531116c1194e - github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231114204216-1a62f95d74d5 + github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231116180019-91601a97a3d2 github.com/onsi/ginkgo/v2 v2.11.0 github.com/onsi/gomega v1.27.10 github.com/prometheus/client_golang v1.16.0 diff --git a/go.sum b/go.sum index 7db0fc50..98e63a7e 100644 --- a/go.sum +++ b/go.sum @@ -4,8 +4,8 @@ github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20231031201943-531116c1194e github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20231031201943-531116c1194e/go.mod h1:qBcz9p8sXm1qhDf8WUmhxTlD1NCMEjoAD7NoHbQvMiI= github.com/NearNodeFlash/nnf-ec v0.0.0-20231010162453-a8168bb6a52f h1:aWtSSQLLk9mUZj94mowirQeVw9saf80gVe10X0rZe8o= github.com/NearNodeFlash/nnf-ec v0.0.0-20231010162453-a8168bb6a52f/go.mod h1:oxdwMqfttOF9dabJhqrWlirCnMk8/8eyLMwl+hducjk= -github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231114204216-1a62f95d74d5 h1:ngJNueL3HbFewXc4pf3mRFQfEKOk0q6eJcQ4Ir787ho= -github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231114204216-1a62f95d74d5/go.mod h1:t0KypbCmssZzL9vhQFHLdauxHKgptJK1SbPJHjm+Baw= +github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231116180019-91601a97a3d2 h1:7E4SvAx78e8bvzWiP5532SoaTY84IXUx5fyJ14Y04B8= +github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231116180019-91601a97a3d2/go.mod h1:t0KypbCmssZzL9vhQFHLdauxHKgptJK1SbPJHjm+Baw= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= diff --git a/internal/controller/datamovement_controller.go b/internal/controller/datamovement_controller.go index 2d631532..91308617 100644 --- a/internal/controller/datamovement_controller.go +++ b/internal/controller/datamovement_controller.go @@ -66,8 +66,7 @@ const ( configMapNamespace = nnfv1alpha1.DataMovementNamespace // DM ConfigMap Data Keys - configMapKeyData = "nnf-dm-config.yaml" - configMapKeyProfileDefault = "default" + configMapKeyData = "nnf-dm-config.yaml" ) // Regex to scrape the progress output of the `dcp` command. Example output: @@ -256,16 +255,14 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request } log.Info("Using config map", "config", cfg) - // TODO: Allow use of non-default dm config profiles - for now only use the default. For copy - // offload API, we could create "fake" profiles and store those in the DM object based on the - // parameters supplied to the CreateRequest(). - // Ensure profile exists - profile, found := cfg.Profiles[configMapKeyProfileDefault] + // Ensure requested DM profile exists + profile, found := cfg.Profiles[dm.Spec.Profile] if !found { - return ctrl.Result{}, dwsv1alpha2.NewResourceError("").WithUserMessage("'%s' profile not found in config map: %v", configMapKeyProfileDefault, client.ObjectKeyFromObject(configMap)).WithUser().WithFatal() + return ctrl.Result{}, dwsv1alpha2.NewResourceError("").WithUserMessage("'%s' profile not found in config map: %v", dm.Spec.Profile, client.ObjectKeyFromObject(configMap)).WithUser().WithFatal() } - log.Info("Using profile", "name", configMapKeyProfileDefault, "profile", profile) + log.Info("Using profile", "profile name", dm.Spec.Profile, "profile", profile) + // Built command + hostfile cmdArgs, mpiHostfile, err := buildDMCommand(ctx, profile, hosts, dm) if err != nil { return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not create data movement command").WithError(err).WithMajor() diff --git a/internal/controller/datamovement_controller_test.go b/internal/controller/datamovement_controller_test.go index e7c68f78..592ebae3 100644 --- a/internal/controller/datamovement_controller_test.go +++ b/internal/controller/datamovement_controller_test.go @@ -106,13 +106,13 @@ var _ = Describe("Data Movement Test", func() { // Default config map data dmCfg = &dmConfig{ Profiles: map[string]dmConfigProfile{ - "default": { + nnfv1alpha1.DataMovementProfileDefault: { Command: defaultCommand, }, }, ProgressIntervalSeconds: 1, } - dmCfgProfile = dmCfg.Profiles[configMapKeyProfileDefault] + dmCfgProfile = dmCfg.Profiles[nnfv1alpha1.DataMovementProfileDefault] dm = &nnfv1alpha1.NnfDataMovement{ ObjectMeta: metav1.ObjectMeta{ @@ -140,7 +140,7 @@ var _ = Describe("Data Movement Test", func() { // Create CM and verify label if createCm { // allow test to override the values in the default cfg profile - dmCfg.Profiles[configMapKeyProfileDefault] = dmCfgProfile + dmCfg.Profiles[nnfv1alpha1.DataMovementProfileDefault] = dmCfgProfile // Convert the config to raw b, err := yaml.Marshal(dmCfg) @@ -393,6 +393,61 @@ var _ = Describe("Data Movement Test", func() { }) }) + Context("when a non-default profile is supplied (and present)", func() { + p := "test-profile" + cmd := "sleep .1" + + BeforeEach(func() { + dmCfgProfile = dmConfigProfile{ + Command: cmd, + } + dmCfg.Profiles[p] = dmCfgProfile + dm.Spec.Profile = p + }) + It("should use that profile to perform data movement", func() { + + By("completing the data movement successfully") + Eventually(func(g Gomega) nnfv1alpha1.NnfDataMovementStatus { + g.Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(dm), dm)).To(Succeed()) + return dm.Status + }, "3s").Should(MatchFields(IgnoreExtras, Fields{ + "State": Equal(nnfv1alpha1.DataMovementConditionTypeFinished), + "Status": Equal(nnfv1alpha1.DataMovementConditionReasonSuccess), + })) + + By("verify that profile is used") + Expect(dm.Spec.Profile).To(Equal(p)) + Expect(dm.Status.CommandStatus.Command).To(Equal(cmdBashPrefix + cmd)) + }) + }) + + Context("when a non-default profile is supplied (and NOT present)", func() { + m := "missing-test-profile" + cmd := "sleep .1" + + BeforeEach(func() { + dmCfgProfile = dmConfigProfile{ + Command: cmd, + } + dmCfg.Profiles["test-profile"] = dmCfgProfile + dm.Spec.Profile = m + }) + It("should use that profile to perform data movement and fail", func() { + + By("having a State/Status of 'Finished'/'Invalid'") + Eventually(func(g Gomega) nnfv1alpha1.NnfDataMovementStatus { + g.Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(dm), dm)).To(Succeed()) + return dm.Status + }).Should(MatchFields(IgnoreExtras, Fields{ + "State": Equal(nnfv1alpha1.DataMovementConditionTypeFinished), + "Status": Equal(nnfv1alpha1.DataMovementConditionReasonInvalid), + })) + + By("verify that profile is used") + Expect(dm.Spec.Profile).To(Equal(m)) + }) + }) + Context("when a data movement command fails", func() { BeforeEach(func() { dmCfgProfile.Command = "false" diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go index 317aa83a..bb6dcd11 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go @@ -31,6 +31,10 @@ const ( // data movement. Individual nodes may also perform data movement in which case they use the // NNF Node Name as the namespace. DataMovementNamespace = "nnf-dm-system" + + // The name of the default profile stored in the nnf-dm-config ConfigMap that is used to + // configure Data Movement. + DataMovementProfileDefault = "default" ) // NnfDataMovementSpec defines the desired state of NnfDataMovement @@ -58,6 +62,11 @@ type NnfDataMovementSpec struct { // +kubebuilder:default:=false Cancel bool `json:"cancel,omitempty"` + // Profile specifies the name of profile in the nnf-dm-config ConfigMap to be used for + // configuring data movement. Defaults to the default profile. + // +kubebuilder:default:=default + Profile string `json:"profile,omitempty"` + // User defined configuration on how data movement should be performed. This overrides the // configuration defined in the nnf-dm-config ConfigMap. These values are typically set by the // Copy Offload API. diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml index 5c7e5128..c008b052 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml +++ b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml @@ -109,6 +109,12 @@ spec: operation. format: int32 type: integer + profile: + default: default + description: Profile specifies the name of profile in the nnf-dm-config + ConfigMap to be used for configuring data movement. Defaults to + the default profile. + type: string source: description: Source describes the source of the data movement operation properties: diff --git a/vendor/modules.txt b/vendor/modules.txt index beadd30e..8d8be84a 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -10,7 +10,7 @@ github.com/NearNodeFlash/lustre-fs-operator/config/crd/bases # github.com/NearNodeFlash/nnf-ec v0.0.0-20231010162453-a8168bb6a52f ## explicit; go 1.19 github.com/NearNodeFlash/nnf-ec/pkg/rfsf/pkg/models -# github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231114204216-1a62f95d74d5 +# github.com/NearNodeFlash/nnf-sos v0.0.1-0.20231116180019-91601a97a3d2 ## explicit; go 1.19 github.com/NearNodeFlash/nnf-sos/api/v1alpha1 github.com/NearNodeFlash/nnf-sos/config/crd/bases