From 753b301fd45acec8ea683ef303451ef5c437a1b6 Mon Sep 17 00:00:00 2001 From: Dean Roehrich Date: Thu, 6 Jul 2023 10:13:46 -0500 Subject: [PATCH 1/5] Handle stale kustomize or controller-gen in bin dir. (#118) Signed-off-by: Dean Roehrich --- Makefile | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 022d9d30..aac8e8ec 100644 --- a/Makefile +++ b/Makefile @@ -184,6 +184,12 @@ LOCALBIN ?= $(shell pwd)/bin $(LOCALBIN): mkdir -p $(LOCALBIN) +.PHONY: clean-bin +clean-bin: + if [[ -d $(LOCALBIN) ]]; then \ + chmod -R u+w $(LOCALBIN) && rm -rf $(LOCALBIN); \ + fi + ## Tool Binaries KUSTOMIZE ?= $(LOCALBIN)/kustomize CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen @@ -195,14 +201,17 @@ CONTROLLER_TOOLS_VERSION ?= v0.12.0 KUSTOMIZE_INSTALL_SCRIPT ?= "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" .PHONY: kustomize -kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. -$(KUSTOMIZE): $(LOCALBIN) - test -s $(LOCALBIN)/kustomize || { curl -s $(KUSTOMIZE_INSTALL_SCRIPT) | bash -s -- $(subst v,,$(KUSTOMIZE_VERSION)) $(LOCALBIN); } +kustomize: $(LOCALBIN) ## Download kustomize locally if missing or not at KUSTOMIZE_VERSION. + if [[ ! -s $(LOCALBIN)/kustomize || $$($(LOCALBIN)/kustomize version | awk '{print $$1}' | awk -F/ '{print $$2}') != $(KUSTOMIZE_VERSION) ]]; then \ + rm -f $(LOCALBIN)/kustomize && \ + { curl -s $(KUSTOMIZE_INSTALL_SCRIPT) | bash -s -- $(subst v,,$(KUSTOMIZE_VERSION)) $(LOCALBIN); }; \ + fi .PHONY: controller-gen -controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. -$(CONTROLLER_GEN): $(LOCALBIN) - test -s $(LOCALBIN)/controller-gen || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION) +controller-gen: $(LOCALBIN) ## Download controller-gen locally if missing or not at CONTROLLER_TOOLS_VERSION. + if [[ ! 
-s $(LOCALBIN)/controller-gen || $$($(LOCALBIN)/controller-gen --version | awk '{print $$2}') != $(CONTROLLER_TOOLS_VERSION) ]]; then \ + rm -f $(LOCALBIN)/controller-gen && GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION); \ + fi .PHONY: envtest envtest: $(ENVTEST) ## Download envtest-setup locally if necessary. From 40ccffb2de68e6e2d457b0a45b3ea26e32e463ce Mon Sep 17 00:00:00 2001 From: matthew-richerson <82597529+matthew-richerson@users.noreply.github.com> Date: Wed, 2 Aug 2023 10:42:51 -0500 Subject: [PATCH 2/5] github-46: Use ResourceError when returning errors (#119) * github-46: Use ResourceError when returning errors This commit uses the new ResourceError struct embedded in the status section of the DWS/nnf-sos resources. When returning an error, use the NewResourceError() call to return a ResourceError and fill it in with the correct information. This allows the end user and WLM to make informed decisions about what to do when there's an error. 
Signed-off-by: Matt Richerson * review comments Signed-off-by: Matt Richerson * re-vendor Signed-off-by: Matt Richerson --------- Signed-off-by: Matt Richerson --- controllers/datamovement_controller.go | 70 ++++---- go.mod | 4 +- go.sum | 8 +- .../dws/api/v1alpha2/clientmount_types.go | 2 + .../api/v1alpha2/directivebreakdown_types.go | 1 + .../persistentstorageinstance_types.go | 2 + .../dws/api/v1alpha2/resource_error.go | 167 +++++++++++++++--- .../dws/api/v1alpha2/servers_types.go | 10 ++ .../dws/api/v1alpha2/workflow_types.go | 25 +-- .../dws/api/v1alpha2/zz_generated.deepcopy.go | 1 + .../nnf-sos/api/v1alpha1/nnf_access_types.go | 3 +- .../api/v1alpha1/nnf_datamovement_types.go | 5 +- .../api/v1alpha1/nnf_node_storage_types.go | 6 +- .../nnf-sos/api/v1alpha1/nnf_storage_types.go | 7 +- .../api/v1alpha1/nnfcontainerprofile_types.go | 17 +- .../v1alpha1/nnfcontainerprofile_webhook.go | 75 ++++++-- .../api/v1alpha1/nnfstorageprofile_webhook.go | 16 +- .../nnf-sos/api/v1alpha1/workflow_error.go | 105 ----------- .../api/v1alpha1/zz_generated.deepcopy.go | 25 +-- .../bases/nnf.cray.hpe.com_nnfaccesses.yaml | 24 ++- ...nnf.cray.hpe.com_nnfcontainerprofiles.yaml | 20 +++ .../nnf.cray.hpe.com_nnfdatamovements.yaml | 32 ++++ .../nnf.cray.hpe.com_nnfnodestorages.yaml | 94 ++-------- .../bases/nnf.cray.hpe.com_nnfstorages.yaml | 33 +++- vendor/modules.txt | 44 ++++- 25 files changed, 481 insertions(+), 315 deletions(-) delete mode 100644 vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/workflow_error.go diff --git a/controllers/datamovement_controller.go b/controllers/datamovement_controller.go index d001d542..b21cbb98 100644 --- a/controllers/datamovement_controller.go +++ b/controllers/datamovement_controller.go @@ -51,6 +51,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/yaml" + dwsv1alpha2 "github.com/HewlettPackard/dws/api/v1alpha2" dmv1alpha1 "github.com/NearNodeFlash/nnf-dm/api/v1alpha1" 
"github.com/NearNodeFlash/nnf-dm/controllers/metrics" nnfv1alpha1 "github.com/NearNodeFlash/nnf-sos/api/v1alpha1" @@ -135,7 +136,7 @@ func (i *invalidError) Unwrap() error { return i.err } // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. -func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) { log := log.FromContext(ctx) metrics.NnfDmDataMovementReconcilesTotal.Inc() @@ -145,6 +146,24 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, client.IgnoreNotFound(err) } + defer func() { + if err != nil { + resourceError, ok := err.(*dwsv1alpha2.ResourceErrorInfo) + if ok { + if resourceError.Severity != dwsv1alpha2.SeverityMinor { + dm.Status.State = nnfv1alpha1.DataMovementConditionTypeFinished + dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonInvalid + } + } + dm.Status.SetResourceErrorAndLog(err, log) + dm.Status.Message = err.Error() + + if updateErr := r.Status().Update(ctx, dm); updateErr != nil { + err = updateErr + } + } + }() + if !dm.GetDeletionTimestamp().IsZero() { if err := r.cancel(ctx, dm); err != nil { @@ -181,7 +200,7 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request // Handle cancellation if dm.Spec.Cancel { if err := r.cancel(ctx, dm); err != nil { - return ctrl.Result{}, err + return ctrl.Result{}, dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("Unable to cancel data movement") } return ctrl.Result{}, nil @@ -203,32 +222,14 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request log.Info("Restarting", "restarts", dm.Status.Restarts) } - // Handle invalid errors that can occur when setting up the data movement - // resource. 
An invalid error is unrecoverable. - handleInvalidError := func(err error) error { - if errors.Is(err, &invalidError{}) { - dm.Status.State = nnfv1alpha1.DataMovementConditionTypeFinished - dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonInvalid - dm.Status.Message = err.Error() - - if err := r.Status().Update(ctx, dm); err != nil { - return err - } - - return nil - } - - return err - } - nodes, err := r.getStorageNodeNames(ctx, dm) if err != nil { - return ctrl.Result{}, handleInvalidError(err) + return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not get storage nodes for data movement").WithError(err).WithMajor() } hosts, err := r.getWorkerHostnames(ctx, nodes) if err != nil { - return ctrl.Result{}, handleInvalidError(err) + return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not get worker nodes for data movement").WithError(err).WithMajor() } // Expand the context with cancel and store it in the map so the cancel function can be used in @@ -240,16 +241,19 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request }) // Get DM Config map - configMap := &corev1.ConfigMap{} - if err := r.Get(ctx, types.NamespacedName{Name: configMapName, Namespace: configMapNamespace}, configMap); err != nil { - log.Info("Config map not found - requeueing", "name", configMapName, "namespace", configMapNamespace) - return ctrl.Result{}, handleInvalidError(err) + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: configMapNamespace, + }, + } + if err := r.Get(ctx, client.ObjectKeyFromObject(configMap), configMap); err != nil { + return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not get data movement config map: %v", client.ObjectKeyFromObject(configMap)).WithError(err).WithMajor() } cfg := dmConfig{} if err := yaml.Unmarshal([]byte(configMap.Data[configMapKeyData]), &cfg); err != nil { - log.Error(err, "error reading config map data") - return ctrl.Result{}, 
handleInvalidError(err) + return ctrl.Result{}, dwsv1alpha2.NewResourceError("invalid data for config map: %v", client.ObjectKeyFromObject(configMap)).WithError(err).WithFatal() } log.Info("Using config map", "config", cfg) @@ -259,15 +263,13 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request // Ensure profile exists profile, found := cfg.Profiles[configMapKeyProfileDefault] if !found { - return ctrl.Result{}, handleInvalidError(fmt.Errorf( - "'%s' profile not found in config map", configMapKeyProfileDefault)) + return ctrl.Result{}, dwsv1alpha2.NewResourceError("").WithUserMessage("'%s' profile not found in config map: %v", configMapKeyProfileDefault, client.ObjectKeyFromObject(configMap)).WithUser().WithFatal() } log.Info("Using profile", "name", configMapKeyProfileDefault, "profile", profile) cmdArgs, mpiHostfile, err := buildDMCommand(ctx, profile, hosts, dm) if err != nil { - log.Error(err, "error building DM command") - return ctrl.Result{}, handleInvalidError(err) + return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not create data movement command").WithError(err).WithMajor() } if len(mpiHostfile) > 0 { log.Info("MPI Hostfile preview", "first line", peekMpiHostfile(mpiHostfile)) @@ -403,9 +405,9 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request log.Error(err, "Data movement operation cancelled", "output", combinedOutBuf.String()) dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonCancelled } else if err != nil { - log.Error(err, "Data movement operation failed", "output", combinedOutBuf.String()) dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonFailed - dm.Status.Message = fmt.Sprintf("%s: %s", err.Error(), combinedOutBuf.String()) + resourceErr := dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("data movement operation failed: %s", combinedOutBuf.String()).WithFatal() + dm.Status.SetResourceErrorAndLog(resourceErr, log) } else { log.Info("Data movement 
operation completed", "cmdStatus", cmdStatus) diff --git a/go.mod b/go.mod index 53fe752e..11b0b200 100644 --- a/go.mod +++ b/go.mod @@ -3,9 +3,9 @@ module github.com/NearNodeFlash/nnf-dm go 1.19 require ( - github.com/HewlettPackard/dws v0.0.1-0.20230613201835-73abc41bd83c + github.com/HewlettPackard/dws v0.0.1-0.20230802152955-11a333f31153 github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20230613180840-6178f2b04900 - github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230613203338-ea5b2f78692b + github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230802153426-7b17a96bf2de github.com/onsi/ginkgo/v2 v2.9.1 github.com/onsi/gomega v1.27.3 github.com/prometheus/client_golang v1.14.0 diff --git a/go.sum b/go.sum index 9198cbaf..71e3a37c 100644 --- a/go.sum +++ b/go.sum @@ -1,14 +1,14 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/HewlettPackard/dws v0.0.1-0.20230613201835-73abc41bd83c h1:atwVAI9Gslf501a4ADo/nkJol141DgF8YR4AiMtj4E8= -github.com/HewlettPackard/dws v0.0.1-0.20230613201835-73abc41bd83c/go.mod h1:YvNzcgAPmwhl/YQj6dMwsB9OpwbI5bp/41kINfFiXX8= +github.com/HewlettPackard/dws v0.0.1-0.20230802152955-11a333f31153 h1:9vMjataXTnCwXEGwxu0dQrOLUW5ujoJiTWAUTb8k50w= +github.com/HewlettPackard/dws v0.0.1-0.20230802152955-11a333f31153/go.mod h1:YvNzcgAPmwhl/YQj6dMwsB9OpwbI5bp/41kINfFiXX8= github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20230613180840-6178f2b04900 h1:jOrP2H+D5amgHIONcucYS3/kJm6QfmqAG23Ke7elunI= github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20230613180840-6178f2b04900/go.mod h1:O71nfDnuK7MZZYAW9kaOFTMo48nmDlaYnzISXEPsKSw= github.com/NearNodeFlash/nnf-ec v0.0.0-20230526161255-cfb2d89b35d7 h1:y4E3b/Ta6sqv+huYQXYKZmPCMWMZtG2kV8/qgTIpzFI= github.com/NearNodeFlash/nnf-ec v0.0.0-20230526161255-cfb2d89b35d7/go.mod 
h1:11Ol46sAWdqlj3WmIFTzKO+UxQX3lvWBqpe6yaiMEIg= -github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230613203338-ea5b2f78692b h1:UKYwKExv3AwHLwEBKHHMDuVq3Kv9Vn2b5vcUluOe8bs= -github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230613203338-ea5b2f78692b/go.mod h1:ROE7mG1W7t1APwH9gfRwDIIQqtBP04VcYVHlKcYA1P0= +github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230802153426-7b17a96bf2de h1:HLjf2NO/e+U5Qc2bUif6/ta0HbFwAMfBMy8hQBeX2fc= +github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230802153426-7b17a96bf2de/go.mod h1:ZqhqjoQO4sn3B5aPt4XwdS6ZpkEUtH8Eki7e2AaRprA= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= diff --git a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/clientmount_types.go b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/clientmount_types.go index 6380e23d..e4f375df 100644 --- a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/clientmount_types.go +++ b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/clientmount_types.go @@ -189,6 +189,8 @@ type ClientMountStatus struct { //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" +//+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // ClientMount is the Schema for the clientmounts API type ClientMount struct { diff --git a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/directivebreakdown_types.go b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/directivebreakdown_types.go index 19301de0..df5c95bc 100644 --- a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/directivebreakdown_types.go +++ b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/directivebreakdown_types.go @@ -190,6 +190,7 @@ type DirectiveBreakdownStatus struct { 
//+kubebuilder:storageversion //+kubebuilder:subresource:status //+kubebuilder:printcolumn:name="READY",type="boolean",JSONPath=".status.ready",description="True if allocation sets have been generated" +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" //+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // DirectiveBreakdown is the Schema for the directivebreakdown API diff --git a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/persistentstorageinstance_types.go b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/persistentstorageinstance_types.go index 34e26f20..90e196b5 100644 --- a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/persistentstorageinstance_types.go +++ b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/persistentstorageinstance_types.go @@ -91,6 +91,8 @@ type PersistentStorageInstanceStatus struct { //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" +//+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // PersistentStorageInstance is the Schema for the Persistentstorageinstances API type PersistentStorageInstance struct { diff --git a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/resource_error.go b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/resource_error.go index 29dd3084..49ba6aa8 100644 --- a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/resource_error.go +++ b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/resource_error.go @@ -1,5 +1,5 @@ /* - * Copyright 2022 Hewlett Packard Enterprise Development LP + * Copyright 2022-2023 Hewlett Packard Enterprise Development LP * Other additional copyright holders may be indicated within. 
* * The entirety of this work is licensed under the Apache License, @@ -19,6 +19,42 @@ package v1alpha2 +import ( + "fmt" + "strings" + + "github.com/go-logr/logr" +) + +type ResourceErrorSeverity string +type ResourceErrorType string + +const ( + // Minor errors are very likely to eventually succeed (e.g., errors caused by a stale cache) + // The WLM doesn't see these errors directly. The workflow stays in the DriverWait state, and + // the error string is put in workflow.Status.Message. + SeverityMinor ResourceErrorSeverity = "Minor" + + // Major errors may or may not succeed. These are transient errors that could be persistent + // due to an underlying problem (e.g., errors from OS calls) + SeverityMajor ResourceErrorSeverity = "Major" + + // Fatal errors will never succeed. This is for situations where we can guarantee that retrying + // will not fix the error (e.g., a DW directive that is not valid) + SeverityFatal ResourceErrorSeverity = "Fatal" +) + +const ( + // Internal errors are due to an error in the DWS/driver code + TypeInternal ResourceErrorType = "Internal" + + // WLM errors are due to an error with the input from the WLM + TypeWLM ResourceErrorType = "WLM" + + // User errors are due to an error with the input from a user + TypeUser ResourceErrorType = "User" +) + type ResourceErrorInfo struct { // Optional user facing message if the error is relevant to an end user UserMessage string `json:"userMessage,omitempty"` @@ -26,8 +62,14 @@ type ResourceErrorInfo struct { // Internal debug message for the error DebugMessage string `json:"debugMessage"` - // Indication if the error is likely recoverable or not - Recoverable bool `json:"recoverable"` + // Internal or user error + // +kubebuilder:validation:Enum=Internal;User + Type ResourceErrorType `json:"type"` + + // Indication of how severe the error is. Minor will likely succeed, Major may + // succeed, and Fatal will never succeed. 
+ // +kubebuilder:validation:Enum=Minor;Major;Fatal + Severity ResourceErrorSeverity `json:"severity"` } type ResourceError struct { @@ -35,54 +77,127 @@ type ResourceError struct { Error *ResourceErrorInfo `json:"error,omitempty"` } -func NewResourceError(message string, err error) *ResourceErrorInfo { - resourceError := &ResourceErrorInfo{ - Recoverable: true, +func NewResourceError(format string, a ...any) *ResourceErrorInfo { + return &ResourceErrorInfo{ + Type: TypeInternal, + Severity: SeverityMinor, + DebugMessage: fmt.Sprintf(format, a...), } +} - if err != nil { - // If the error provided is already a ResourceError, use it and concatenate - // the debug messages - _, ok := err.(*ResourceErrorInfo) - if ok { - resourceError = err.(*ResourceErrorInfo) - } +// A resource error can have an optional user message that is displayed in the workflow.Status.Message +// field. The user message of the lowest level error is all that's displayed. +func (e *ResourceErrorInfo) WithUserMessage(format string, a ...any) *ResourceErrorInfo { + // Only set the user message if it's empty. This prevents upper layers + // from overriding a user message set by a lower layer + if e.UserMessage == "" { + e.UserMessage = fmt.Sprintf(format, a...) 
+ } - if message == "" { - message = err.Error() + return e +} + +func (e *ResourceErrorInfo) WithError(err error) *ResourceErrorInfo { + if err == nil { + return e + } + + // Concatenate the parent and child debug messages + debugMessageList := []string{} + if e.DebugMessage != "" { + debugMessageList = append(debugMessageList, e.DebugMessage) + } + + childError, ok := err.(*ResourceErrorInfo) + if ok { + // Inherit the severity and the user message if the child error is a ResourceError + e.Severity = childError.Severity + e.UserMessage = childError.UserMessage + e.Type = childError.Type + + // If the child resource error doesn't have a debug message, use the user message instead + if childError.DebugMessage == "" { + debugMessageList = append(debugMessageList, childError.UserMessage) } else { - message = message + ": " + err.Error() + debugMessageList = append(debugMessageList, childError.DebugMessage) } + } else { + debugMessageList = append(debugMessageList, err.Error()) } - resourceError.DebugMessage = message + e.DebugMessage = strings.Join(debugMessageList, ": ") - return resourceError + return e } func (e *ResourceErrorInfo) WithFatal() *ResourceErrorInfo { - e.Recoverable = false + e.Severity = SeverityFatal return e } -func (e *ResourceErrorInfo) WithUserMessage(message string) *ResourceErrorInfo { - // Only set the user message if it's empty. 
This prevents upper layers - // from overriding a user message set by a lower layer - if e.UserMessage == "" { - e.UserMessage = message +func (e *ResourceErrorInfo) WithMajor() *ResourceErrorInfo { + if e.Severity != SeverityFatal { + e.Severity = SeverityMajor + } + return e +} + +func (e *ResourceErrorInfo) WithMinor() *ResourceErrorInfo { + if e.Severity != SeverityFatal && e.Severity != SeverityMajor { + e.Severity = SeverityMinor } + return e +} + +func (e *ResourceErrorInfo) WithInternal() *ResourceErrorInfo { + e.Type = TypeInternal + return e +} + +func (e *ResourceErrorInfo) WithWLM() *ResourceErrorInfo { + e.Type = TypeWLM + return e +} +func (e *ResourceErrorInfo) WithUser() *ResourceErrorInfo { + e.Type = TypeUser return e } func (e *ResourceErrorInfo) Error() string { - return e.DebugMessage + message := "" + if e.DebugMessage == "" { + message = e.UserMessage + } else { + message = e.DebugMessage + } + return fmt.Sprintf("%s error: %s", strings.ToLower(string(e.Type)), message) +} + +func (e *ResourceError) SetResourceErrorAndLog(err error, log logr.Logger) { + e.SetResourceError(err) + if err == nil { + return + } + + childError, ok := err.(*ResourceErrorInfo) + if ok { + if childError.Severity == SeverityFatal { + log.Error(err, "Fatal error") + return + } + + log.Info("Recoverable Error", "Severity", childError.Severity, "Message", err.Error()) + return + } + + log.Info("Recoverable Error", "Message", err.Error()) } func (e *ResourceError) SetResourceError(err error) { if err == nil { e.Error = nil } else { - e.Error = NewResourceError("", err) + e.Error = NewResourceError("").WithError(err) } } diff --git a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/servers_types.go b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/servers_types.go index f3f2cc92..3ce6bc6f 100644 --- a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/servers_types.go +++ b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/servers_types.go @@ -20,6 +20,8 @@ package 
v1alpha2 import ( + "github.com/HewlettPackard/dws/utils/updater" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -79,12 +81,16 @@ type ServersStatus struct { Ready bool `json:"ready"` LastUpdate *metav1.MicroTime `json:"lastUpdate,omitempty"` AllocationSets []ServersStatusAllocationSet `json:"allocationSets,omitempty"` + + // Error information + ResourceError `json:",inline"` } //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status //+kubebuilder:printcolumn:name="READY",type="boolean",JSONPath=".status.ready",description="True if allocation sets have been generated" +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" //+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // Servers is the Schema for the servers API @@ -96,6 +102,10 @@ type Servers struct { Status ServersStatus `json:"status,omitempty"` } +func (s *Servers) GetStatus() updater.Status[*ServersStatus] { + return &s.Status +} + //+kubebuilder:object:root=true // ServersList contains a list of Servers diff --git a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/workflow_types.go b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/workflow_types.go index 25a5b338..3d189f18 100644 --- a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/workflow_types.go +++ b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/workflow_types.go @@ -92,12 +92,13 @@ func (s WorkflowState) after(t WorkflowState) bool { // Strings associated with workflow statuses const ( - StatusPending = "Pending" - StatusQueued = "Queued" - StatusRunning = "Running" - StatusCompleted = "Completed" - StatusError = "Error" - StatusDriverWait = "DriverWait" + StatusPending = "Pending" + StatusQueued = "Queued" + StatusRunning = "Running" + StatusCompleted = "Completed" + StatusTransientCondition = "TransientCondition" + StatusError = "Error" + StatusDriverWait = "DriverWait" ) // 
WorkflowSpec defines the desired state of Workflow @@ -147,8 +148,8 @@ type WorkflowDriverStatus struct { // User readable reason. // For the CDS driver, this could be the state of the underlying - // data movement request: Pending, Queued, Running, Completed or Error - // +kubebuilder:validation:Enum=Pending;Queued;Running;Completed;Error;DriverWait + // data movement request + // +kubebuilder:validation:Enum=Pending;Queued;Running;Completed;TransientCondition;Error;DriverWait Status string `json:"status,omitempty"` // Message provides additional details on the current status of the resource @@ -172,8 +173,12 @@ type WorkflowStatus struct { // Indicates whether State has been reached. Ready bool `json:"ready"` - // User readable reason and status message - // +kubebuilder:validation:Enum=Completed;DriverWait;Error + // User readable reason and status message. + // - Completed: The workflow has reached the state in workflow.Status.State. + // - DriverWait: The underlying drivers are currently running. + // - TransientCondition: A driver has encountered an error that might be recoverable. + // - Error: A driver has encountered an error that will not recover. 
+ // +kubebuilder:validation:Enum=Completed;DriverWait;TransientCondition;Error Status string `json:"status,omitempty"` // Message provides additional details on the current status of the resource diff --git a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/zz_generated.deepcopy.go b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/zz_generated.deepcopy.go index 2a727815..96f72a97 100644 --- a/vendor/github.com/HewlettPackard/dws/api/v1alpha2/zz_generated.deepcopy.go +++ b/vendor/github.com/HewlettPackard/dws/api/v1alpha2/zz_generated.deepcopy.go @@ -891,6 +891,7 @@ func (in *ServersStatus) DeepCopyInto(out *ServersStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + in.ResourceError.DeepCopyInto(&out.ResourceError) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServersStatus. diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_access_types.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_access_types.go index fa63b98b..103a2294 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_access_types.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_access_types.go @@ -1,5 +1,5 @@ /* - * Copyright 2021, 2022 Hewlett Packard Enterprise Development LP + * Copyright 2021-2023 Hewlett Packard Enterprise Development LP * Other additional copyright holders may be indicated within. 
* * The entirety of this work is licensed under the Apache License, @@ -83,6 +83,7 @@ type NnfAccessStatus struct { //+kubebuilder:printcolumn:name="DESIREDSTATE",type="string",JSONPath=".spec.desiredState",description="The desired state" //+kubebuilder:printcolumn:name="STATE",type="string",JSONPath=".status.state",description="The current state" //+kubebuilder:printcolumn:name="READY",type="boolean",JSONPath=".status.ready",description="Whether the state has been achieved" +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" //+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // NnfAccess is the Schema for the nnfaccesses API diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go index 09cea91d..e5fc744e 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_datamovement_types.go @@ -1,5 +1,5 @@ /* - * Copyright 2021, 2022 Hewlett Packard Enterprise Development LP + * Copyright 2021-2023 Hewlett Packard Enterprise Development LP * Other additional copyright holders may be indicated within. * * The entirety of this work is licensed under the Apache License, @@ -153,6 +153,8 @@ type NnfDataMovementStatus struct { // as it executes. The command status is polled at a certain frequency to avoid excessive // updates to the Data Movement resource. CommandStatus *NnfDataMovementCommandStatus `json:"commandStatus,omitempty"` + + dwsv1alpha2.ResourceError `json:",inline"` } // Types describing the various data movement status conditions. 
@@ -175,6 +177,7 @@ const ( //+kubebuilder:subresource:status //+kubebuilder:printcolumn:name="STATE",type="string",JSONPath=".status.state",description="Current state" //+kubebuilder:printcolumn:name="STATUS",type="string",JSONPath=".status.status",description="Status of current state" +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" //+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // NnfDataMovement is the Schema for the datamovements API diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_node_storage_types.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_node_storage_types.go index 13d30567..579d8dca 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_node_storage_types.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_node_storage_types.go @@ -1,5 +1,5 @@ /* - * Copyright 2021, 2022 Hewlett Packard Enterprise Development LP + * Copyright 2021-2023 Hewlett Packard Enterprise Development LP * Other additional copyright holders may be indicated within. * * The entirety of this work is licensed under the Apache License, @@ -169,8 +169,6 @@ type NnfNodeStorageAllocationStatus struct { StoragePool NnfResourceStatus `json:"storagePool,omitempty"` FileSystem NnfResourceStatus `json:"fileSystem,omitempty"` - - Conditions []metav1.Condition `json:"conditions,omitempty"` } // LustreStorageStatus describes the Lustre target created here. 
@@ -196,6 +194,8 @@ func (ns *NnfNodeStorage) GetStatus() updater.Status[*NnfNodeStorageStatus] { } //+kubebuilder:object:root=true +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" +//+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // NnfNodeStorageList contains a list of NNF Nodes type NnfNodeStorageList struct { diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_storage_types.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_storage_types.go index 0ce009a1..e3f57917 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_storage_types.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnf_storage_types.go @@ -1,5 +1,5 @@ /* - * Copyright 2021, 2022 Hewlett Packard Enterprise Development LP + * Copyright 2021-2023 Hewlett Packard Enterprise Development LP * Other additional copyright holders may be indicated within. * * The entirety of this work is licensed under the Apache License, @@ -106,9 +106,6 @@ type NnfStorageAllocationSetStatus struct { // Health reflects the health of this allocation set Health NnfResourceHealthType `json:"health,omitempty"` - // Error is the human readable error string - Error string `json:"error,omitempty"` - // AllocationCount is the total number of allocations that currently // exist AllocationCount int `json:"allocationCount"` @@ -135,6 +132,8 @@ type NnfStorageStatus struct { //+kubebuilder:object:root=true //+kubebuilder:subresource:status +//+kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" +//+kubebuilder:printcolumn:name="ERROR",type="string",JSONPath=".status.error.severity" // NnfStorage is the Schema for the storages API type NnfStorage struct { diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_types.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_types.go index 7803f792..b193e2aa 100644 --- 
a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_types.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_types.go @@ -52,6 +52,16 @@ type NnfContainerProfileData struct { // +kubebuilder:default:=6 RetryLimit int32 `json:"retryLimit"` + // UserID specifies the user ID that is allowed to use this profile. If this + // is specified, only Workflows that have a matching user ID can select + // this profile. + UserID *uint32 `json:"userID,omitempty"` + + // GroupID specifies the group ID that is allowed to use this profile. If this + // is specified, only Workflows that have a matching group ID can select + // this profile. + GroupID *uint32 `json:"groupID,omitempty"` + // Spec to define the containers created from container profile. This is used for non-MPI // containers. // Either this or MPISpec must be provided, but not both. @@ -73,6 +83,11 @@ type NnfContainerProfileStorage struct { // the user not supplying this filesystem in the #DW directives //+kubebuilder:default:=false Optional bool `json:"optional"` + + // For DW_GLOBAL_ (global lustre) storages, the access mode must match what is configured in + // the LustreFilesystem resource for the namespace. Defaults to `ReadWriteMany` for global + // lustre, otherwise empty. 
+ PVCMode corev1.PersistentVolumeAccessMode `json:"pvcMode,omitempty"` } // +kubebuilder:object:root=true @@ -82,7 +97,7 @@ type NnfContainerProfile struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Data NnfContainerProfileData `json:"data,omitempty"` + Data NnfContainerProfileData `json:"data"` } // +kubebuilder:object:root=true diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_webhook.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_webhook.go index 73e786a7..1e69b150 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_webhook.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfcontainerprofile_webhook.go @@ -21,7 +21,9 @@ package v1alpha1 import ( "fmt" + "os" "reflect" + "strings" "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" "k8s.io/apimachinery/pkg/runtime" @@ -48,6 +50,55 @@ var _ webhook.Validator = &NnfContainerProfile{} func (r *NnfContainerProfile) ValidateCreate() error { nnfcontainerprofilelog.Info("validate create", "name", r.Name) + // If it's not pinned, then it's being made available for users to select + // and it must be in the correct namespace. 
+ profileNamespace := os.Getenv("NNF_CONTAINER_PROFILE_NAMESPACE") + if !r.Data.Pinned && r.GetNamespace() != profileNamespace { + err := fmt.Errorf("incorrect namespace for profile that is intended to be selected by users; the namespace should be '%s'", profileNamespace) + nnfstorageprofilelog.Error(err, "invalid") + return err + } + + if err := r.validateContent(); err != nil { + nnfcontainerprofilelog.Error(err, "invalid NnfContainerProfile resource") + return err + } + + return nil +} + +// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type +func (r *NnfContainerProfile) ValidateUpdate(old runtime.Object) error { + nnfcontainerprofilelog.Info("validate update", "name", r.Name) + + obj := old.(*NnfContainerProfile) + + if obj.Data.Pinned != r.Data.Pinned { + err := fmt.Errorf("the pinned flag is immutable") + nnfcontainerprofilelog.Error(err, "invalid") + return err + } + + if obj.Data.Pinned { + // Allow metadata to be updated, for things like finalizers, + // ownerReferences, and labels, but do not allow Data to be + // updated. 
+ if !reflect.DeepEqual(r.Data, obj.Data) { + err := fmt.Errorf("update on pinned resource not allowed") + nnfcontainerprofilelog.Error(err, "invalid") + return err + } + } + + if err := r.validateContent(); err != nil { + nnfcontainerprofilelog.Error(err, "invalid NnfContainerProfile resource") + return err + } + + return nil +} + +func (r *NnfContainerProfile) validateContent() error { mpiJob := r.Data.MPISpec != nil nonmpiJob := r.Data.Spec != nil @@ -89,22 +140,12 @@ func (r *NnfContainerProfile) ValidateCreate() error { } } - return nil -} - -// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type -func (r *NnfContainerProfile) ValidateUpdate(old runtime.Object) error { - nnfcontainerprofilelog.Info("validate update", "name", r.Name) - - obj := old.(*NnfContainerProfile) - if obj.Data.Pinned { - // Allow metadata to be updated, for things like finalizers, - // ownerReferences, and labels, but do not allow Data to be - // updated. - if !reflect.DeepEqual(r.Data, obj.Data) { - err := fmt.Errorf("update on pinned resource not allowed") - nnfcontainerprofilelog.Error(err, "invalid") - return err + // Ensure only DW_GLOBAL_ storages have PVCMode + for _, storage := range r.Data.Storages { + if !strings.HasPrefix(storage.Name, "DW_GLOBAL_") { + if storage.PVCMode != "" { + return fmt.Errorf("PVCMode is only supported for global lustre storages (DW_GLOBAL_)") + } } } @@ -114,7 +155,5 @@ func (r *NnfContainerProfile) ValidateUpdate(old runtime.Object) error { // ValidateDelete implements webhook.Validator so a webhook will be registered for the type func (r *NnfContainerProfile) ValidateDelete() error { nnfcontainerprofilelog.Info("validate delete", "name", r.Name) - - // TODO(user): fill in your validation logic upon object deletion. 
return nil } diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfstorageprofile_webhook.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfstorageprofile_webhook.go index 23906e2c..1ecec597 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfstorageprofile_webhook.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/nnfstorageprofile_webhook.go @@ -1,5 +1,5 @@ /* - * Copyright 2022 Hewlett Packard Enterprise Development LP + * Copyright 2022-2023 Hewlett Packard Enterprise Development LP * Other additional copyright holders may be indicated within. * * The entirety of this work is licensed under the Apache License, @@ -21,6 +21,7 @@ package v1alpha1 import ( "fmt" + "os" "reflect" "k8s.io/apimachinery/pkg/runtime" @@ -47,6 +48,14 @@ var _ webhook.Validator = &NnfStorageProfile{} func (r *NnfStorageProfile) ValidateCreate() error { nnfstorageprofilelog.V(1).Info("validate create", "name", r.Name) + // If it's not pinned, then it's being made available for users to select + // and it must be in the correct namespace. 
+ profileNamespace := os.Getenv("NNF_STORAGE_PROFILE_NAMESPACE") + if !r.Data.Pinned && r.GetNamespace() != profileNamespace { + err := fmt.Errorf("incorrect namespace for profile that is intended to be selected by users; the namespace should be '%s'", profileNamespace) + nnfstorageprofilelog.Error(err, "invalid") + return err + } if err := r.validateContent(); err != nil { nnfstorageprofilelog.Error(err, "invalid NnfStorageProfile resource") return err @@ -59,6 +68,11 @@ func (r *NnfStorageProfile) ValidateUpdate(old runtime.Object) error { nnfstorageprofilelog.V(1).Info("validate update", "name", r.Name) obj := old.(*NnfStorageProfile) + if obj.Data.Pinned != r.Data.Pinned { + err := fmt.Errorf("the pinned flag is immutable") + nnfcontainerprofilelog.Error(err, "invalid") + return err + } if obj.Data.Pinned { // Allow metadata to be updated, for things like finalizers, // ownerReferences, and labels, but do not allow Data to be diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/workflow_error.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/workflow_error.go deleted file mode 100644 index e3602e19..00000000 --- a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/workflow_error.go +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2022 Hewlett Packard Enterprise Development LP - * Other additional copyright holders may be indicated within. - * - * The entirety of this work is licensed under the Apache License, - * Version 2.0 (the "License"); you may not use this file except - * in compliance with the License. - * - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package v1alpha1 - -import ( - "fmt" - - dwsv1alpha2 "github.com/HewlettPackard/dws/api/v1alpha2" -) - -// +kubebuilder:object:generate=false -type WorkflowError struct { - message string - recoverable bool - err error -} - -func NewWorkflowError(message string) *WorkflowError { - return &WorkflowError{ - message: message, - recoverable: true, - } -} - -func NewWorkflowErrorf(format string, a ...any) *WorkflowError { - return NewWorkflowError(fmt.Sprintf(format, a...)) -} - -func (e *WorkflowError) GetMessage() string { - return e.message -} - -func (e *WorkflowError) GetRecoverable() bool { - return e.recoverable -} - -func (e *WorkflowError) GetError() error { - return e.err -} - -func (e *WorkflowError) Error() string { - if e.err == nil { - return e.message - } - - return e.message + ": " + e.err.Error() -} - -func (e *WorkflowError) Unwrap() error { - return e.err -} - -func (e *WorkflowError) Inject(driverStatus *dwsv1alpha2.WorkflowDriverStatus) { - driverStatus.Message = e.GetMessage() - if e.GetRecoverable() { - driverStatus.Status = dwsv1alpha2.StatusRunning - } else { - driverStatus.Status = dwsv1alpha2.StatusError - } - - if e.Unwrap() != nil { - driverStatus.Error = e.Unwrap().Error() - } else { - driverStatus.Error = e.Error() - } -} - -func (e *WorkflowError) WithFatal() *WorkflowError { - e.recoverable = false - return e -} - -func (e *WorkflowError) WithError(err error) *WorkflowError { - // if the error is already a WorkflowError, then return it unmodified - workflowError, ok := err.(*WorkflowError) - if ok { - return workflowError - } - - resourceError, ok := err.(*dwsv1alpha2.ResourceErrorInfo) - if ok { - e.message = resourceError.UserMessage - e.recoverable = resourceError.Recoverable - } - - e.err = err - return e -} diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/zz_generated.deepcopy.go b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/zz_generated.deepcopy.go index bb75f789..9321abb9 100644 --- 
a/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/zz_generated.deepcopy.go +++ b/vendor/github.com/NearNodeFlash/nnf-sos/api/v1alpha1/zz_generated.deepcopy.go @@ -26,8 +26,7 @@ package v1alpha1 import ( "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" ) @@ -207,9 +206,19 @@ func (in *NnfContainerProfileData) DeepCopyInto(out *NnfContainerProfileData) { *out = make([]NnfContainerProfileStorage, len(*in)) copy(*out, *in) } + if in.UserID != nil { + in, out := &in.UserID, &out.UserID + *out = new(uint32) + **out = **in + } + if in.GroupID != nil { + in, out := &in.GroupID, &out.GroupID + *out = new(uint32) + **out = **in + } if in.Spec != nil { in, out := &in.Spec, &out.Spec - *out = new(corev1.PodSpec) + *out = new(v1.PodSpec) (*in).DeepCopyInto(*out) } if in.MPISpec != nil { @@ -434,6 +443,7 @@ func (in *NnfDataMovementStatus) DeepCopyInto(out *NnfDataMovementStatus) { *out = new(NnfDataMovementCommandStatus) (*in).DeepCopyInto(*out) } + in.ResourceError.DeepCopyInto(&out.ResourceError) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NnfDataMovementStatus. @@ -735,13 +745,6 @@ func (in *NnfNodeStorageAllocationStatus) DeepCopyInto(out *NnfNodeStorageAlloca out.FileShare = in.FileShare out.StoragePool = in.StoragePool out.FileSystem = in.FileSystem - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]v1.Condition, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NnfNodeStorageAllocationStatus. 
@@ -896,7 +899,7 @@ func (in *NnfPortManagerAllocationStatus) DeepCopyInto(out *NnfPortManagerAlloca *out = *in if in.Requester != nil { in, out := &in.Requester, &out.Requester - *out = new(corev1.ObjectReference) + *out = new(v1.ObjectReference) **out = **in } if in.Ports != nil { diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfaccesses.yaml b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfaccesses.yaml index ff2278cc..b3e32f2e 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfaccesses.yaml +++ b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfaccesses.yaml @@ -27,6 +27,9 @@ spec: jsonPath: .status.ready name: READY type: boolean + - jsonPath: .status.error.severity + name: ERROR + type: string - jsonPath: .metadata.creationTimestamp name: AGE type: date @@ -189,17 +192,28 @@ spec: debugMessage: description: Internal debug message for the error type: string - recoverable: - description: Indication if the error is likely recoverable or - not - type: boolean + severity: + description: Indication of how severe the error is. Minor will + likely succeed, Major may succeed, and Fatal will never succeed. 
+ enum: + - Minor + - Major + - Fatal + type: string + type: + description: Internal or user error + enum: + - Internal + - User + type: string userMessage: description: Optional user facing message if the error is relevant to an end user type: string required: - debugMessage - - recoverable + - severity + - type type: object ready: description: Ready signifies whether status.state has been achieved diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfcontainerprofiles.yaml b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfcontainerprofiles.yaml index 2182ba5d..1ab85f2d 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfcontainerprofiles.yaml +++ b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfcontainerprofiles.yaml @@ -28,6 +28,12 @@ spec: data: description: NnfContainerProfileSpec defines the desired state of NnfContainerProfile properties: + groupID: + description: GroupID specifies the group ID that is allowed to use + this profile. If this is specified, only Workflows that have a matching + group ID can select this profile. + format: int32 + type: integer mpiSpec: description: MPIJobSpec to define the containers created from container profile. This is used for MPI containers via MPIJobs. See mpi-operator @@ -15730,11 +15736,23 @@ spec: to be mounted, but can be ignored by the user not supplying this filesystem in the #DW directives' type: boolean + pvcMode: + description: For DW_GLOBAL_ (global lustre) storages, the access + mode must match what is configured in the LustreFilesystem + resource for the namespace. Defaults to `ReadWriteMany` for + global lustre, otherwise empty. + type: string required: - name - optional type: object type: array + userID: + description: UserID specifies the user ID that is allowed to use this + profile. If this is specified, only Workflows that have a matching + user ID can select this profile. 
+ format: int32 + type: integer required: - retryLimit type: object @@ -15745,6 +15763,8 @@ spec: type: string metadata: type: object + required: + - data type: object served: true storage: true diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml index 30ea8fa0..96661f84 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml +++ b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfdatamovements.yaml @@ -23,6 +23,9 @@ spec: jsonPath: .status.status name: STATUS type: string + - jsonPath: .status.error.severity + name: ERROR + type: string - jsonPath: .metadata.creationTimestamp name: AGE type: date @@ -232,6 +235,35 @@ spec: operation ended. format: date-time type: string + error: + description: Error information + properties: + debugMessage: + description: Internal debug message for the error + type: string + severity: + description: Indication of how severe the error is. Minor will + likely succeed, Major may succeed, and Fatal will never succeed. + enum: + - Minor + - Major + - Fatal + type: string + type: + description: Internal or user error + enum: + - Internal + - User + type: string + userMessage: + description: Optional user facing message if the error is relevant + to an end user + type: string + required: + - debugMessage + - severity + - type + type: object message: description: Message contains any text that explains the Status. 
If Data Movement failed or storeStdout is enabled, this will contain diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfnodestorages.yaml b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfnodestorages.yaml index b9807ce6..60365f77 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfnodestorages.yaml +++ b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfnodestorages.yaml @@ -154,79 +154,6 @@ spec: storage requirements (i.e. block size / stripe size). format: int64 type: integer - conditions: - items: - description: "Condition contains details for one aspect of - the current state of this API Resource. --- This struct - is intended for direct use as an array at the field path - .status.conditions. For example, \n type FooStatus struct{ - // Represents the observations of a foo's current state. - // Known .status.conditions.type are: \"Available\", \"Progressing\", - and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge - // +listType=map // +listMapKey=type Conditions []metav1.Condition - `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" - protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields - }" - properties: - lastTransitionTime: - description: lastTransitionTime is the last time the condition - transitioned from one status to another. This should - be when the underlying condition changed. If that is - not known, then using the time when the API field changed - is acceptable. - format: date-time - type: string - message: - description: message is a human readable message indicating - details about the transition. This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: observedGeneration represents the .metadata.generation - that the condition was set based upon. 
For instance, - if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration - is 9, the condition is out of date with respect to the - current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: reason contains a programmatic identifier - indicating the reason for the condition's last transition. - Producers of specific condition types may define expected - values and meanings for this field, and whether the - values are considered a guaranteed API. The value should - be a CamelCase string. This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, - Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - --- Many .condition.type values are consistent across - resources like Available, but because arbitrary conditions - can be useful (see .node.status.conditions), the ability - to deconflict is important. The regex it matches is - (dns1123SubdomainFmt/)?(qualifiedNameFmt) - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array creationTime: description: Represents the time when the storage was created by the controller It is represented in RFC3339 form and is @@ -364,17 +291,28 @@ spec: debugMessage: description: Internal debug message for the error type: string - recoverable: - description: Indication if the error is likely recoverable or - not - type: boolean + severity: + description: Indication of how severe the error is. Minor will + likely succeed, Major may succeed, and Fatal will never succeed. 
+ enum: + - Minor + - Major + - Fatal + type: string + type: + description: Internal or user error + enum: + - Internal + - User + type: string userMessage: description: Optional user facing message if the error is relevant to an end user type: string required: - debugMessage - - recoverable + - severity + - type type: object lustreStorage: description: LustreStorageStatus describes the Lustre targets created diff --git a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfstorages.yaml b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfstorages.yaml index 1c584ac7..07dd1b98 100644 --- a/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfstorages.yaml +++ b/vendor/github.com/NearNodeFlash/nnf-sos/config/crd/bases/nnf.cray.hpe.com_nnfstorages.yaml @@ -14,7 +14,14 @@ spec: singular: nnfstorage scope: Namespaced versions: - - name: v1alpha1 + - additionalPrinterColumns: + - jsonPath: .metadata.creationTimestamp + name: AGE + type: date + - jsonPath: .status.error.severity + name: ERROR + type: string + name: v1alpha1 schema: openAPIV3Schema: description: NnfStorage is the Schema for the storages API @@ -144,9 +151,6 @@ spec: description: AllocationCount is the total number of allocations that currently exist type: integer - error: - description: Error is the human readable error string - type: string health: description: Health reflects the health of this allocation set type: string @@ -163,17 +167,28 @@ spec: debugMessage: description: Internal debug message for the error type: string - recoverable: - description: Indication if the error is likely recoverable or - not - type: boolean + severity: + description: Indication of how severe the error is. Minor will + likely succeed, Major may succeed, and Fatal will never succeed. 
+ enum: + - Minor + - Major + - Fatal + type: string + type: + description: Internal or user error + enum: + - Internal + - User + type: string userMessage: description: Optional user facing message if the error is relevant to an end user type: string required: - debugMessage - - recoverable + - severity + - type type: object mgsNode: description: MgsNode is the NID of the MGS. diff --git a/vendor/modules.txt b/vendor/modules.txt index 9b580f1e..6587917f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,8 +1,10 @@ -# github.com/HewlettPackard/dws v0.0.1-0.20230613201835-73abc41bd83c +# github.com/HewlettPackard/dws v0.0.1-0.20230802152955-11a333f31153 ## explicit; go 1.19 github.com/HewlettPackard/dws/api/v1alpha2 github.com/HewlettPackard/dws/utils/dwdparse github.com/HewlettPackard/dws/utils/updater +# github.com/HewlettPackard/structex v1.0.4 +## explicit; go 1.14 # github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20230613180840-6178f2b04900 ## explicit; go 1.19 github.com/NearNodeFlash/lustre-fs-operator/api/v1beta1 @@ -10,19 +12,27 @@ github.com/NearNodeFlash/lustre-fs-operator/config/crd/bases # github.com/NearNodeFlash/nnf-ec v0.0.0-20230526161255-cfb2d89b35d7 ## explicit; go 1.18 github.com/NearNodeFlash/nnf-ec/pkg/rfsf/pkg/models -# github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230613203338-ea5b2f78692b +# github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230802153426-7b17a96bf2de ## explicit; go 1.19 github.com/NearNodeFlash/nnf-sos/api/v1alpha1 github.com/NearNodeFlash/nnf-sos/config/crd/bases # github.com/beorn7/perks v1.0.1 ## explicit; go 1.11 github.com/beorn7/perks/quantile +# github.com/cespare/xxhash v1.1.0 +## explicit # github.com/cespare/xxhash/v2 v2.2.0 ## explicit; go 1.11 github.com/cespare/xxhash/v2 # github.com/davecgh/go-spew v1.1.1 ## explicit github.com/davecgh/go-spew/spew +# github.com/dgraph-io/badger/v3 v3.2103.5 +## explicit; go 1.12 +# github.com/dgraph-io/ristretto v0.1.1 +## explicit; go 1.12 +# 
github.com/dustin/go-humanize v1.0.1 +## explicit; go 1.16 # github.com/emicklei/go-restful/v3 v3.10.1 ## explicit; go 1.13 github.com/emicklei/go-restful/v3 @@ -57,6 +67,8 @@ github.com/go-task/slim-sprig ## explicit; go 1.15 github.com/gogo/protobuf/proto github.com/gogo/protobuf/sortkeys +# github.com/golang/glog v1.1.0 +## explicit; go 1.18 # github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da ## explicit github.com/golang/groupcache/lru @@ -68,6 +80,10 @@ github.com/golang/protobuf/ptypes github.com/golang/protobuf/ptypes/any github.com/golang/protobuf/ptypes/duration github.com/golang/protobuf/ptypes/timestamp +# github.com/golang/snappy v0.0.4 +## explicit +# github.com/google/flatbuffers v23.1.21+incompatible +## explicit # github.com/google/gnostic v0.6.9 ## explicit; go 1.12 github.com/google/gnostic/compiler @@ -92,6 +108,8 @@ github.com/google/pprof/profile # github.com/google/uuid v1.3.0 ## explicit github.com/google/uuid +# github.com/gorilla/mux v1.8.0 +## explicit; go 1.12 # github.com/imdario/mergo v0.3.13 ## explicit; go 1.13 github.com/imdario/mergo @@ -101,6 +119,8 @@ github.com/josharian/intern # github.com/json-iterator/go v1.1.12 ## explicit; go 1.12 github.com/json-iterator/go +# github.com/klauspost/compress v1.16.0 +## explicit; go 1.18 # github.com/kr/pretty v0.3.0 ## explicit; go 1.12 # github.com/kubeflow/common v0.4.6 @@ -114,9 +134,13 @@ github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1 github.com/mailru/easyjson/buffer github.com/mailru/easyjson/jlexer github.com/mailru/easyjson/jwriter +# github.com/mattn/go-isatty v0.0.17 +## explicit; go 1.15 # github.com/matttproud/golang_protobuf_extensions v1.0.4 ## explicit; go 1.9 github.com/matttproud/golang_protobuf_extensions/pbutil +# github.com/moby/sys/mountinfo v0.6.2 +## explicit; go 1.16 # github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd ## explicit github.com/modern-go/concurrent @@ -165,6 +189,8 @@ github.com/onsi/gomega/types # 
github.com/pkg/errors v0.9.1 ## explicit github.com/pkg/errors +# github.com/pkg/term v1.1.0 +## explicit; go 1.14 # github.com/prometheus/client_golang v1.14.0 ## explicit; go 1.17 github.com/prometheus/client_golang/prometheus @@ -186,12 +212,24 @@ github.com/prometheus/procfs/internal/fs github.com/prometheus/procfs/internal/util # github.com/rogpeppe/go-internal v1.8.0 ## explicit; go 1.11 +# github.com/rs/cors v1.8.3 +## explicit; go 1.13 +# github.com/senseyeio/duration v0.0.0-20180430131211-7c2a214ada46 +## explicit +# github.com/sigurn/crc8 v0.0.0-20220107193325-2243fe600f9f +## explicit; go 1.17 +# github.com/sirupsen/logrus v1.9.0 +## explicit; go 1.13 # github.com/spf13/pflag v1.0.5 ## explicit; go 1.12 github.com/spf13/pflag # github.com/takama/daemon v1.0.0 ## explicit; go 1.14 github.com/takama/daemon +# go.chromium.org/luci v0.0.0-20230227223707-c4460eb434d8 +## explicit; go 1.19 +# go.opencensus.io v0.24.0 +## explicit; go 1.13 # go.uber.org/atomic v1.11.0 ## explicit; go 1.18 go.uber.org/atomic @@ -664,6 +702,8 @@ k8s.io/kube-openapi/pkg/schemamutation k8s.io/kube-openapi/pkg/spec3 k8s.io/kube-openapi/pkg/util/proto k8s.io/kube-openapi/pkg/validation/spec +# k8s.io/mount-utils v0.27.1 +## explicit; go 1.20 # k8s.io/utils v0.0.0-20230505201702-9f6742963106 ## explicit; go 1.18 k8s.io/utils/buffer From e1d7fa948fa79e6c8d27a4b51ebbadb1ad8324c5 Mon Sep 17 00:00:00 2001 From: Blake Devcich Date: Tue, 15 Aug 2023 09:55:10 -0500 Subject: [PATCH 3/5] Add dm.Status.Message back on DM failure The Copy Offload API relies on this so the output can be relayed back to the user. 
Signed-off-by: Blake Devcich --- controllers/datamovement_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/controllers/datamovement_controller.go b/controllers/datamovement_controller.go index b21cbb98..3bbc0082 100644 --- a/controllers/datamovement_controller.go +++ b/controllers/datamovement_controller.go @@ -406,6 +406,7 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonCancelled } else if err != nil { dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonFailed + dm.Status.Message = fmt.Sprintf("%s: %s", err.Error(), combinedOutBuf.String()) resourceErr := dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("data movement operation failed: %s", combinedOutBuf.String()).WithFatal() dm.Status.SetResourceErrorAndLog(resourceErr, log) } else { From 42560b097c8206a8f93c108be1369a03312e8cb8 Mon Sep 17 00:00:00 2001 From: Blake Devcich Date: Tue, 22 Aug 2023 10:03:11 -0500 Subject: [PATCH 4/5] Remove install of openmpi/bash (already installed) These packages are already provided via nnf-mfu. 
--- Dockerfile | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index e2f62c68..93ab416a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,13 +58,6 @@ ENTRYPOINT [ "make", "test" ] ############################################################################### FROM $NNFMFU_TAG_BASE:$NNFMFU_VERSION -RUN apt update - -RUN apt install -y openmpi-bin - -# TODO Remove this -RUN apt install -y bash - # The following lines are from the mpiFileUtils (nnf-mfu) Dockerfile; # do not change them unless you know what it is you are doing RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \ From bd4b5ba0f6b374b609621a90526a278387f5855a Mon Sep 17 00:00:00 2001 From: Matt Richerson Date: Thu, 7 Sep 2023 15:22:18 -0500 Subject: [PATCH 5/5] update nnf-mfu version Signed-off-by: Matt Richerson --- Dockerfile | 2 +- Makefile | 2 +- config/manager/kustomization.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 93ab416a..20ad2c33 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ # These ARGs must be before the first FROM. This allows them to be valid for # use in FROM instructions. ARG NNFMFU_TAG_BASE=ghcr.io/nearnodeflash/nnf-mfu -ARG NNFMFU_VERSION=master +ARG NNFMFU_VERSION=v0.0.2 # Build the manager binary FROM golang:1.19-alpine as builder diff --git a/Makefile b/Makefile index aac8e8ec..ba9b874c 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ IMAGE_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-dm # The NNF-MFU container image to use in NNFContainerProfile resources. 
NNFMFU_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-mfu -NNFMFU_VERSION ?= master +NNFMFU_VERSION ?= v0.0.2 DOCKER_BUILDARGS=--build-arg NNFMFU_TAG_BASE=$(NNFMFU_TAG_BASE) --build-arg NNFMFU_VERSION=$(NNFMFU_VERSION) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 873b0a4b..fb5d30c6 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -19,4 +19,4 @@ images: newTag: 0.0.1 - name: nnf-mfu newName: ghcr.io/nearnodeflash/nnf-mfu - newTag: master + newTag: 0.0.2