Skip to content

Commit

Permalink
Merge pull request #122 from NearNodeFlash/release-v0.0.5
Browse files (browse the repository at this point in the history)
Release v0.0.5
  • Loading branch information
matthew-richerson authored Sep 8, 2023
2 parents 9be4640 + bd4b5ba commit 6e79eba
Show file tree
Hide file tree
Showing 28 changed files with 499 additions and 330 deletions.
9 changes: 1 addition & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# These ARGs must be before the first FROM. This allows them to be valid for
# use in FROM instructions.
ARG NNFMFU_TAG_BASE=ghcr.io/nearnodeflash/nnf-mfu
ARG NNFMFU_VERSION=master
ARG NNFMFU_VERSION=v0.0.2

# Build the manager binary
FROM golang:1.19-alpine as builder
Expand Down Expand Up @@ -58,13 +58,6 @@ ENTRYPOINT [ "make", "test" ]
###############################################################################
FROM $NNFMFU_TAG_BASE:$NNFMFU_VERSION

RUN apt update

RUN apt install -y openmpi-bin

# TODO Remove this
RUN apt install -y bash

# The following lines are from the mpiFileUtils (nnf-mfu) Dockerfile;
# do not change them unless you know what you are doing
RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \
Expand Down
23 changes: 16 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ IMAGE_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-dm

# The NNF-MFU container image to use in NNFContainerProfile resources.
NNFMFU_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-mfu
NNFMFU_VERSION ?= master
NNFMFU_VERSION ?= v0.0.2

DOCKER_BUILDARGS=--build-arg NNFMFU_TAG_BASE=$(NNFMFU_TAG_BASE) --build-arg NNFMFU_VERSION=$(NNFMFU_VERSION)

Expand Down Expand Up @@ -184,6 +184,12 @@ LOCALBIN ?= $(shell pwd)/bin
$(LOCALBIN):
mkdir -p $(LOCALBIN)

# clean-bin: remove the local tool directory $(LOCALBIN), if it exists.
# Owner write permission is added recursively before the rm -rf; presumably
# some of the installed content is read-only and would otherwise block
# removal -- confirm.
# NOTE(review): `[[ ... ]]` is a bash construct, so this recipe relies on the
# Makefile's SHELL being bash rather than plain /bin/sh -- confirm.
.PHONY: clean-bin
clean-bin:
if [[ -d $(LOCALBIN) ]]; then \
chmod -R u+w $(LOCALBIN) && rm -rf $(LOCALBIN); \
fi

## Tool Binaries
KUSTOMIZE ?= $(LOCALBIN)/kustomize
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
Expand All @@ -195,14 +201,17 @@ CONTROLLER_TOOLS_VERSION ?= v0.12.0

KUSTOMIZE_INSTALL_SCRIPT ?= "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh"
.PHONY: kustomize
kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
$(KUSTOMIZE): $(LOCALBIN)
test -s $(LOCALBIN)/kustomize || { curl -s $(KUSTOMIZE_INSTALL_SCRIPT) | bash -s -- $(subst v,,$(KUSTOMIZE_VERSION)) $(LOCALBIN); }
kustomize: $(LOCALBIN) ## Download kustomize locally if necessary.
if [[ ! -s $(LOCALBIN)/kustomize || $$($(LOCALBIN)/kustomize version | awk '{print $$1}' | awk -F/ '{print $$2}') != $(KUSTOMIZE_VERSION) ]]; then \
rm -f $(LOCALBIN)/kustomize && \
{ curl -s $(KUSTOMIZE_INSTALL_SCRIPT) | bash -s -- $(subst v,,$(KUSTOMIZE_VERSION)) $(LOCALBIN); }; \
fi

.PHONY: controller-gen
controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary.
$(CONTROLLER_GEN): $(LOCALBIN)
test -s $(LOCALBIN)/controller-gen || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION)
controller-gen: $(LOCALBIN) ## Download controller-gen locally if necessary.
if [[ ! -s $(LOCALBIN)/controller-gen || $$($(LOCALBIN)/controller-gen --version | awk '{print $$2}') != $(CONTROLLER_TOOLS_VERSION) ]]; then \
rm -f $(LOCALBIN)/controller-gen && GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION); \
fi

.PHONY: envtest
envtest: $(ENVTEST) ## Download envtest-setup locally if necessary.
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ images:
newTag: 0.0.1
- name: nnf-mfu
newName: ghcr.io/nearnodeflash/nnf-mfu
newTag: master
newTag: 0.0.2
69 changes: 36 additions & 33 deletions controllers/datamovement_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/yaml"

dwsv1alpha2 "github.com/HewlettPackard/dws/api/v1alpha2"
dmv1alpha1 "github.com/NearNodeFlash/nnf-dm/api/v1alpha1"
"github.com/NearNodeFlash/nnf-dm/controllers/metrics"
nnfv1alpha1 "github.com/NearNodeFlash/nnf-sos/api/v1alpha1"
Expand Down Expand Up @@ -135,7 +136,7 @@ func (i *invalidError) Unwrap() error { return i.err }

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res ctrl.Result, err error) {
log := log.FromContext(ctx)

metrics.NnfDmDataMovementReconcilesTotal.Inc()
Expand All @@ -145,6 +146,24 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
return ctrl.Result{}, client.IgnoreNotFound(err)
}

defer func() {
if err != nil {
resourceError, ok := err.(*dwsv1alpha2.ResourceErrorInfo)
if ok {
if resourceError.Severity != dwsv1alpha2.SeverityMinor {
dm.Status.State = nnfv1alpha1.DataMovementConditionTypeFinished
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonInvalid
}
}
dm.Status.SetResourceErrorAndLog(err, log)
dm.Status.Message = err.Error()

if updateErr := r.Status().Update(ctx, dm); updateErr != nil {
err = updateErr
}
}
}()

if !dm.GetDeletionTimestamp().IsZero() {

if err := r.cancel(ctx, dm); err != nil {
Expand Down Expand Up @@ -181,7 +200,7 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
// Handle cancellation
if dm.Spec.Cancel {
if err := r.cancel(ctx, dm); err != nil {
return ctrl.Result{}, err
return ctrl.Result{}, dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("Unable to cancel data movement")
}

return ctrl.Result{}, nil
Expand All @@ -203,32 +222,14 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
log.Info("Restarting", "restarts", dm.Status.Restarts)
}

// Handle invalid errors that can occur when setting up the data movement
// resource. An invalid error is unrecoverable.
handleInvalidError := func(err error) error {
if errors.Is(err, &invalidError{}) {
dm.Status.State = nnfv1alpha1.DataMovementConditionTypeFinished
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonInvalid
dm.Status.Message = err.Error()

if err := r.Status().Update(ctx, dm); err != nil {
return err
}

return nil
}

return err
}

nodes, err := r.getStorageNodeNames(ctx, dm)
if err != nil {
return ctrl.Result{}, handleInvalidError(err)
return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not get storage nodes for data movement").WithError(err).WithMajor()
}

hosts, err := r.getWorkerHostnames(ctx, nodes)
if err != nil {
return ctrl.Result{}, handleInvalidError(err)
return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not get worker nodes for data movement").WithError(err).WithMajor()
}

// Expand the context with cancel and store it in the map so the cancel function can be used in
Expand All @@ -240,16 +241,19 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
})

// Get DM Config map
configMap := &corev1.ConfigMap{}
if err := r.Get(ctx, types.NamespacedName{Name: configMapName, Namespace: configMapNamespace}, configMap); err != nil {
log.Info("Config map not found - requeueing", "name", configMapName, "namespace", configMapNamespace)
return ctrl.Result{}, handleInvalidError(err)
configMap := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: configMapName,
Namespace: configMapNamespace,
},
}
if err := r.Get(ctx, client.ObjectKeyFromObject(configMap), configMap); err != nil {
return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not get data movement config map: %v", client.ObjectKeyFromObject(configMap)).WithError(err).WithMajor()
}

cfg := dmConfig{}
if err := yaml.Unmarshal([]byte(configMap.Data[configMapKeyData]), &cfg); err != nil {
log.Error(err, "error reading config map data")
return ctrl.Result{}, handleInvalidError(err)
return ctrl.Result{}, dwsv1alpha2.NewResourceError("invalid data for config map: %v", client.ObjectKeyFromObject(configMap)).WithError(err).WithFatal()
}
log.Info("Using config map", "config", cfg)

Expand All @@ -259,15 +263,13 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
// Ensure profile exists
profile, found := cfg.Profiles[configMapKeyProfileDefault]
if !found {
return ctrl.Result{}, handleInvalidError(fmt.Errorf(
"'%s' profile not found in config map", configMapKeyProfileDefault))
return ctrl.Result{}, dwsv1alpha2.NewResourceError("").WithUserMessage("'%s' profile not found in config map: %v", configMapKeyProfileDefault, client.ObjectKeyFromObject(configMap)).WithUser().WithFatal()
}
log.Info("Using profile", "name", configMapKeyProfileDefault, "profile", profile)

cmdArgs, mpiHostfile, err := buildDMCommand(ctx, profile, hosts, dm)
if err != nil {
log.Error(err, "error building DM command")
return ctrl.Result{}, handleInvalidError(err)
return ctrl.Result{}, dwsv1alpha2.NewResourceError("could not create data movement command").WithError(err).WithMajor()
}
if len(mpiHostfile) > 0 {
log.Info("MPI Hostfile preview", "first line", peekMpiHostfile(mpiHostfile))
Expand Down Expand Up @@ -403,9 +405,10 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
log.Error(err, "Data movement operation cancelled", "output", combinedOutBuf.String())
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonCancelled
} else if err != nil {
log.Error(err, "Data movement operation failed", "output", combinedOutBuf.String())
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonFailed
dm.Status.Message = fmt.Sprintf("%s: %s", err.Error(), combinedOutBuf.String())
resourceErr := dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("data movement operation failed: %s", combinedOutBuf.String()).WithFatal()
dm.Status.SetResourceErrorAndLog(resourceErr, log)
} else {
log.Info("Data movement operation completed", "cmdStatus", cmdStatus)

Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ module github.com/NearNodeFlash/nnf-dm
go 1.19

require (
github.com/HewlettPackard/dws v0.0.1-0.20230613201835-73abc41bd83c
github.com/HewlettPackard/dws v0.0.1-0.20230802152955-11a333f31153
github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20230613180840-6178f2b04900
github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230613203338-ea5b2f78692b
github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230802153426-7b17a96bf2de
github.com/onsi/ginkgo/v2 v2.9.1
github.com/onsi/gomega v1.27.3
github.com/prometheus/client_golang v1.14.0
Expand Down
8 changes: 4 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/HewlettPackard/dws v0.0.1-0.20230613201835-73abc41bd83c h1:atwVAI9Gslf501a4ADo/nkJol141DgF8YR4AiMtj4E8=
github.com/HewlettPackard/dws v0.0.1-0.20230613201835-73abc41bd83c/go.mod h1:YvNzcgAPmwhl/YQj6dMwsB9OpwbI5bp/41kINfFiXX8=
github.com/HewlettPackard/dws v0.0.1-0.20230802152955-11a333f31153 h1:9vMjataXTnCwXEGwxu0dQrOLUW5ujoJiTWAUTb8k50w=
github.com/HewlettPackard/dws v0.0.1-0.20230802152955-11a333f31153/go.mod h1:YvNzcgAPmwhl/YQj6dMwsB9OpwbI5bp/41kINfFiXX8=
github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20230613180840-6178f2b04900 h1:jOrP2H+D5amgHIONcucYS3/kJm6QfmqAG23Ke7elunI=
github.com/NearNodeFlash/lustre-fs-operator v0.0.1-0.20230613180840-6178f2b04900/go.mod h1:O71nfDnuK7MZZYAW9kaOFTMo48nmDlaYnzISXEPsKSw=
github.com/NearNodeFlash/nnf-ec v0.0.0-20230526161255-cfb2d89b35d7 h1:y4E3b/Ta6sqv+huYQXYKZmPCMWMZtG2kV8/qgTIpzFI=
github.com/NearNodeFlash/nnf-ec v0.0.0-20230526161255-cfb2d89b35d7/go.mod h1:11Ol46sAWdqlj3WmIFTzKO+UxQX3lvWBqpe6yaiMEIg=
github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230613203338-ea5b2f78692b h1:UKYwKExv3AwHLwEBKHHMDuVq3Kv9Vn2b5vcUluOe8bs=
github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230613203338-ea5b2f78692b/go.mod h1:ROE7mG1W7t1APwH9gfRwDIIQqtBP04VcYVHlKcYA1P0=
github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230802153426-7b17a96bf2de h1:HLjf2NO/e+U5Qc2bUif6/ta0HbFwAMfBMy8hQBeX2fc=
github.com/NearNodeFlash/nnf-sos v0.0.1-0.20230802153426-7b17a96bf2de/go.mod h1:ZqhqjoQO4sn3B5aPt4XwdS6ZpkEUtH8Eki7e2AaRprA=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 6e79eba

Please sign in to comment.