Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce a copy-offload daemon in a user container #238

Merged
merged 1 commit into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@
"--socket=/tmp/nnf.sock"
]
},
{
"name": "Debug Copy-Offload server",
"type": "go",
"request": "launch",
"mode": "debug",
"program": "${workspaceFolder}/daemons/copy-offload/cmd",
"env": {"NNF_NODE_NAME": "rabbit-node-1"},
"args" :[
"--port=8080",
"--mock",
]
},
{
"name": "Launch file",
"type": "go",
Expand Down
49 changes: 45 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@ ARG NNFMFU_TAG_BASE=ghcr.io/nearnodeflash/nnf-mfu
ARG NNFMFU_VERSION=master

# Build the manager binary
FROM golang:1.21-alpine AS builder

ARG TARGETARCH
ARG TARGETOS
FROM golang:1.21-alpine AS builder_setup

WORKDIR /workspace
# Copy the Go Modules manifests
Expand All @@ -40,16 +37,39 @@ COPY cmd/ cmd/
COPY api/ api/
COPY internal/ internal/

###############################################################################
FROM builder_setup AS builder

ARG TARGETARCH
ARG TARGETOS

# Build
# the GOARCH has a default value to allow the binary be built according to the host where the command
# was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO
# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore,
# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go

###############################################################################
FROM builder_setup AS copy_offload_builder

ARG TARGETARCH
ARG TARGETOS

COPY daemons/copy-offload/ daemons/copy-offload/

# Build
# the GOARCH has a default value to allow the binary be built according to the host where the command
# was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO
# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore,
# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o nnf-copy-offload daemons/copy-offload/cmd/main.go

###############################################################################
FROM builder AS testing

COPY daemons/copy-offload/ daemons/copy-offload/

WORKDIR /workspace

COPY config/ config/
Expand Down Expand Up @@ -102,3 +122,24 @@ ENTRYPOINT ["/manager"]
ARG NNFMFU_TAG_BASE
ARG NNFMFU_VERSION
LABEL nnf-mfu="$NNFMFU_TAG_BASE-debug:$NNFMFU_VERSION"

###############################################################################
FROM $NNFMFU_TAG_BASE:$NNFMFU_VERSION AS copy_offload_production

# The following lines are from the mpiFileUtils (nnf-mfu) Dockerfile;
# do not change them unless you know what it is you are doing
RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config

# Copy the executable and execute
WORKDIR /
COPY --from=copy_offload_builder /workspace/nnf-copy-offload .

ENTRYPOINT ["/nnf-copy-offload"]

# Make it easy to figure out which nnf-mfu was used.
# docker inspect --format='{{json .Config.Labels}}' image:tag
ARG NNFMFU_TAG_BASE
ARG NNFMFU_VERSION
LABEL nnf-mfu="$NNFMFU_TAG_BASE:$NNFMFU_VERSION"

36 changes: 32 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ BUNDLE_METADATA_OPTS ?= $(BUNDLE_CHANNELS) $(BUNDLE_DEFAULT_CHANNEL)
# For example, running 'make bundle-build bundle-push catalog-build catalog-push' will build and push both
# cray.hpe.com/nnf-dm-bundle:$VERSION and cray.hpe.com/nnf-dm-catalog:$VERSION.
IMAGE_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-dm
IMAGE_COPY_OFFLOAD_TAG_BASE = $(IMAGE_TAG_BASE)-copy-offload
IMAGE_TARGET ?= production
IMAGE_COPY_OFFLOAD_TARGET = copy_offload_$(IMAGE_TARGET)

# The NNF-MFU container image to use in NNFContainerProfile resources.
NNFMFU_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-mfu
Expand Down Expand Up @@ -136,7 +138,7 @@ container-unit-test: .version ## Run tests inside a container image
${CONTAINER_TOOL} build -f Dockerfile --label $(IMAGE_TAG_BASE)-$@:$(VERSION)-$@ -t $(IMAGE_TAG_BASE)-$@:$(VERSION) --target testing $(CONTAINER_BUILDARGS) .
${CONTAINER_TOOL} run --rm -t --name $@-nnf-dm $(IMAGE_TAG_BASE)-$@:$(VERSION)

##@ Build
##@ Build the controller manager.
RPM_PLATFORM ?= linux/amd64
RPM_TARGET ?= x86_64
.PHONY: build-daemon-rpm
Expand Down Expand Up @@ -168,6 +170,35 @@ build: generate fmt vet ## Build manager binary.
run: manifests generate fmt vet ## Run a controller from your host.
CGO_ENABLED=0 go run cmd/main.go

##@ Build the copy-offload container's daemon outside the container.
.PHONY: build-copy-offload-local
build-copy-offload-local: GOOS = $(shell go env GOOS)
build-copy-offload-local: GOARCH = $(shell go env GOARCH)
build-copy-offload-local: build-copy-offload-with

.PHONY: build-copy-offload-with
build-copy-offload-with: $(LOCALBIN)
build-copy-offload-with: fmt vet ## Build standalone copy-offload daemon
CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(GOARCH) go build -o bin/nnf-copy-offload daemons/copy-offload/cmd/main.go

.PHONY: build-copy-offload-docker-local
build-copy-offload-docker-local: GOARCH = $(shell go env GOARCH)
build-copy-offload-docker-local: build-copy-offload-docker-with

.PHONY: build-copy-offload-docker-amd64
build-copy-offload-docker-amd64: GOARCH = amd64
build-copy-offload-docker-amd64: build-copy-offload-docker-with

.PHONY: build-copy-offload-docker-with
build-copy-offload-docker-with: VERSION ?= $(shell cat .version)
build-copy-offload-docker-with: .version ## Build docker image with the manager.
${CONTAINER_TOOL} build --platform linux/$(GOARCH) --target $(IMAGE_COPY_OFFLOAD_TARGET) -t $(IMAGE_COPY_OFFLOAD_TAG_BASE):$(VERSION) $(CONTAINER_BUILDARGS) .

.PHONY: kind-push-copy-offload
kind-push-copy-offload: VERSION ?= $(shell cat .version)
kind-push-copy-offload: .version
kind load docker-image $(IMAGE_COPY_OFFLOAD_TAG_BASE):$(VERSION)

# If you wish to build the manager image targeting other platforms you can use the --platform flag.
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
Expand Down Expand Up @@ -215,9 +246,6 @@ docker-buildx-debug: docker-buildx

kind-push: VERSION ?= $(shell cat .version)
kind-push: .version ## Push docker image to kind
# Nnf-dm is used on all nodes. It's on the management node for the
# nnf-dm-controller-manager deployment, and on the rabbit nodes for
# the nnf-dm-rsyncnode daemonset that is created by that deployment.
kind load docker-image $(IMAGE_TAG_BASE):$(VERSION)

kind-push-debug: VERSION ?= $(shell cat .version)
Expand Down
2 changes: 1 addition & 1 deletion crd-bumper.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# A comma-separated list of directories where more Go code can be found, beyond
# the usual cmd/, api/, internal/ that kubebuilder would put in place. The Go
# files in these dirs will be bumped to the new hub version.
extra_go_dirs: daemons/compute/server/servers
extra_go_dirs: daemons/compute/server,daemons/copy-offload

# A comma-separated list of directories of Kustomize config files that have
# references to the API and that must be updated to the new hub version so
Expand Down
4 changes: 2 additions & 2 deletions daemons/compute/server/servers/server_default.go
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,7 @@ func (s *defaultServer) findRabbitRelativeSource(ctx context.Context, computeMou
}

if len(clientMounts.Items) == 0 {
return "", fmt.Errorf("no client mounts found on node '%s'", s.namespace)
return "", fmt.Errorf("no client mounts found for node '%s'", s.namespace)
}

for _, clientMount := range clientMounts.Items {
Expand Down Expand Up @@ -899,7 +899,7 @@ func (s *defaultServer) findComputeMountInfo(ctx context.Context, req *pb.DataMo
}

if len(clientMounts.Items) == 0 {
return nil, nil, fmt.Errorf("no client mounts found on node '%s'", s.name)
return nil, nil, fmt.Errorf("no client mounts found for node '%s'", s.name)
}

for _, clientMount := range clientMounts.Items {
Expand Down
126 changes: 126 additions & 0 deletions daemons/copy-offload/cmd/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright 2024 Hewlett Packard Enterprise Development LP
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"errors"
"flag"
"fmt"
"log/slog"
"net/http"
"os"

"github.com/go-logr/logr"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
zapcr "sigs.k8s.io/controller-runtime/pkg/log/zap"

dwsv1alpha2 "github.com/DataWorkflowServices/dws/api/v1alpha2"
"github.com/NearNodeFlash/nnf-dm/daemons/copy-offload/pkg/driver"
userHttp "github.com/NearNodeFlash/nnf-dm/daemons/copy-offload/pkg/server"
nnfv1alpha4 "github.com/NearNodeFlash/nnf-sos/api/v1alpha4"
)

var (
scheme = runtime.NewScheme()
rabbitName string
)

func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(nnfv1alpha4.AddToScheme(scheme))
utilruntime.Must(dwsv1alpha2.AddToScheme(scheme))
}

func setupLog() logr.Logger {
encoder := zapcore.NewConsoleEncoder(zap.NewDevelopmentEncoderConfig())
zaplogger := zapcr.New(zapcr.Encoder(encoder), zapcr.UseDevMode(true))
ctrl.SetLogger(zaplogger)

// controllerruntime logger.
crLog := ctrl.Log.WithName("copy-offload")
return crLog
}

func setupClient(crLog logr.Logger) client.Client {
config := ctrl.GetConfigOrDie()

clnt, err := client.New(config, client.Options{Scheme: scheme})
if err != nil {
crLog.Error(err, "Unable to create client")
os.Exit(1)
}
return clnt
}

func clientSanity(crLog logr.Logger, clnt client.Client, rabbitName string) {
// Sanity check the client connection.
nnfNode := &nnfv1alpha4.NnfNode{}
if err := clnt.Get(context.TODO(), types.NamespacedName{Name: "nnf-nlc", Namespace: rabbitName}, nnfNode); err != nil {
crLog.Error(err, "Failed to retrieve my own NnfNode")
os.Exit(1)
}
}

func main() {
port := "8080"
mock := false

flag.StringVar(&port, "port", port, "Port for server.")
flag.BoolVar(&mock, "mock", mock, "Mock mode for tests; does not use k8s.")
flag.Parse()

rabbitName = os.Getenv("NNF_NODE_NAME")
if rabbitName == "" {
fmt.Println("Did not find NNF_NODE_NAME")
os.Exit(1)
}

crLog := setupLog()
// Make one of these for this server, and use it in all requests.
drvr := &driver.Driver{Log: crLog, RabbitName: rabbitName, Mock: mock}
if !mock {
clnt := setupClient(crLog)
clientSanity(crLog, clnt, rabbitName)
drvr.Client = clnt
}
slog.Info("Ready", "node", rabbitName, "port", port, "mock", mock)

httpHandler := &userHttp.UserHttp{Log: crLog, Drvr: drvr, Mock: mock}

http.HandleFunc("/hello", httpHandler.Hello)
http.HandleFunc("/trial", httpHandler.TrialRequest)
http.HandleFunc("/cancel/", httpHandler.CancelRequest)
http.HandleFunc("/list", httpHandler.ListRequests)

err := http.ListenAndServe(fmt.Sprintf(":%s", port), nil)
if errors.Is(err, http.ErrServerClosed) {
slog.Info("the server is closed")
} else if err != nil {
slog.Error("unable to start server", "err", err.Error())
}
}
82 changes: 82 additions & 0 deletions daemons/copy-offload/pkg/driver/dmrequest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright 2024 Hewlett Packard Enterprise Development LP
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package driver

import (
"fmt"

corev1 "k8s.io/api/core/v1"
)

// DMRequest represents the content of one http request. This has a
// one-to-one relationship with a DriverRequest object.
type DMRequest struct {
ComputeName string `json:"computeName"`

// The name and namespace of the initiating workflow
WorkflowName string `json:"workflowName"`
WorkflowNamespace string `json:"workflowNamespace"`
// Source file or directory
SourcePath string `json:"sourcePath"`
// Destination file or directory
DestinationPath string `json:"destinationPath"`
// If True, the data movement command runs `/bin/true` rather than perform actual data movement
Dryrun bool `json:"dryrun"`
// Extra options to pass to `dcp` if present in the Data Movement command.
DcpOptions string `json:"dcpOptions"`
// If true, enable server-side logging of stdout when the command is successful. Failure output is always logged.
LogStdout bool `json:"logStdout"`
// If true, store stdout in DataMovementStatusResponse.Message when the command is successful. Failure output is always contained in the message.
StoreStdout bool `json:"storeStdout"`
// The number of slots specified in the MPI hostfile. A value of 0 disables the use of slots in
// the hostfile. -1 will defer to the server side configuration.
Slots int32 `json:"slots"`
// The number of max_slots specified in the MPI hostfile. A value of 0 disables the use of
// max_slots in the hostfile. -1 will defer to the server side configuration.
MaxSlots int32 `json:"maxSlots"`
// The name of the data movement configuration profile to use. The above parameters (e.g. slots,
// logStdout) will override the settings defined by the profile. This profile must exist on the
// server otherwise the data movement operation will be invalid. Empty will default to the
// default profile.
DMProfile string `json:"dmProfile"`
// Extra options to pass to `mpirun` if present in the Data Movement command.
MpirunOptions string `json:"mpirunOptions"`
}

func (m *DMRequest) Validator() error {

if m.ComputeName == "" {
return fmt.Errorf("compute name must be supplied")
}
if m.WorkflowName == "" {
return fmt.Errorf("workflow name must be supplied")
}
if m.SourcePath == "" {
return fmt.Errorf("source path must be supplied")
}
if m.DestinationPath == "" {
return fmt.Errorf("destination path must be supplied")
}
if m.WorkflowNamespace == "" {
m.WorkflowNamespace = corev1.NamespaceDefault
}

return nil
}
Loading
Loading