Skip to content

Commit

Permalink
Introduce a copy-offload daemon in a user container (#238)
Browse files Browse the repository at this point in the history
This daemon is an http server that accepts copy-offload requests and
executes them directly. The daemon is in a container, on the rabbit,
and was placed there by a "#DW container" directive.

This commit includes a daemon that can be built locally and run outside
a container, or it can be built in a container image.
The necessary RBAC bits for config/ are coming in the next step.

Signed-off-by: Dean Roehrich <[email protected]>
  • Loading branch information
roehrich-hpe authored Dec 6, 2024
1 parent 86bb0a1 commit ab9ffbe
Show file tree
Hide file tree
Showing 12 changed files with 1,570 additions and 12 deletions.
12 changes: 12 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@
"--socket=/tmp/nnf.sock"
]
},
{
"name": "Debug Copy-Offload server",
"type": "go",
"request": "launch",
"mode": "debug",
"program": "${workspaceFolder}/daemons/copy-offload/cmd",
"env": {"NNF_NODE_NAME": "rabbit-node-1"},
"args" :[
"--port=8080",
"--mock",
]
},
{
"name": "Launch file",
"type": "go",
Expand Down
49 changes: 45 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@ ARG NNFMFU_TAG_BASE=ghcr.io/nearnodeflash/nnf-mfu
ARG NNFMFU_VERSION=master

# Build the manager binary
FROM golang:1.21-alpine AS builder

ARG TARGETARCH
ARG TARGETOS
FROM golang:1.21-alpine AS builder_setup

WORKDIR /workspace
# Copy the Go Modules manifests
Expand All @@ -40,16 +37,39 @@ COPY cmd/ cmd/
COPY api/ api/
COPY internal/ internal/

###############################################################################
FROM builder_setup AS builder

ARG TARGETARCH
ARG TARGETOS

# Build
# the GOARCH has a default value to allow the binary be built according to the host where the command
# was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO
# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore,
# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go

###############################################################################
FROM builder_setup AS copy_offload_builder

ARG TARGETARCH
ARG TARGETOS

COPY daemons/copy-offload/ daemons/copy-offload/

# Build
# the GOARCH has a default value to allow the binary be built according to the host where the command
# was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO
# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore,
# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o nnf-copy-offload daemons/copy-offload/cmd/main.go

###############################################################################
FROM builder AS testing

COPY daemons/copy-offload/ daemons/copy-offload/

WORKDIR /workspace

COPY config/ config/
Expand Down Expand Up @@ -102,3 +122,24 @@ ENTRYPOINT ["/manager"]
ARG NNFMFU_TAG_BASE
ARG NNFMFU_VERSION
LABEL nnf-mfu="$NNFMFU_TAG_BASE-debug:$NNFMFU_VERSION"

###############################################################################
FROM $NNFMFU_TAG_BASE:$NNFMFU_VERSION AS copy_offload_production

# The following lines are from the mpiFileUtils (nnf-mfu) Dockerfile;
# do not change them unless you know what it is you are doing
RUN sed -i "s/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g" /etc/ssh/ssh_config \
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config

# Copy the executable and execute
WORKDIR /
COPY --from=copy_offload_builder /workspace/nnf-copy-offload .

ENTRYPOINT ["/nnf-copy-offload"]

# Make it easy to figure out which nnf-mfu was used.
# docker inspect --format='{{json .Config.Labels}}' image:tag
ARG NNFMFU_TAG_BASE
ARG NNFMFU_VERSION
LABEL nnf-mfu="$NNFMFU_TAG_BASE:$NNFMFU_VERSION"

36 changes: 32 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ BUNDLE_METADATA_OPTS ?= $(BUNDLE_CHANNELS) $(BUNDLE_DEFAULT_CHANNEL)
# For example, running 'make bundle-build bundle-push catalog-build catalog-push' will build and push both
# cray.hpe.com/nnf-dm-bundle:$VERSION and cray.hpe.com/nnf-dm-catalog:$VERSION.
IMAGE_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-dm
IMAGE_COPY_OFFLOAD_TAG_BASE = $(IMAGE_TAG_BASE)-copy-offload
IMAGE_TARGET ?= production
IMAGE_COPY_OFFLOAD_TARGET = copy_offload_$(IMAGE_TARGET)

# The NNF-MFU container image to use in NNFContainerProfile resources.
NNFMFU_TAG_BASE ?= ghcr.io/nearnodeflash/nnf-mfu
Expand Down Expand Up @@ -136,7 +138,7 @@ container-unit-test: .version ## Run tests inside a container image
${CONTAINER_TOOL} build -f Dockerfile --label $(IMAGE_TAG_BASE)-$@:$(VERSION)-$@ -t $(IMAGE_TAG_BASE)-$@:$(VERSION) --target testing $(CONTAINER_BUILDARGS) .
${CONTAINER_TOOL} run --rm -t --name $@-nnf-dm $(IMAGE_TAG_BASE)-$@:$(VERSION)

##@ Build
##@ Build the controller manager.
RPM_PLATFORM ?= linux/amd64
RPM_TARGET ?= x86_64
.PHONY: build-daemon-rpm
Expand Down Expand Up @@ -168,6 +170,35 @@ build: generate fmt vet ## Build manager binary.
run: manifests generate fmt vet ## Run a controller from your host.
CGO_ENABLED=0 go run cmd/main.go

##@ Build the copy-offload container's daemon outside the container.
.PHONY: build-copy-offload-local
build-copy-offload-local: GOOS = $(shell go env GOOS)
build-copy-offload-local: GOARCH = $(shell go env GOARCH)
build-copy-offload-local: build-copy-offload-with

.PHONY: build-copy-offload-with
build-copy-offload-with: $(LOCALBIN)
build-copy-offload-with: fmt vet ## Build standalone copy-offload daemon
CGO_ENABLED=0 GOOS=$(GOOS) GOARCH=$(GOARCH) go build -o bin/nnf-copy-offload daemons/copy-offload/cmd/main.go

.PHONY: build-copy-offload-docker-local
build-copy-offload-docker-local: GOARCH = $(shell go env GOARCH)
build-copy-offload-docker-local: build-copy-offload-docker-with

.PHONY: build-copy-offload-docker-amd64
build-copy-offload-docker-amd64: GOARCH = amd64
build-copy-offload-docker-amd64: build-copy-offload-docker-with

.PHONY: build-copy-offload-docker-with
build-copy-offload-docker-with: VERSION ?= $(shell cat .version)
build-copy-offload-docker-with: .version ## Build docker image with the manager.
${CONTAINER_TOOL} build --platform linux/$(GOARCH) --target $(IMAGE_COPY_OFFLOAD_TARGET) -t $(IMAGE_COPY_OFFLOAD_TAG_BASE):$(VERSION) $(CONTAINER_BUILDARGS) .

.PHONY: kind-push-copy-offload
kind-push-copy-offload: VERSION ?= $(shell cat .version)
kind-push-copy-offload: .version
kind load docker-image $(IMAGE_COPY_OFFLOAD_TAG_BASE):$(VERSION)

# If you wish to build the manager image targeting other platforms you can use the --platform flag.
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
Expand Down Expand Up @@ -215,9 +246,6 @@ docker-buildx-debug: docker-buildx

kind-push: VERSION ?= $(shell cat .version)
kind-push: .version ## Push docker image to kind
# Nnf-dm is used on all nodes. It's on the management node for the
# nnf-dm-controller-manager deployment, and on the rabbit nodes for
# the nnf-dm-rsyncnode daemonset that is created by that deployment.
kind load docker-image $(IMAGE_TAG_BASE):$(VERSION)

kind-push-debug: VERSION ?= $(shell cat .version)
Expand Down
2 changes: 1 addition & 1 deletion crd-bumper.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# A comma-separated list of directories where more Go code can be found, beyond
# the usual cmd/, api/, internal/ that kubebuilder would put in place. The Go
# files in these dirs will be bumped to the new hub version.
extra_go_dirs: daemons/compute/server/servers
extra_go_dirs: daemons/compute/server,daemons/copy-offload

# A comma-separated list of directories of Kustomize config files that have
# references to the API and that must be updated to the new hub version so
Expand Down
4 changes: 2 additions & 2 deletions daemons/compute/server/servers/server_default.go
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,7 @@ func (s *defaultServer) findRabbitRelativeSource(ctx context.Context, computeMou
}

if len(clientMounts.Items) == 0 {
return "", fmt.Errorf("no client mounts found on node '%s'", s.namespace)
return "", fmt.Errorf("no client mounts found for node '%s'", s.namespace)
}

for _, clientMount := range clientMounts.Items {
Expand Down Expand Up @@ -899,7 +899,7 @@ func (s *defaultServer) findComputeMountInfo(ctx context.Context, req *pb.DataMo
}

if len(clientMounts.Items) == 0 {
return nil, nil, fmt.Errorf("no client mounts found on node '%s'", s.name)
return nil, nil, fmt.Errorf("no client mounts found for node '%s'", s.name)
}

for _, clientMount := range clientMounts.Items {
Expand Down
126 changes: 126 additions & 0 deletions daemons/copy-offload/cmd/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright 2024 Hewlett Packard Enterprise Development LP
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package main

import (
"context"
"errors"
"flag"
"fmt"
"log/slog"
"net/http"
"os"

"github.com/go-logr/logr"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
zapcr "sigs.k8s.io/controller-runtime/pkg/log/zap"

dwsv1alpha2 "github.com/DataWorkflowServices/dws/api/v1alpha2"
"github.com/NearNodeFlash/nnf-dm/daemons/copy-offload/pkg/driver"
userHttp "github.com/NearNodeFlash/nnf-dm/daemons/copy-offload/pkg/server"
nnfv1alpha4 "github.com/NearNodeFlash/nnf-sos/api/v1alpha4"
)

var (
scheme = runtime.NewScheme()
rabbitName string
)

func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(nnfv1alpha4.AddToScheme(scheme))
utilruntime.Must(dwsv1alpha2.AddToScheme(scheme))
}

func setupLog() logr.Logger {
encoder := zapcore.NewConsoleEncoder(zap.NewDevelopmentEncoderConfig())
zaplogger := zapcr.New(zapcr.Encoder(encoder), zapcr.UseDevMode(true))
ctrl.SetLogger(zaplogger)

// controllerruntime logger.
crLog := ctrl.Log.WithName("copy-offload")
return crLog
}

func setupClient(crLog logr.Logger) client.Client {
config := ctrl.GetConfigOrDie()

clnt, err := client.New(config, client.Options{Scheme: scheme})
if err != nil {
crLog.Error(err, "Unable to create client")
os.Exit(1)
}
return clnt
}

func clientSanity(crLog logr.Logger, clnt client.Client, rabbitName string) {
// Sanity check the client connection.
nnfNode := &nnfv1alpha4.NnfNode{}
if err := clnt.Get(context.TODO(), types.NamespacedName{Name: "nnf-nlc", Namespace: rabbitName}, nnfNode); err != nil {
crLog.Error(err, "Failed to retrieve my own NnfNode")
os.Exit(1)
}
}

func main() {
port := "8080"
mock := false

flag.StringVar(&port, "port", port, "Port for server.")
flag.BoolVar(&mock, "mock", mock, "Mock mode for tests; does not use k8s.")
flag.Parse()

rabbitName = os.Getenv("NNF_NODE_NAME")
if rabbitName == "" {
fmt.Println("Did not find NNF_NODE_NAME")
os.Exit(1)
}

crLog := setupLog()
// Make one of these for this server, and use it in all requests.
drvr := &driver.Driver{Log: crLog, RabbitName: rabbitName, Mock: mock}
if !mock {
clnt := setupClient(crLog)
clientSanity(crLog, clnt, rabbitName)
drvr.Client = clnt
}
slog.Info("Ready", "node", rabbitName, "port", port, "mock", mock)

httpHandler := &userHttp.UserHttp{Log: crLog, Drvr: drvr, Mock: mock}

http.HandleFunc("/hello", httpHandler.Hello)
http.HandleFunc("/trial", httpHandler.TrialRequest)
http.HandleFunc("/cancel/", httpHandler.CancelRequest)
http.HandleFunc("/list", httpHandler.ListRequests)

err := http.ListenAndServe(fmt.Sprintf(":%s", port), nil)
if errors.Is(err, http.ErrServerClosed) {
slog.Info("the server is closed")
} else if err != nil {
slog.Error("unable to start server", "err", err.Error())
}
}
82 changes: 82 additions & 0 deletions daemons/copy-offload/pkg/driver/dmrequest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright 2024 Hewlett Packard Enterprise Development LP
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package driver

import (
"fmt"

corev1 "k8s.io/api/core/v1"
)

// DMRequest represents the content of one http request. This has a
// one-to-one relationship with a DriverRequest object.
type DMRequest struct {
ComputeName string `json:"computeName"`

// The name and namespace of the initiating workflow
WorkflowName string `json:"workflowName"`
WorkflowNamespace string `json:"workflowNamespace"`
// Source file or directory
SourcePath string `json:"sourcePath"`
// Destination file or directory
DestinationPath string `json:"destinationPath"`
// If True, the data movement command runs `/bin/true` rather than perform actual data movement
Dryrun bool `json:"dryrun"`
// Extra options to pass to `dcp` if present in the Data Movement command.
DcpOptions string `json:"dcpOptions"`
// If true, enable server-side logging of stdout when the command is successful. Failure output is always logged.
LogStdout bool `json:"logStdout"`
// If true, store stdout in DataMovementStatusResponse.Message when the command is successful. Failure output is always contained in the message.
StoreStdout bool `json:"storeStdout"`
// The number of slots specified in the MPI hostfile. A value of 0 disables the use of slots in
// the hostfile. -1 will defer to the server side configuration.
Slots int32 `json:"slots"`
// The number of max_slots specified in the MPI hostfile. A value of 0 disables the use of
// max_slots in the hostfile. -1 will defer to the server side configuration.
MaxSlots int32 `json:"maxSlots"`
// The name of the data movement configuration profile to use. The above parameters (e.g. slots,
// logStdout) will override the settings defined by the profile. This profile must exist on the
// server otherwise the data movement operation will be invalid. Empty will default to the
// default profile.
DMProfile string `json:"dmProfile"`
// Extra options to pass to `mpirun` if present in the Data Movement command.
MpirunOptions string `json:"mpirunOptions"`
}

func (m *DMRequest) Validator() error {

if m.ComputeName == "" {
return fmt.Errorf("compute name must be supplied")
}
if m.WorkflowName == "" {
return fmt.Errorf("workflow name must be supplied")
}
if m.SourcePath == "" {
return fmt.Errorf("source path must be supplied")
}
if m.DestinationPath == "" {
return fmt.Errorf("destination path must be supplied")
}
if m.WorkflowNamespace == "" {
m.WorkflowNamespace = corev1.NamespaceDefault
}

return nil
}
Loading

0 comments on commit ab9ffbe

Please sign in to comment.