Skip to content

Commit

Permalink
Added slots/max_slots to Copy Offload API
Browse files Browse the repository at this point in the history
slots and maxSlots can now be supplied in the Copy Offload API. This
allows the user to override the server side configuration (nnf-dm-config
ConfigMap) to set the slots/max_slots in the mpirun hostfile.

Updated nnf-sos to support this.

For copy_in/copy_out, there is equivalent at this time.

Signed-off-by: Blake Devcich <[email protected]>
  • Loading branch information
bdevcich committed Sep 12, 2023
1 parent 42560b0 commit ee2d25c
Show file tree
Hide file tree
Showing 228 changed files with 19,416 additions and 4,871 deletions.
10 changes: 10 additions & 0 deletions controllers/datamovement_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,7 @@ func (r *DataMovementReconciler) Reconcile(ctx context.Context, req ctrl.Request
log.Error(err, "Data movement operation cancelled", "output", combinedOutBuf.String())
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonCancelled
} else if err != nil {
log.Error(err, "Data movement operation failed", "output", combinedOutBuf.String())
dm.Status.Status = nnfv1alpha1.DataMovementConditionReasonFailed
dm.Status.Message = fmt.Sprintf("%s: %s", err.Error(), combinedOutBuf.String())
resourceErr := dwsv1alpha2.NewResourceError("").WithError(err).WithUserMessage("data movement operation failed: %s", combinedOutBuf.String()).WithFatal()
Expand Down Expand Up @@ -471,6 +472,14 @@ func buildDMCommand(ctx context.Context, profile dmConfigProfile, hosts []string
slots := profile.Slots
maxSlots := profile.MaxSlots

// Allow the user to override the slots and max_slots in the hostfile.
if userConfig && dm.Spec.UserConfig.Slots != nil && *dm.Spec.UserConfig.Slots >= 0 {
slots = *dm.Spec.UserConfig.Slots
}
if userConfig && dm.Spec.UserConfig.MaxSlots != nil && *dm.Spec.UserConfig.MaxSlots >= 0 {
maxSlots = *dm.Spec.UserConfig.MaxSlots
}

hostfile, err = createMpiHostfile(dm.Name, hosts, slots, maxSlots)
if err != nil {
return nil, "", fmt.Errorf("error creating MPI hostfile: %v", err)
Expand Down Expand Up @@ -512,6 +521,7 @@ func buildDMCommand(ctx context.Context, profile dmConfigProfile, hosts []string

// Create an MPI Hostfile given a list of hosts, slots, and maxSlots. A temporary directory is
// created based on the DM Name. The hostfile is created inside of this directory.
// A value of 0 for slots or maxSlots will not use it in the hostfile.
func createMpiHostfile(dmName string, hosts []string, slots, maxSlots int) (string, error) {

tmpdir := filepath.Join("/tmp", dmName)
Expand Down
45 changes: 45 additions & 0 deletions controllers/datamovement_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
. "github.com/onsi/gomega/gstruct"
"go.openly.dev/pointy"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -748,6 +749,50 @@ var _ = Describe("Data Movement Test", func() {
Expect(strings.Join(cmd, " ")).Should(MatchRegexp(expectedCmdRegex))
})
})

When("slots/maxSlots are specified in the request", func() {
DescribeTable("it should use the user slots vs the profile",
func(numSlots *int) {
profileSlots, profileMaxSlots := 3, 8

profile := dmConfigProfile{
Command: defaultCommand,
Slots: profileSlots,
MaxSlots: profileMaxSlots,
}
dm.Spec.UserConfig = &nnfv1alpha1.NnfDataMovementConfig{
Slots: numSlots,
MaxSlots: numSlots,
}
_, hostfilePath, err := buildDMCommand(context.TODO(), profile, hosts, &dm)
Expect(err).ToNot(HaveOccurred())
Expect(hostfilePath).ToNot(BeEmpty())
DeferCleanup(func() {
Expect(os.Remove(hostfilePath)).ToNot(HaveOccurred())
})

content, err := os.ReadFile(hostfilePath)
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).ToNot(BeEmpty())

if numSlots == nil {
// if nil, use the profile's slots
Expect(string(content)).Should(MatchRegexp(fmt.Sprintf(" slots=%d", profileSlots)))
Expect(string(content)).Should(MatchRegexp(fmt.Sprintf(" max_slots=%d", profileMaxSlots)))
} else if *numSlots == 0 {
// if 0, then don't use slots at all
Expect(string(content)).ShouldNot(MatchRegexp(" slots"))
Expect(string(content)).ShouldNot(MatchRegexp(" max_slots"))
} else {
Expect(string(content)).Should(MatchRegexp(fmt.Sprintf(" slots=%d", *numSlots)))
Expect(string(content)).Should(MatchRegexp(fmt.Sprintf(" max_slots=%d", *numSlots)))
}
},
Entry("when non-zero", pointy.Int(17)),
Entry("when zero it should omit", pointy.Int(0)),
Entry("when nil it should use the profile", nil),
)
})
})
})
})
13 changes: 13 additions & 0 deletions daemons/compute/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,19 @@ To build the generated code used for the Go and Python example clients, run `./p

`client-go` and `client-py` will have updates to the generated files when the API changes.

To run this script, you will need the following installed on your system:

- [protoc](https://grpc.io/docs/protoc-installation/)
- [protoc-gen-doc](https://github.com/pseudomuto/protoc-gen-doc#installation)
- [grpc and grpc tools python modules](https://grpc.io/docs/languages/python/quickstart/)

On OSX, this can be done easily via:

```shell
brew install protobuf protoc-gen-go protoc-gen-go-grpc
pip3 install grpcio grpcio_tools
```

#### C and C++

For the C and C++ clients, the clients must be built to generate the source code to support the API. Run the `Makefiles` in the `_client-c`, `client-cpp`, and `lib-cpp` directories to update the generated API files.
Expand Down
8 changes: 8 additions & 0 deletions daemons/compute/api/datamovement.proto
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ message DataMovementCreateRequest {

// If true, store stdout in DataMovementStatusResponse.Message when the command is successful. Failure output is always contained in the message.
bool storeStdout = 7;

// The number of slots specified in the MPI hostfile. A value of 0 disables the use of slots in
// the hostfile. -1 will defer to the server side configuration.
int32 slots = 8;

// The number of max_slots specified in the MPI hostfile. A value of 0 disables the use of
// max_slots in the hostfile. -1 will defer to the server side configuration.
int32 maxSlots = 9;
}

// The Data Movement Create Response to indicate the status of of the Data Movement Request.
Expand Down
Loading

0 comments on commit ee2d25c

Please sign in to comment.