Skip to content

Commit

Permalink
Add integration tests for GPU Partitioner node controller
Browse files Browse the repository at this point in the history
  • Loading branch information
Telemaco019 committed Apr 6, 2023
1 parent b756568 commit 284a9a6
Show file tree
Hide file tree
Showing 3 changed files with 320 additions and 0 deletions.
150 changes: 150 additions & 0 deletions internal/controllers/gpupartitioner/node_controller_int_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
//go:build integration

/*
* Copyright 2023 nebuly.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package gpupartitioner_test

import (
"fmt"
"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1"
"github.com/nebuly-ai/nos/pkg/constant"
"github.com/nebuly-ai/nos/pkg/gpu"
"github.com/nebuly-ai/nos/pkg/test/factory"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/stretchr/testify/mock"
"time"
)

// Integration specs for the GPU Partitioner node controller. Each spec creates
// a Node through k8sClient and then checks whether that node is (or is not)
// present in the package-level clusterState.
var _ = Describe("Node Controller", func() {
	const (
		// timeout bounds how long Eventually waits for a node to show up.
		timeout = 10 * time.Second
		// interval is the polling period for Eventually and Consistently.
		interval = time.Second
	)

	// inClusterState returns a poller reporting whether the node with the
	// given name is currently present in clusterState.
	inClusterState := func(nodeName string) func() bool {
		return func() bool {
			_, ok := clusterState.GetNode(nodeName)
			return ok
		}
	}

	When("A node does not have GPU Count label", func() {
		It("Should not be added to the Cluster State", func() {
			By("By creating a node without GPU Count label")
			nodeName := "node-without-gpu-count-label"
			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
				constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
			}).Get()
			Expect(k8sClient.Create(ctx, &node)).To(Succeed())

			By("Checking that the node is not added to the Cluster State")
			Consistently(inClusterState(nodeName), 3*time.Second, interval).Should(BeFalse())
		})
	})

	When("A node does not have GPU Model label", func() {
		It("Should not be added to the Cluster State", func() {
			By("By creating a node without GPU Model label")
			nodeName := "node-without-gpu-model-label"
			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
				constant.LabelNvidiaCount:     "1",
				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
			}).Get()
			Expect(k8sClient.Create(ctx, &node)).To(Succeed())

			By("Checking that the node is not added to the Cluster State")
			Consistently(inClusterState(nodeName), 3*time.Second, interval).Should(BeFalse())
		})
	})

	When("A node with GPU labels has MPS partitioning enabled", func() {
		It("Should always be added to the Cluster State", func() {
			By("By creating a node with MPS partitioning enabled")
			nodeName := "node-mps"
			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
				constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
				constant.LabelNvidiaCount:     "1",
				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
			}).Get()
			Expect(k8sClient.Create(ctx, &node)).To(Succeed())

			By("Checking that the node is added to the Cluster State")
			Eventually(inClusterState(nodeName), timeout, interval).Should(BeTrue())
		})
	})

	When("A node with GPU labels has MIG partitioning enabled", func() {
		It("Should *not* be added to the Cluster State it is not initialized", func() {
			// The expectation is registered BEFORE the node is created: the
			// controller may reconcile as soon as the node exists, and a call
			// to the mock without a matching expectation would fail the spec.
			// NOTE(review): the Once() expectation is presumably verified by
			// the mock's own cleanup hook; confirm against the generated mock.
			By("Checking that the controller triggers Node initialization")
			migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything).
				Return(nil).
				Once()

			By("By creating a node with MIG partitioning enabled, but not initialized")
			nodeName := "node-mig-not-initialized"
			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
				constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
				constant.LabelNvidiaCount:     "1",
				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(),
			}).Get()
			Expect(k8sClient.Create(ctx, &node)).To(Succeed())

			By("Checking that the node is *not* added to the Cluster State")
			Consistently(inClusterState(nodeName), 5*time.Second, interval).Should(BeFalse())
		})

		It("Should be added to the Cluster State it is initialized", func() {
			// Initialization is optional here (Maybe), but the expectation
			// must still exist before the node does, so that a reconcile
			// racing with this spec never hits an unexpected mock call.
			migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything).
				Return(nil).
				Maybe()

			By("By creating an initialized node with MIG partitioning enabled")
			nodeName := "node-mig-initialized"
			node := factory.BuildNode(nodeName).
				WithLabels(map[string]string{
					constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
					constant.LabelNvidiaCount:     "1",
					v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(),
				}).
				WithAnnotations(map[string]string{
					// A GPU spec annotation is what marks the node as initialized in this spec.
					fmt.Sprintf(v1alpha1.AnnotationGpuSpecFormat, 0, "10gb"): "1",
				}).
				Get()
			Expect(k8sClient.Create(ctx, &node)).To(Succeed())

			By("Checking that the node is added to the Cluster State")
			Eventually(inClusterState(nodeName), timeout, interval).Should(BeTrue())
		})
	})
})
109 changes: 109 additions & 0 deletions internal/controllers/gpupartitioner/suite_int_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//go:build integration

/*
* Copyright 2023 nebuly.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package gpupartitioner_test

import (
"context"
"github.com/nebuly-ai/nos/internal/controllers/gpupartitioner"
"github.com/nebuly-ai/nos/internal/partitioning/core"
"github.com/nebuly-ai/nos/internal/partitioning/state"
"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1"
partitioningmock "github.com/nebuly-ai/nos/pkg/test/mocks/partitioning"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/kubernetes/pkg/scheduler/framework"
"path/filepath"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/envtest"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"testing"
)

// Fixtures shared by every spec in this package. They are populated by
// TestAPIs and BeforeSuite and released in AfterSuite.
var (
	// cfg is the REST config returned by the envtest environment.
	cfg *rest.Config
	// k8sClient talks directly to the envtest API server.
	k8sClient client.Client
	// testEnv is the envtest environment backing the suite.
	testEnv *envtest.Environment
	// ctx is the context the controller manager runs with; cancel stops it.
	ctx    context.Context
	cancel context.CancelFunc
	// migNodeInitializer is the mock injected into the node controller.
	migNodeInitializer *partitioningmock.NodeInitializer
	// clusterState is the state instance handed to the node controller.
	clusterState *state.ClusterState
)

// Compile-time check that the mock satisfies core.NodeInitializer.
var _ core.NodeInitializer = (*partitioningmock.NodeInitializer)(nil)

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)
migNodeInitializer = partitioningmock.NewNodeInitializer(t)
RunSpecs(t, "Controllers Suite")
}

// BeforeSuite boots an envtest control plane, builds the shared client and
// cluster state, registers the node controller on a manager and starts that
// manager in the background for the lifetime of the suite.
var _ = BeforeSuite(func() {
	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
	// The manager runs until AfterSuite calls cancel.
	ctx, cancel = context.WithCancel(context.Background())

	By("bootstrapping test environment")
	testEnv = &envtest.Environment{
		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "..", "config", "operator", "crd", "bases")},
		ErrorIfCRDPathMissing: true,
	}

	var err error

	// cfg is defined in this file globally.
	cfg, err = testEnv.Start()
	Expect(err).NotTo(HaveOccurred())
	Expect(cfg).NotTo(BeNil())

	// Register the project's API types on the scheme shared by client and manager.
	err = v1alpha1.AddToScheme(scheme.Scheme)
	Expect(err).NotTo(HaveOccurred())

	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
	Expect(err).NotTo(HaveOccurred())
	Expect(k8sClient).NotTo(BeNil())

	k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{
		Scheme:             scheme.Scheme,
		MetricsBindAddress: ":8083",
	})
	Expect(err).ToNot(HaveOccurred())

	// Init Cluster State
	clusterState = state.NewClusterState(map[string]framework.NodeInfo{})

	// Setup Node Controller
	nodeController := gpupartitioner.NewNodeController(k8sClient, scheme.Scheme, migNodeInitializer, clusterState)
	Expect(nodeController.SetupWithManager(k8sManager, "NodeController")).To(Succeed())

	go func() {
		defer GinkgoRecover()
		// Keep the error local to this goroutine instead of writing to the
		// variable captured from the enclosing closure.
		startErr := k8sManager.Start(ctx)
		Expect(startErr).ToNot(HaveOccurred(), "failed to run manager")
	}()
})

// AfterSuite stops the controller manager by cancelling its context and then
// shuts down the envtest environment.
var _ = AfterSuite(func() {
	// Cancel first so the manager started in BeforeSuite is asked to stop
	// before the environment is torn down.
	cancel()

	By("tearing down the test environment")
	Expect(testEnv.Stop()).NotTo(HaveOccurred())
})
61 changes: 61 additions & 0 deletions pkg/test/mocks/partitioning/initializer.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 284a9a6

Please sign in to comment.