diff --git a/internal/controllers/gpupartitioner/node_controller_int_test.go b/internal/controllers/gpupartitioner/node_controller_int_test.go
new file mode 100644
index 0000000..93c6147
--- /dev/null
+++ b/internal/controllers/gpupartitioner/node_controller_int_test.go
@@ -0,0 +1,157 @@
+//go:build integration
+
+/*
+ * Copyright 2023 nebuly.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package gpupartitioner_test
+
+import (
+	"fmt"
+	"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1"
+	"github.com/nebuly-ai/nos/pkg/constant"
+	"github.com/nebuly-ai/nos/pkg/gpu"
+	"github.com/nebuly-ai/nos/pkg/test/factory"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/stretchr/testify/mock"
+	"time"
+)
+
+// Integration specs for the Node controller. Each spec creates a Node through
+// k8sClient and then polls clusterState.GetNode to observe whether that node
+// was added to the Cluster State.
+var _ = Describe("Node Controller", func() {
+	// timeout bounds how long Eventually polls; interval is the polling period
+	// used by both Eventually and Consistently.
+	const (
+		timeout  = time.Second * 10
+		interval = time.Second * 1
+	)
+
+	When("A node does not have GPU Count label", func() {
+		It("Should not be added to the Cluster State", func() {
+			By("By creating a node without GPU Count label")
+			nodeName := "node-without-gpu-count-label"
+			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
+				constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
+				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
+			}).Get()
+			Expect(k8sClient.Create(ctx, &node)).To(Succeed())
+
+			By("Checking that the node is not added to the Cluster State")
+			// Gomega interprets the bare number as a duration in seconds.
+			Consistently(func() bool {
+				_, ok := clusterState.GetNode(nodeName)
+				return ok
+			}, 3, interval).Should(BeFalse())
+		})
+	})
+
+	When("A node does not have GPU Model label", func() {
+		It("Should not be added to the Cluster State", func() {
+			By("By creating a node without GPU Model label")
+			nodeName := "node-without-gpu-model-label"
+			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
+				constant.LabelNvidiaCount:     "1",
+				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
+			}).Get()
+			Expect(k8sClient.Create(ctx, &node)).To(Succeed())
+
+			By("Checking that the node is not added to the Cluster State")
+			Consistently(func() bool {
+				_, ok := clusterState.GetNode(nodeName)
+				return ok
+			}, 3, interval).Should(BeFalse())
+		})
+	})
+
+	When("A node with GPU labels has MPS partitioning enabled", func() {
+		It("Should always be added to the Cluster State", func() {
+			By("By creating a node with MPS partitioning enabled")
+			nodeName := "node-mps"
+			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
+				constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
+				constant.LabelNvidiaCount:     "1",
+				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
+			}).Get()
+			Expect(k8sClient.Create(ctx, &node)).To(Succeed())
+
+			By("Checking that the node is added to the Cluster State")
+			Eventually(func() bool {
+				_, ok := clusterState.GetNode(nodeName)
+				return ok
+			}, timeout, interval).Should(BeTrue())
+		})
+	})
+
+	When("A node with GPU labels has MIG partitioning enabled", func() {
+		It("Should *not* be added to the Cluster State it is not initialized", func() {
+			// Register the expectation before the node exists: the controller may
+			// reconcile as soon as the node is created, and testify fails on a
+			// call that has no matching expectation.
+			migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything).
+				Return(nil).
+				Once()
+
+			By("By creating a node with MIG partitioning enabled, but not initialized")
+			nodeName := "node-mig-not-initialized"
+			node := factory.BuildNode(nodeName).WithLabels(map[string]string{
+				constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
+				constant.LabelNvidiaCount:     "1",
+				v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(),
+			}).Get()
+			Expect(k8sClient.Create(ctx, &node)).To(Succeed())
+
+			By("Checking that the controller triggers Node initialization")
+			// No explicit assertion here: the Once() expectation is asserted by the
+			// AssertExpectations cleanup that NewNodeInitializer registers.
+
+			By("Checking that the node is *not* added to the Cluster State")
+			Consistently(func() bool {
+				_, ok := clusterState.GetNode(nodeName)
+				return ok
+			}, 5, interval).Should(BeFalse())
+		})
+
+		It("Should be added to the Cluster State it is initialized", func() {
+			// Maybe() allows, but does not require, an initialization call. It is
+			// registered before the node is created so no call can arrive first.
+			migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything).
+				Return(nil).
+				Maybe()
+
+			By("By creating an initialized node with MIG partitioning enabled")
+			nodeName := "node-mig-initialized"
+			node := factory.BuildNode(nodeName).
+				WithLabels(map[string]string{
+					constant.LabelNvidiaProduct:   gpu.GPUModel_A100_PCIe_80GB.String(),
+					constant.LabelNvidiaCount:     "1",
+					v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(),
+				}).
+				WithAnnotations(map[string]string{
+					fmt.Sprintf(v1alpha1.AnnotationGpuSpecFormat, 0, "10gb"): "1",
+				}).
+				Get()
+			Expect(k8sClient.Create(ctx, &node)).To(Succeed())
+
+			By("Checking that the node is added to the Cluster State")
+			Eventually(func() bool {
+				_, ok := clusterState.GetNode(nodeName)
+				return ok
+			}, timeout, interval).Should(BeTrue())
+		})
+	})
+})
diff --git a/internal/controllers/gpupartitioner/suite_int_test.go b/internal/controllers/gpupartitioner/suite_int_test.go
new file mode 100644
index 0000000..9e8d9d2
--- /dev/null
+++ b/internal/controllers/gpupartitioner/suite_int_test.go
@@ -0,0 +1,122 @@
+//go:build integration
+
+/*
+ * Copyright 2023 nebuly.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package gpupartitioner_test
+
+import (
+	"context"
+	"github.com/nebuly-ai/nos/internal/controllers/gpupartitioner"
+	"github.com/nebuly-ai/nos/internal/partitioning/core"
+	"github.com/nebuly-ai/nos/internal/partitioning/state"
+	"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1"
+	partitioningmock "github.com/nebuly-ai/nos/pkg/test/mocks/partitioning"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/rest"
+	"k8s.io/kubernetes/pkg/scheduler/framework"
+	"path/filepath"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/envtest"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	"testing"
+)
+
+// Suite-wide fixtures: migNodeInitializer is assigned in TestAPIs and the
+// remaining variables in BeforeSuite.
+var (
+	cfg                *rest.Config
+	k8sClient          client.Client
+	testEnv            *envtest.Environment
+	ctx                context.Context
+	cancel             context.CancelFunc
+	migNodeInitializer *partitioningmock.NodeInitializer
+	clusterState       *state.ClusterState
+)
+
+// Compile-time check that the mock type satisfies core.NodeInitializer.
+var _ core.NodeInitializer = (*partitioningmock.NodeInitializer)(nil)
+
+// TestAPIs is the go test entry point: it wires Gomega failures into Ginkgo,
+// creates the NodeInitializer mock bound to t, and runs the registered specs.
+func TestAPIs(t *testing.T) {
+	RegisterFailHandler(Fail)
+	migNodeInitializer = partitioningmock.NewNodeInitializer(t)
+	RunSpecs(t, "Controllers Suite")
+}
+
+// Starts the envtest environment, builds the client and the manager, and
+// launches the Node controller under test before any spec runs.
+var _ = BeforeSuite(func() {
+	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
+	ctx, cancel = context.WithCancel(context.Background())
+
+	By("bootstrapping test environment")
+	testEnv = &envtest.Environment{
+		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "..", "config", "operator", "crd", "bases")},
+		ErrorIfCRDPathMissing: true,
+	}
+
+	var err error
+
+	// cfg is the package-level variable, so it is assigned with "=" rather than
+	// redeclared with ":=".
+	cfg, err = testEnv.Start()
+	Expect(err).NotTo(HaveOccurred())
+	Expect(cfg).NotTo(BeNil())
+
+	err = v1alpha1.AddToScheme(scheme.Scheme)
+	Expect(err).NotTo(HaveOccurred())
+
+	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
+	Expect(err).NotTo(HaveOccurred())
+	Expect(k8sClient).NotTo(BeNil())
+
+	// MetricsBindAddress "0" disables the metrics listener, so the suite cannot
+	// fail because a fixed port is already in use.
+	k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{
+		Scheme:             scheme.Scheme,
+		MetricsBindAddress: "0",
+	})
+	Expect(err).ToNot(HaveOccurred())
+
+	// Init Cluster State
+	clusterState = state.NewClusterState(map[string]framework.NodeInfo{})
+
+	// Setup Node Controller
+	reporter := gpupartitioner.NewNodeController(k8sClient, scheme.Scheme, migNodeInitializer, clusterState)
+	Expect(reporter.SetupWithManager(k8sManager, "NodeController")).To(Succeed())
+
+	go func() {
+		defer GinkgoRecover()
+		// Keep the error local to this goroutine instead of writing to the err
+		// variable captured from the enclosing function.
+		startErr := k8sManager.Start(ctx)
+		Expect(startErr).ToNot(HaveOccurred(), "failed to run manager")
+	}()
+})
+
+// Cancels the manager context and stops the envtest environment once all specs
+// have run.
+var _ = AfterSuite(func() {
+	cancel()
+	By("tearing down the test environment")
+	err := testEnv.Stop()
+	Expect(err).NotTo(HaveOccurred())
+})
diff --git a/pkg/test/mocks/partitioning/initializer.go b/pkg/test/mocks/partitioning/initializer.go
new file mode 100644
index 0000000..4bd2c1f
--- /dev/null
+++ b/pkg/test/mocks/partitioning/initializer.go
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2023 nebuly.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Code generated by mockery v2.23.1. DO NOT EDIT.
+ +package partitioning + +import ( + context "context" + + mock "github.com/stretchr/testify/mock" + + v1 "k8s.io/api/core/v1" +) + +// NodeInitializer is an autogenerated mock type for the NodeInitializer type +type NodeInitializer struct { + mock.Mock +} + +// InitNodePartitioning provides a mock function with given fields: ctx, node +func (_m *NodeInitializer) InitNodePartitioning(ctx context.Context, node v1.Node) error { + ret := _m.Called(ctx, node) + + var r0 error + if rf, ok := ret.Get(0).(func(context.Context, v1.Node) error); ok { + r0 = rf(ctx, node) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +type mockConstructorTestingTNewNodeInitializer interface { + mock.TestingT + Cleanup(func()) +} + +// NewNodeInitializer creates a new instance of NodeInitializer. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +func NewNodeInitializer(t mockConstructorTestingTNewNodeInitializer) *NodeInitializer { + mock := &NodeInitializer{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +}