-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add integration tests for GPU Partitioner node controller
- Loading branch information
1 parent
b756568
commit 284a9a6
Showing
3 changed files
with
320 additions
and
0 deletions.
There are no files selected for viewing
150 changes: 150 additions & 0 deletions
150
internal/controllers/gpupartitioner/node_controller_int_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
//go:build integration | ||
|
||
/* | ||
* Copyright 2023 nebuly.com | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package gpupartitioner_test | ||
|
||
import ( | ||
"fmt" | ||
"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1" | ||
"github.com/nebuly-ai/nos/pkg/constant" | ||
"github.com/nebuly-ai/nos/pkg/gpu" | ||
"github.com/nebuly-ai/nos/pkg/test/factory" | ||
. "github.com/onsi/ginkgo/v2" | ||
. "github.com/onsi/gomega" | ||
"github.com/stretchr/testify/mock" | ||
"time" | ||
) | ||
|
||
var _ = Describe("Node Controller", func() { | ||
const ( | ||
timeout = time.Second * 10 | ||
interval = time.Second * 1 | ||
) | ||
|
||
BeforeEach(func() { | ||
}) | ||
|
||
AfterEach(func() { | ||
}) | ||
|
||
When("A node does not have GPU Count label", func() { | ||
It("Should not be added to the Cluster State", func() { | ||
By("By creating a node without GPU Count label") | ||
nodeName := "node-without-gpu-count-label" | ||
node := factory.BuildNode(nodeName).WithLabels(map[string]string{ | ||
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(), | ||
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(), | ||
}).Get() | ||
Expect(k8sClient.Create(ctx, &node)).To(Succeed()) | ||
|
||
By("Checking that the node is not added to the Cluster State") | ||
Consistently(func() bool { | ||
_, ok := clusterState.GetNode(nodeName) | ||
return ok | ||
}, 3, interval).Should(BeFalse()) | ||
}) | ||
}) | ||
|
||
When("A node does not have GPU Model label", func() { | ||
It("Should not be added to the Cluster State", func() { | ||
|
||
By("By creating a node without GPU Model label") | ||
nodeName := "node-without-gpu-model-label" | ||
node := factory.BuildNode(nodeName).WithLabels(map[string]string{ | ||
constant.LabelNvidiaCount: "1", | ||
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(), | ||
}).Get() | ||
Expect(k8sClient.Create(ctx, &node)).To(Succeed()) | ||
|
||
By("Checking that the node is not added to the Cluster State") | ||
Consistently(func() bool { | ||
_, ok := clusterState.GetNode(nodeName) | ||
return ok | ||
}, 3, interval).Should(BeFalse()) | ||
}) | ||
}) | ||
|
||
When("A node with GPU labels has MPS partitioning enabled", func() { | ||
It("Should always be added to the Cluster State", func() { | ||
By("By creating a node with MPS partitioning enabled") | ||
nodeName := "node-mps" | ||
node := factory.BuildNode(nodeName).WithLabels(map[string]string{ | ||
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(), | ||
constant.LabelNvidiaCount: "1", | ||
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(), | ||
}).Get() | ||
Expect(k8sClient.Create(ctx, &node)).To(Succeed()) | ||
|
||
By("Checking that the node is added to the Cluster State") | ||
Eventually(func() bool { | ||
_, ok := clusterState.GetNode(nodeName) | ||
return ok | ||
}, timeout, interval).Should(BeTrue()) | ||
}) | ||
}) | ||
|
||
When("A node with GPU labels has MIG partitioning enabled", func() { | ||
It("Should *not* be added to the Cluster State it is not initialized", func() { | ||
By("By creating a node with MIG partitioning enabled, but not initialized") | ||
nodeName := "node-mig-not-initialized" | ||
node := factory.BuildNode(nodeName).WithLabels(map[string]string{ | ||
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(), | ||
constant.LabelNvidiaCount: "1", | ||
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(), | ||
}).Get() | ||
Expect(k8sClient.Create(ctx, &node)).To(Succeed()) | ||
|
||
By("Checking that the controller triggers Node initialization") | ||
migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything). | ||
Return(nil). | ||
Once() | ||
|
||
By("Checking that the node is *not* added to the Cluster State") | ||
Consistently(func() bool { | ||
_, ok := clusterState.GetNode(nodeName) | ||
return ok | ||
}, 5, interval).Should(BeFalse()) | ||
}) | ||
|
||
It("Should be added to the Cluster State it is initialized", func() { | ||
By("By creating an initialized node with MIG partitioning enabled") | ||
nodeName := "node-mig-initialized" | ||
node := factory.BuildNode(nodeName). | ||
WithLabels(map[string]string{ | ||
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(), | ||
constant.LabelNvidiaCount: "1", | ||
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(), | ||
}). | ||
WithAnnotations(map[string]string{ | ||
fmt.Sprintf(v1alpha1.AnnotationGpuSpecFormat, 0, "10gb"): "1", | ||
}). | ||
Get() | ||
Expect(k8sClient.Create(ctx, &node)).To(Succeed()) | ||
|
||
migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything). | ||
Return(nil). | ||
Maybe() | ||
|
||
By("Checking that the node is added to the Cluster State") | ||
Eventually(func() bool { | ||
_, ok := clusterState.GetNode(nodeName) | ||
return ok | ||
}, timeout, interval).Should(BeTrue()) | ||
}) | ||
}) | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
//go:build integration | ||
|
||
/* | ||
* Copyright 2023 nebuly.com | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package gpupartitioner_test | ||
|
||
import ( | ||
"context" | ||
"github.com/nebuly-ai/nos/internal/controllers/gpupartitioner" | ||
"github.com/nebuly-ai/nos/internal/partitioning/core" | ||
"github.com/nebuly-ai/nos/internal/partitioning/state" | ||
"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1" | ||
partitioningmock "github.com/nebuly-ai/nos/pkg/test/mocks/partitioning" | ||
. "github.com/onsi/ginkgo/v2" | ||
. "github.com/onsi/gomega" | ||
"k8s.io/client-go/kubernetes/scheme" | ||
"k8s.io/client-go/rest" | ||
"k8s.io/kubernetes/pkg/scheduler/framework" | ||
"path/filepath" | ||
ctrl "sigs.k8s.io/controller-runtime" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
"sigs.k8s.io/controller-runtime/pkg/envtest" | ||
logf "sigs.k8s.io/controller-runtime/pkg/log" | ||
"sigs.k8s.io/controller-runtime/pkg/log/zap" | ||
"testing" | ||
) | ||
|
||
var cfg *rest.Config | ||
var k8sClient client.Client | ||
var testEnv *envtest.Environment | ||
var ( | ||
ctx context.Context | ||
cancel context.CancelFunc | ||
migNodeInitializer *partitioningmock.NodeInitializer | ||
clusterState *state.ClusterState | ||
) | ||
|
||
var _ core.NodeInitializer = migNodeInitializer | ||
|
||
func TestAPIs(t *testing.T) { | ||
RegisterFailHandler(Fail) | ||
migNodeInitializer = partitioningmock.NewNodeInitializer(t) | ||
RunSpecs(t, "Controllers Suite") | ||
} | ||
|
||
var _ = BeforeSuite(func() { | ||
logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) | ||
ctx, cancel = context.WithCancel(context.Background()) | ||
|
||
By("bootstrapping test environment") | ||
testEnv = &envtest.Environment{ | ||
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "operator", "crd", "bases")}, | ||
ErrorIfCRDPathMissing: true, | ||
} | ||
|
||
var err error | ||
|
||
// cfg is defined in this file globally. | ||
cfg, err = testEnv.Start() | ||
Expect(err).NotTo(HaveOccurred()) | ||
Expect(cfg).NotTo(BeNil()) | ||
|
||
err = v1alpha1.AddToScheme(scheme.Scheme) | ||
Expect(err).NotTo(HaveOccurred()) | ||
|
||
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) | ||
Expect(err).NotTo(HaveOccurred()) | ||
Expect(k8sClient).NotTo(BeNil()) | ||
|
||
k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{ | ||
Scheme: scheme.Scheme, | ||
MetricsBindAddress: ":8083", | ||
}) | ||
Expect(err).ToNot(HaveOccurred()) | ||
|
||
// Init Cluster State | ||
clusterState = state.NewClusterState(map[string]framework.NodeInfo{}) | ||
|
||
// Setup Node Controller | ||
reporter := gpupartitioner.NewNodeController(k8sClient, scheme.Scheme, migNodeInitializer, clusterState) | ||
Expect(reporter.SetupWithManager(k8sManager, "NodeController")).To(Succeed()) | ||
|
||
go func() { | ||
defer GinkgoRecover() | ||
err = k8sManager.Start(ctx) | ||
Expect(err).ToNot(HaveOccurred(), "failed to run manager") | ||
}() | ||
}) | ||
|
||
var _ = AfterSuite(func() { | ||
cancel() | ||
By("tearing down the test environment") | ||
err := testEnv.Stop() | ||
Expect(err).NotTo(HaveOccurred()) | ||
}) |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.