7g.79gb does not work as expected. #51

houms-sony · 2024-03-12T23:36:30Z

Using gpu-operator (helm 23.9.1) and nos (helm 0.1.2).

I have an issue with nvidia.com/mig-7g.79gb. When I specify it, nos creates the MIG configuration as expected, but the resource seems to be registered as nvidia.com/mig-7g.80gb, as shown in the log below from nvidia-device-plugin.

I0312 23:04:34.682199       1 server.go:165] Starting GRPC server for 'nvidia.com/mig-7g.80gb'
I0312 23:04:34.682673       1 server.go:117] Starting to serve 'nvidia.com/mig-7g.80gb' on /var/lib/kubelet/device-plugins/nvidia-mig-7g.80gb.sock
I0312 23:04:34.684745       1 server.go:125] Registered device plugin for 'nvidia.com/mig-7g.80gb' with Kubelet

Additionally, the labels created on the node look like this

But the issue is that, because we specified nvidia.com/mig-7g.79gb, the pod stays in Pending. Note the config below (all the other nvidia examples commented out below work, except 7g.79gb).

---
apiVersion: batch/v1
kind: Job
metadata:
  name: job-test-7g80g
spec:
  template:
    spec:
      runtimeClassName: nvidia
      restartPolicy: Never
      containers:
      - name: nvidia
        image: nvidia/cuda:12.3.2-devel-ubuntu22.04
        command: ["sleep", "12000"]
        resources:
          limits:
            nvidia.com/mig-7g.79gb: 1
            #nvidia.com/mig-1g.10gb: 1
            #nvidia.com/mig-2g.20gb: 1
            #nvidia.com/mig-4g.40gb: 1

I tried adding 7g.80gb to allowedGeometries, but it did not work as expected. I briefly looked at the code and saw https://github.com/nebuly-ai/nos/blob/main/pkg/gpu/mig/known_configs.go#L93, so I am not sure whether I missed something, or whether there is a way to get the desired behavior.

The text was updated successfully, but these errors were encountered:

houms-sony · 2024-03-14T00:50:37Z

Just wanted to confirm that if we update the code to use 7g.80gb as shown in the diff below and rebuild the images, it seems to work when specifying nvidia.com/mig-7g.80gb in the resources field (and updating allowedGeometries).

--- a/config/gpupartitioner/manager/known_mig_geometries.yaml
+++ b/config/gpupartitioner/manager/known_mig_geometries.yaml
@@ -56,4 +56,4 @@
     - 1g.10gb: 1
       2g.20gb: 1
       4g.40gb: 1
-    - 7g.79gb: 1
+    - 7g.80gb: 1
diff --git a/helm-charts/nos/values.yaml b/helm-charts/nos/values.yaml
index 3806a6c..58719f8 100644
--- a/helm-charts/nos/values.yaml
+++ b/helm-charts/nos/values.yaml
@@ -373,4 +373,4 @@ gpuPartitioner:
         - 1g.10gb: 1
           2g.20gb: 1
           4g.40gb: 1
-        - 7g.79gb: 1
+        - 7g.80gb: 1
diff --git a/internal/partitioning/core/util_test.go b/internal/partitioning/core/util_test.go
index d5b9f9c..b3d8000 100644
--- a/internal/partitioning/core/util_test.go
+++ b/internal/partitioning/core/util_test.go
@@ -73,14 +73,14 @@ func TestPodSorter(t *testing.T) {
                                ).Get(),
                                factory.BuildPod("ns-1", "pd-2").WithPriority(2).WithContainer(
                                        factory.BuildContainer("c1", "test").
-                                               WithScalarResourceRequest(mig.Profile7g79gb.AsResourceName(), 1).
+                                               WithScalarResourceRequest(mig.Profile7g80gb.AsResourceName(), 1).
                                                Get(),
                                ).Get(),
                        },
                        expected: []v1.Pod{
                                factory.BuildPod("ns-1", "pd-2").WithPriority(2).WithContainer(
                                        factory.BuildContainer("c1", "test").
-                                               WithScalarResourceRequest(mig.Profile7g79gb.AsResourceName(), 1).
+                                               WithScalarResourceRequest(mig.Profile7g80gb.AsResourceName(), 1).
                                                Get(),
                                ).Get(),
                                factory.BuildPod("ns-1", "pd-1").WithPriority(1).WithContainer(
diff --git a/pkg/gpu/mig/gpu_test.go b/pkg/gpu/mig/gpu_test.go
index 9c23b87..13ae527 100644
--- a/pkg/gpu/mig/gpu_test.go
+++ b/pkg/gpu/mig/gpu_test.go
@@ -346,7 +346,7 @@ func TestGPU__UpdateGeometryFor(t *testing.T) {
                                map[mig.ProfileName]int{},
                        ),
                        profiles: map[gpu.Slice]int{
-                               mig.Profile7g79gb: 1,
+                               mig.Profile7g80gb: 1,
                        },
                        expectedGeometry: map[gpu.Slice]int{
                                mig.Profile2g20gb: 1, // unchanged
diff --git a/pkg/gpu/mig/known_configs.go b/pkg/gpu/mig/known_configs.go
index 60c6d16..9d6f346 100644
--- a/pkg/gpu/mig/known_configs.go
+++ b/pkg/gpu/mig/known_configs.go
@@ -90,7 +90,7 @@ var (
                },
                gpu.GPUModel_A100_PCIe_80GB: {
                        {
-                               Profile7g79gb: 1,
+                               Profile7g80gb: 1,
                        },
                        {
                                Profile4g40gb: 1,
diff --git a/pkg/gpu/mig/profile.go b/pkg/gpu/mig/profile.go
index e8eaf5c..21ae3a9 100644
--- a/pkg/gpu/mig/profile.go
+++ b/pkg/gpu/mig/profile.go
@@ -43,7 +43,7 @@ const (
        Profile2g20gb ProfileName = "2g.20gb"
        Profile3g40gb ProfileName = "3g.40gb"
        Profile4g40gb ProfileName = "4g.40gb"
haghabozorgi@debhop:~/nos$ git diff
diff --git a/config/gpupartitioner/manager/known_mig_geometries.yaml b/config/gpupartitioner/manager/known_mig_geometries.yaml
index 1bcd8f9..39e3d27 100644
--- a/config/gpupartitioner/manager/known_mig_geometries.yaml
+++ b/config/gpupartitioner/manager/known_mig_geometries.yaml
@@ -56,4 +56,4 @@
     - 1g.10gb: 1
       2g.20gb: 1
       4g.40gb: 1
-    - 7g.79gb: 1
+    - 7g.80gb: 1
diff --git a/helm-charts/nos/values.yaml b/helm-charts/nos/values.yaml
index 3806a6c..58719f8 100644
--- a/helm-charts/nos/values.yaml
+++ b/helm-charts/nos/values.yaml
@@ -373,4 +373,4 @@ gpuPartitioner:
         - 1g.10gb: 1
           2g.20gb: 1
           4g.40gb: 1
-        - 7g.79gb: 1
+        - 7g.80gb: 1
diff --git a/internal/partitioning/core/util_test.go b/internal/partitioning/core/util_test.go
index d5b9f9c..b3d8000 100644
--- a/internal/partitioning/core/util_test.go
+++ b/internal/partitioning/core/util_test.go
@@ -73,14 +73,14 @@ func TestPodSorter(t *testing.T) {
                                ).Get(),
                                factory.BuildPod("ns-1", "pd-2").WithPriority(2).WithContainer(
                                        factory.BuildContainer("c1", "test").
-                                               WithScalarResourceRequest(mig.Profile7g79gb.AsResourceName(), 1).
+                                               WithScalarResourceRequest(mig.Profile7g80gb.AsResourceName(), 1).
                                                Get(),
                                ).Get(),
                        },
                        expected: []v1.Pod{
                                factory.BuildPod("ns-1", "pd-2").WithPriority(2).WithContainer(
                                        factory.BuildContainer("c1", "test").
-                                               WithScalarResourceRequest(mig.Profile7g79gb.AsResourceName(), 1).
+                                               WithScalarResourceRequest(mig.Profile7g80gb.AsResourceName(), 1).
                                                Get(),
                                ).Get(),
                                factory.BuildPod("ns-1", "pd-1").WithPriority(1).WithContainer(
diff --git a/pkg/gpu/mig/gpu_test.go b/pkg/gpu/mig/gpu_test.go
index 9c23b87..13ae527 100644
--- a/pkg/gpu/mig/gpu_test.go
+++ b/pkg/gpu/mig/gpu_test.go
@@ -346,7 +346,7 @@ func TestGPU__UpdateGeometryFor(t *testing.T) {
                                map[mig.ProfileName]int{},
                        ),
                        profiles: map[gpu.Slice]int{
-                               mig.Profile7g79gb: 1,
+                               mig.Profile7g80gb: 1,
                        },
                        expectedGeometry: map[gpu.Slice]int{
                                mig.Profile2g20gb: 1, // unchanged
diff --git a/pkg/gpu/mig/known_configs.go b/pkg/gpu/mig/known_configs.go
index 60c6d16..9d6f346 100644
--- a/pkg/gpu/mig/known_configs.go
+++ b/pkg/gpu/mig/known_configs.go
@@ -90,7 +90,7 @@ var (
                },
                gpu.GPUModel_A100_PCIe_80GB: {
                        {
-                               Profile7g79gb: 1,
+                               Profile7g80gb: 1,
                        },
                        {
                                Profile4g40gb: 1,
diff --git a/pkg/gpu/mig/profile.go b/pkg/gpu/mig/profile.go
index e8eaf5c..21ae3a9 100644
--- a/pkg/gpu/mig/profile.go
+++ b/pkg/gpu/mig/profile.go
@@ -43,7 +43,7 @@ const (
        Profile2g20gb ProfileName = "2g.20gb"
        Profile3g40gb ProfileName = "3g.40gb"
        Profile4g40gb ProfileName = "4g.40gb"
-       Profile7g79gb ProfileName = "7g.79gb"
+       Profile7g80gb ProfileName = "7g.80gb"
 )
 
 var (

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

7g.79gb does not work as expected. #51

7g.79gb does not work as expected. #51

houms-sony commented Mar 12, 2024 •

edited

Loading

houms-sony commented Mar 14, 2024

7g.79gb does not work as expected. #51

7g.79gb does not work as expected. #51

Comments

houms-sony commented Mar 12, 2024 • edited Loading

houms-sony commented Mar 14, 2024

houms-sony commented Mar 12, 2024 •

edited

Loading