Merge pull request #1465 from bharathappali/gpu-update-impl-4

KAMS - 4: Adds Recommendation updater and MIG based recommendation mechanism
kruize · Feb 18, 2025 · 8d41db9 · 8d41db9
2 parents 3ffe5ad + 7c6ecab
commit 8d41db9
Show file tree

Hide file tree

Showing 16 changed files with 808 additions and 51 deletions.
diff --git a/design/AutoScalingModeAPI.md b/design/AutoScalingModeAPI.md
@@ -19,6 +19,8 @@ with minimal manual intervention.
     - Example Request and Response
     - Invalid Scenarios
 
+3. [Accelerator Autoscaler](#accelerator-autoscaler)
+
 ## Defaults
 
 | Parameter                | Default Value |
@@ -160,3 +162,35 @@ Status:
         Memory:  39Mi
 Events:          <none>
 ```
+
+
+## Autoscaling Mode for Accelerator Workloads:
+
+<a name="accelerator-autoscaler"></a>
+
+### Accelerator Autoscaler
+
+#### Overview
+
+The Accelerator Autoscaler is designed to optimize the allocation of GPU resources for workloads running on Kubernetes clusters. It intelligently adjusts resource configurations based on accelerator recommendations generated by Kruize. The Accelerator Autoscaler is only used when the user specifies `auto` or `recreate` mode while creating an experiment.
+
+In Kruize, if the experiment is created with either of these modes and GPU metrics are found, the Accelerator Autoscaler is utilized.
+
+#### When is Accelerator Autoscaler Used?
+
+During the Generate Recommendations phase, if GPU metrics are detected and GPU recommendations are created, the autoscaler is set to the Accelerator Autoscaler.
+
+This ensures that the resource configurations are tailored to the GPU requirements of the workload, maximizing performance and efficiency.
+
+#### Why Not Use VPA?
+
+- The Vertical Pod Autoscaler (VPA) works only for CPU and Memory, not for GPU.
+- In traditional setups, VPA handles CPU and memory scaling, while GPU scaling requires a different approach.
+- The Accelerator Autoscaler directly updates the CPU, Memory, and GPU resources of the Kubernetes object, ensuring all resource requirements are met in a coordinated way.
+
+
+#### Key Takeaways
+
+- The Accelerator Autoscaler is activated when GPU metrics are detected and GPU recommendations are generated.
+- It directly updates CPU, Memory, and GPU resources, bypassing VPA for GPU workloads.
+- Currently, only the update process is implemented, with a revert mechanism planned for future releases.
diff --git a/src/main/java/com/autotune/analyzer/autoscaler/AutoscalerImpl.java b/src/main/java/com/autotune/analyzer/autoscaler/AutoscalerImpl.java
@@ -43,7 +43,7 @@ public AutoscalerImpl getAutoscalerInstance(String updaterType) throws InvalidRe
         if (AnalyzerConstants.AutoscalerConstants.SupportedUpdaters.VPA.equalsIgnoreCase(updaterType)) {
             return VpaAutoscalerImpl.getInstance();
         } else {
-            throw new InvalidRecommendationUpdaterType(String.format(AnalyzerErrorConstants.RecommendationUpdaterErrors.UNSUPPORTED_UPDATER_TYPE, updaterType));
+            throw new InvalidRecommendationUpdaterType(String.format(AnalyzerErrorConstants.AutoscalerErrors.UNSUPPORTED_UPDATER_TYPE, updaterType));
         }
     }
 
@@ -85,7 +85,7 @@ public KruizeObject generateResourceRecommendationsForExperiment(String experime
                 throw new Exception(validationMessage);
             }
         } catch (Exception | FetchMetricsError e) {
-            LOGGER.error(AnalyzerErrorConstants.RecommendationUpdaterErrors.GENERATE_RECOMMNEDATION_FAILED, experimentName);
+            LOGGER.error(AnalyzerErrorConstants.AutoscalerErrors.GENERATE_RECOMMENDATION_FAILED, experimentName);
             LOGGER.debug(e.getMessage());
             return null;
         }

diff --git a/src/main/java/com/autotune/analyzer/autoscaler/AutoscalerService.java b/src/main/java/com/autotune/analyzer/autoscaler/AutoscalerService.java
@@ -18,6 +18,7 @@
 
 import com.autotune.analyzer.autoscaler.vpa.VpaAutoscalerImpl;
 import com.autotune.analyzer.kruizeObject.KruizeObject;
+import com.autotune.analyzer.autoscaler.accelerator.AcceleratorAutoscalerImpl;
 import com.autotune.analyzer.utils.AnalyzerConstants;
 import com.autotune.analyzer.utils.AnalyzerErrorConstants;
 import com.autotune.database.service.ExperimentDBService;
@@ -56,6 +57,11 @@ public static void initiateAutoscalerService() {
                             VpaAutoscalerImpl vpaUpdater = VpaAutoscalerImpl.getInstance();
                             vpaUpdater.applyResourceRecommendationsForExperiment(kruizeObject);
                         }
+
+                        if (kruizeObject.getDefaultUpdater().equalsIgnoreCase(AnalyzerConstants.AutoscalerConstants.SupportedUpdaters.ACCELERATOR)) {
+                            AcceleratorAutoscalerImpl acceleratorUpdater = AcceleratorAutoscalerImpl.getInstance();
+                            acceleratorUpdater.applyResourceRecommendationsForExperiment(kruizeObject);
+                        }
                     }
                 } catch (Exception e) {
                     LOGGER.error(e.getMessage());
@@ -64,7 +70,7 @@ public static void initiateAutoscalerService() {
                     AnalyzerConstants.AutoscalerConstants.DEFAULT_SLEEP_INTERVAL,
                     TimeUnit.SECONDS);
         } catch (Exception e) {
-            LOGGER.error(AnalyzerErrorConstants.RecommendationUpdaterErrors.UPDTAER_SERVICE_START_ERROR + e.getMessage());
+            LOGGER.error(AnalyzerErrorConstants.AutoscalerErrors.UPDATER_SERVICE_START_ERROR + e.getMessage());
         }
     }