[Data] Update batch inference release tests #49012

Merged
12 commits merged on Dec 3, 2024
@@ -2,7 +2,7 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 0
max_workers: 10
Member Author commented:
Not sure why this wasn't an issue before. Maybe it's because we override max_workers for the specific node type (see the sketch after this hunk).


head_node_type:
  name: head_node
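For context on the comment above: the worker cap appears in two places in these compute configs, and the comment's guess is that the per-node-type value was masking the global one. A minimal sketch of the two fields, not part of this PR's diff (the instance type is hypothetical, and the exact precedence between the two caps is precisely what the comment is unsure about):

max_workers: 10                   # global cap on worker nodes for the cluster

worker_node_types:
  - name: worker_node
    instance_type: m5.2xlarge     # hypothetical instance type for illustration
    min_workers: 0
    max_workers: 10               # per-node-type cap; possibly what made a global
                                  # value of 0 harmless before this change
    use_spot: false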
19 changes: 19 additions & 0 deletions release/nightly_tests/dataset/autoscaling_gpu_compute.yaml
@@ -0,0 +1,19 @@
# This config matches the default config for Anyscale workspaces with autoscaling,
# except instead of using CPU instances, it uses GPU instances.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
  name: head_node
  instance_type: g4dn.2xlarge
  resources:
    cpu: 0

worker_node_types:
  - name: worker_node
    instance_type: g4dn.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false
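The header comment describes this file as the GPU variant of the default autoscaling config. The CPU counterpart it alludes to, autoscaling_cpu_compute.yaml (referenced later in this diff but not shown), presumably mirrors it with a CPU instance type; the sketch below is an assumption, with m5.2xlarge standing in for the actual instance type:

cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
  name: head_node
  instance_type: m5.2xlarge   # assumed; the actual file may differ
  resources:
    cpu: 0

worker_node_types:
  - name: worker_node
    instance_type: m5.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false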
23 changes: 23 additions & 0 deletions release/nightly_tests/dataset/autoscaling_hetero_compute.yaml
@@ -0,0 +1,23 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 20

head_node_type:
  name: head_node
  instance_type: m5.2xlarge
  resources:
    cpu: 0

worker_node_types:
  - name: worker_node_gpu
    instance_type: g4dn.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false

  - name: worker_node_cpu
    instance_type: m5.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false
21 changes: 0 additions & 21 deletions release/nightly_tests/dataset/compute_hetero_10x10_aws.yaml

This file was deleted.

161 changes: 15 additions & 146 deletions release/release_data_tests.yaml
@@ -7,8 +7,9 @@

cluster:
byod:
# 'type: gpu' means: use the 'ray-ml' image.
type: gpu
cluster_compute: multi_node_autoscaling_compute.yaml
cluster_compute: autoscaling_cpu_compute.yaml

###############
# Reading tests
@@ -545,177 +546,45 @@
cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml


############################
# Batch Inference Benchmarks
############################
#######################
# Batch inference tests
#######################

# 10 GB image classification raw images with 1 GPU.
# 1 g4dn.4xlarge
- name: torch_batch_inference_1_gpu_10gb_raw
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_1_cpu_16_aws.yaml

run:
timeout: 500
script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_1_cpu_16_gce.yaml
# 300 GB image classification parquet data with 0-10 GPUs
# 0-10 g4dn.12xlarge.
- name: batch_inference

# 10 GB image classification parquet with 1 GPU.
# 1 g4dn.4xlarge
- name: torch_batch_inference_1_gpu_10gb_parquet
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_1_cpu_16_aws.yaml

run:
timeout: 500
script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_1_cpu_16_gce.yaml


# 300 GB image classification raw images with 16 GPUs
# 4 g4dn.12xlarge
- name: torch_batch_inference_16_gpu_300gb_raw
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_4x4_aws.yaml
cluster_compute: autoscaling_gpu_compute.yaml

run:
timeout: 1000
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_4x4_gce.yaml
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
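Because the copied diff interleaves removed and added lines, the consolidated entry is easier to read in one piece. Reconstructed from the added lines above — indentation and the exact set of retained fields are assumptions, since the diff view drops the +/- markers:

- name: batch_inference
  group: data-tests
  working_dir: nightly_tests/dataset
  frequency: nightly
  team: data
  cluster:
    byod:
      type: gpu
    cluster_compute: autoscaling_gpu_compute.yaml
  run:
    timeout: 1000
    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

The batch_inference_chaos and batch_inference_hetero entries below follow the same shape, adding the chaos prepare step and swapping in the heterogeneous compute config, respectively.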

- name: batch_inference_chaos

- name: chaos_torch_batch_inference_16_gpu_300gb_raw
group: data-tests
working_dir: nightly_tests
stable: false

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: dataset/compute_gpu_4x4_aws.yaml
cluster_compute: dataset/autoscaling_gpu_compute.yaml

run:
timeout: 1000
prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: dataset/compute_gpu_4x4_gce.yaml


# 300 GB image classification parquet data with 16 GPUs
# 4 g4dn.12xlarge
- name: torch_batch_inference_16_gpu_300gb_parquet
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data

cluster:
byod:
type: gpu
cluster_compute: compute_gpu_4x4_aws.yaml

run:
timeout: 1000
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_4x4_gce.yaml
script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

# 10 TB image classification parquet data with heterogeneous cluster
# 10 g4dn.12xlarge, 10 m5.16xlarge
- name: torch_batch_inference_hetero_10tb_parquet
group: data-tests
working_dir: nightly_tests/dataset

- name: batch_inference_hetero
frequency: weekly
team: data

cluster:
byod:
type: gpu
cluster_compute: compute_hetero_10x10_aws.yaml
cluster_compute: autoscaling_hetero_compute.yaml

run:
timeout: 2000
script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet

wait_for_nodes:
num_nodes: 20

alert: default