Merge branch 'ray-project:master' into clickhouse_datasource
jecsand838 authored Dec 3, 2024
2 parents 29b20e3 + 24acab2 commit 4a31a52
Showing 8 changed files with 119 additions and 225 deletions.
@@ -2,7 +2,7 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2

-max_workers: 0
+max_workers: 10

 head_node_type:
   name: head_node
19 changes: 19 additions & 0 deletions release/nightly_tests/dataset/autoscaling_gpu_compute.yaml
@@ -0,0 +1,19 @@
+# This config matches the default config for Anyscale workspaces with autoscaling,
+# except instead of using CPU instances, it uses GPU instances.
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 10
+
+head_node_type:
+  name: head_node
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
+
+worker_node_types:
+  - name: worker_node
+    instance_type: g4dn.2xlarge
+    min_workers: 0
+    max_workers: 10
+    use_spot: false
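Because this cluster starts with zero GPU workers (min_workers: 0) and scales on demand, tests no longer need a fixed-size cluster and a wait_for_nodes block. If a benchmark wants capacity warmed up before timing starts, one option is to ask the autoscaler for it explicitly. A minimal sketch, assuming a Ray cluster launched from this config; the bundle count of 10 mirrors max_workers and is illustrative:

import ray
from ray.autoscaler.sdk import request_resources

ray.init()

# Hint the autoscaler to scale toward 10 GPU workers up front, rather than
# waiting for queued tasks to trigger scale-up one node at a time.
request_resources(bundles=[{"GPU": 1}] * 10)

# ... run the benchmark ...

# Clear the hint so the cluster can scale back down when idle.
request_resources(bundles=[])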
23 changes: 23 additions & 0 deletions release/nightly_tests/dataset/autoscaling_hetero_compute.yaml
@@ -0,0 +1,23 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 20
+
+head_node_type:
+  name: head_node
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
+
+worker_node_types:
+  - name: worker_node_gpu
+    instance_type: g4dn.2xlarge
+    min_workers: 0
+    max_workers: 10
+    use_spot: false
+
+  - name: worker_node_cpu
+    instance_type: m5.2xlarge
+    min_workers: 0
+    max_workers: 10
+    use_spot: false
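With cpu: 0 on the head node, Ray schedules no work there; each task or actor lands in whichever worker pool advertises the resources it requests, and the autoscaler starts nodes in the matching pool. A minimal sketch of that placement behavior (the functions are illustrative, not from the benchmark):

import ray

ray.init()

# Requests a GPU, so it can only run in the g4dn (GPU) worker pool;
# the autoscaler launches a worker_node_gpu instance if none is up.
@ray.remote(num_gpus=1)
def infer(batch):
    return batch

# Requests CPUs only, so it runs in the m5 (CPU) worker pool.
@ray.remote(num_cpus=4)
def preprocess(batch):
    return [x * 2 for x in batch]

# Passing the ObjectRef chains the two stages; Ray resolves it automatically.
result = ray.get(infer.remote(preprocess.remote([1, 2, 3])))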
21 changes: 0 additions & 21 deletions release/nightly_tests/dataset/compute_hetero_10x10_aws.yaml

This file was deleted.

7 changes: 4 additions & 3 deletions release/nightly_tests/dataset/gpu_batch_inference.py
@@ -88,16 +88,17 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
     start_time_without_metadata_fetching = time.time()

     if smoke_test:
-        actor_pool_size = 4
+        compute = ActorPoolStrategy(size=4)
         num_gpus = 0
     else:
-        actor_pool_size = int(ray.cluster_resources().get("GPU"))
+        # Autoscale to use as many GPUs as possible.
+        compute = ActorPoolStrategy(min_size=1, max_size=None)
         num_gpus = 1
     ds = ds.map_batches(preprocess)
     ds = ds.map_batches(
         Predictor,
         batch_size=BATCH_SIZE,
-        compute=ActorPoolStrategy(size=actor_pool_size),
+        compute=compute,
         num_gpus=num_gpus,
         fn_constructor_kwargs={"model": model_ref},
         max_concurrency=2,
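The substance of this change: instead of sizing a fixed actor pool from the GPUs visible at startup (which undercounts on an autoscaling cluster that begins with zero GPU workers), the pool now autoscales from one actor upward as the cluster grows. A self-contained sketch of the same pattern, with a stand-in model and synthetic data rather than the benchmark's:

import numpy as np
import ray
from ray.data import ActorPoolStrategy

class Predictor:
    def __init__(self):
        # Stand-in for loading a real model once per actor.
        self.scale = 2.0

    def __call__(self, batch: dict) -> dict:
        # from_numpy() puts the array under the "data" column.
        batch["data"] = batch["data"] * self.scale
        return batch

ds = ray.data.from_numpy(np.ones((1024, 32, 32), dtype=np.float32))

# min_size=1, max_size=None: start with one actor and add more as
# GPUs join the cluster, instead of fixing the pool size up front.
ds = ds.map_batches(
    Predictor,
    batch_size=64,
    compute=ActorPoolStrategy(min_size=1, max_size=None),
    num_gpus=0,  # set to 1 on a GPU cluster, as the benchmark does
)
print(ds.materialize().count())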
210 changes: 29 additions & 181 deletions release/release_data_tests.yaml
@@ -7,8 +7,9 @@

 cluster:
   byod:
+    # 'type: gpu' means: use the 'ray-ml' image.
     type: gpu
-  cluster_compute: multi_node_autoscaling_compute.yaml
+  cluster_compute: autoscaling_cpu_compute.yaml

 ###############
 # Reading tests
@@ -33,31 +34,6 @@
     timeout: 600
     script: python read_and_consume_benchmark.py s3://ray-benchmark-data/parquet/10TiB --format parquet --count

-- name: stable_diffusion_benchmark
-  group: data-tests
-  working_dir: nightly_tests/dataset
-
-  frequency: nightly
-  team: data
-
-  cluster:
-    byod:
-      type: gpu
-      post_build_script: byod_stable_diffusion.sh
-    cluster_compute: stable_diffusion_benchmark_compute.yaml
-
-  run:
-    timeout: 1800
-    script: python stable_diffusion_benchmark.py
-
-  variations:
-    - __suffix__: aws
-    - __suffix__: gce
-      env: gce
-      frequency: manual
-      cluster:
-        cluster_compute: stable_diffusion_benchmark_compute_gce.yaml
-
 - name: streaming_data_ingest_benchmark_1tb
   group: data-tests
   working_dir: nightly_tests/dataset
@@ -545,177 +521,49 @@
         cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml


-############################
-# Batch Inference Benchmarks
-############################
-
-# 10 GB image classification raw images with 1 GPU.
-# 1 g4dn.4xlarge
-- name: torch_batch_inference_1_gpu_10gb_raw
-  group: data-tests
-  working_dir: nightly_tests/dataset
-
-  frequency: nightly
-  team: data
-  cluster:
-    byod:
-      type: gpu
-    cluster_compute: compute_gpu_1_cpu_16_aws.yaml
-
-  run:
-    timeout: 500
-    script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw
-
-  alert: default
-
-  variations:
-    - __suffix__: aws
-    - __suffix__: gce
-      env: gce
-      frequency: manual
-      cluster:
-        cluster_compute: compute_gpu_1_cpu_16_gce.yaml
-
-# 10 GB image classification parquet with 1 GPU.
-# 1 g4dn.4xlarge
-- name: torch_batch_inference_1_gpu_10gb_parquet
-  group: data-tests
-  working_dir: nightly_tests/dataset
-
-  frequency: nightly
-  team: data
-  cluster:
-    byod:
-      type: gpu
-    cluster_compute: compute_gpu_1_cpu_16_aws.yaml
-
-  run:
-    timeout: 500
-    script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet
-
-  alert: default
-
-  variations:
-    - __suffix__: aws
-    - __suffix__: gce
-      env: gce
-      frequency: manual
-      cluster:
-        cluster_compute: compute_gpu_1_cpu_16_gce.yaml
-
 #######################
 # Batch inference tests
 #######################

-# 300 GB image classification raw images with 16 GPUs
-# 4 g4dn.12xlarge
-- name: torch_batch_inference_16_gpu_300gb_raw
-  group: data-tests
-  working_dir: nightly_tests/dataset
+# 300 GB image classification parquet data up to 10 GPUs
+# 10 g4dn.12xlarge.
+- name: batch_inference

   frequency: nightly
   team: data
   cluster:
     byod:
       type: gpu
-    cluster_compute: compute_gpu_4x4_aws.yaml
+    cluster_compute: autoscaling_gpu_compute.yaml

   run:
-    timeout: 1000
-    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
-
-    wait_for_nodes:
-      num_nodes: 4
-
-  alert: default
-
-  variations:
-    - __suffix__: aws
-    - __suffix__: gce
-      env: gce
-      frequency: manual
-      cluster:
-        cluster_compute: compute_gpu_4x4_gce.yaml
+    timeout: 1800
+    script: >
+      python gpu_batch_inference.py
+      --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

-- name: chaos_torch_batch_inference_16_gpu_300gb_raw
-  group: data-tests
+- name: batch_inference_chaos
+  stable: False
   # Don't use 'nightly_tests/dataset' as the working directory because we need to run
   # the 'setup_chaos.py' script.
   working_dir: nightly_tests
-  stable: false

   frequency: nightly
   team: data
   cluster:
     byod:
       type: gpu
-    cluster_compute: dataset/compute_gpu_4x4_aws.yaml
+    cluster_compute: dataset/autoscaling_gpu_compute.yaml

   run:
-    timeout: 1000
+    timeout: 1800
     prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
-    script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
-
-    wait_for_nodes:
-      num_nodes: 4
-
-  alert: default
-
-  variations:
-    - __suffix__: aws
-    - __suffix__: gce
-      env: gce
-      frequency: manual
-      cluster:
-        cluster_compute: dataset/compute_gpu_4x4_gce.yaml
-
-
-# 300 GB image classification parquet data with 16 GPUs
-# 4 g4dn.12xlarge
-- name: torch_batch_inference_16_gpu_300gb_parquet
-  group: data-tests
-  working_dir: nightly_tests/dataset
-
-  frequency: nightly
-  team: data
-
-  cluster:
-    byod:
-      type: gpu
-    cluster_compute: compute_gpu_4x4_aws.yaml
-
-  run:
-    timeout: 1000
-    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
-
-    wait_for_nodes:
-      num_nodes: 4
-
-  alert: default
-
-  variations:
-    - __suffix__: aws
-    - __suffix__: gce
-      env: gce
-      frequency: manual
-      cluster:
-        cluster_compute: compute_gpu_4x4_gce.yaml
+    script: >
+      python dataset/gpu_batch_inference.py
+      --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

-# 10 TB image classification parquet data with heterogenous cluster
+# 10 TB image classification parquet data with autoscaling heterogenous cluster
 # 10 g4dn.12xlarge, 10 m5.16xlarge
-- name: torch_batch_inference_hetero_10tb_parquet
-  group: data-tests
-  working_dir: nightly_tests/dataset
-
+- name: batch_inference_hetero
   frequency: weekly
   team: data
-
   cluster:
     byod:
       type: gpu
-    cluster_compute: compute_hetero_10x10_aws.yaml
+    cluster_compute: autoscaling_hetero_compute.yaml

   run:
-    timeout: 2000
-    script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
-
-    wait_for_nodes:
-      num_nodes: 20
-
-  alert: default
+    timeout: 3600
+    script: >
+      python gpu_batch_inference.py
+      --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
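The rewritten script entries use YAML's folded block scalar (">"), which joins the wrapped lines with spaces so the command stays a single line. A quick check of that behavior (requires PyYAML; the snippet is illustrative):

import yaml

doc = """
run:
  script: >
    python gpu_batch_inference.py
    --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
"""
# Folded scalars join lines with spaces, so this prints one shell command.
print(yaml.safe_load(doc)["run"]["script"])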