[Data] Update batch inference release tests #49012

Merged
12 commits merged on Dec 3, 2024
@@ -2,7 +2,7 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 0
max_workers: 10
Member Author commented:
Not sure why this wasn't an issue before. Maybe it's because we override max_workers for the specific node type (see the sketch after this hunk).


head_node_type:
  name: head_node
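For context on the comment above: the worker cap appears in two places in these compute configs, and the comment's guess is that the per-node-type value was masking the global one. A minimal sketch of the two fields, not part of this PR's diff (the instance type is hypothetical, and the exact precedence between the two caps is precisely what the comment is unsure about):

max_workers: 10                   # global cap on worker nodes for the cluster

worker_node_types:
  - name: worker_node
    instance_type: m5.2xlarge     # hypothetical instance type for illustration
    min_workers: 0
    max_workers: 10               # per-node-type cap; possibly what made a global
                                  # value of 0 harmless before this change
    use_spot: false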
19 changes: 19 additions & 0 deletions release/nightly_tests/dataset/autoscaling_gpu_compute.yaml
@@ -0,0 +1,19 @@
# This config matches the default config for Anyscale workspaces with autoscaling,
# except instead of using CPU instances, it uses GPU instances.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
  name: head_node
  instance_type: g4dn.2xlarge
  resources:
    cpu: 0

worker_node_types:
  - name: worker_node
    instance_type: g4dn.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false
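The header comment describes this file as the GPU variant of the default autoscaling config. The CPU counterpart it alludes to, autoscaling_cpu_compute.yaml (referenced later in this diff but not shown), presumably mirrors it with a CPU instance type; the sketch below is an assumption, with m5.2xlarge standing in for the actual instance type:

cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
  name: head_node
  instance_type: m5.2xlarge   # assumed; the actual file may differ
  resources:
    cpu: 0

worker_node_types:
  - name: worker_node
    instance_type: m5.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false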
23 changes: 23 additions & 0 deletions release/nightly_tests/dataset/autoscaling_hetero_compute.yaml
@@ -0,0 +1,23 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 20

head_node_type:
  name: head_node
  instance_type: m5.2xlarge
  resources:
    cpu: 0

worker_node_types:
  - name: worker_node_gpu
    instance_type: g4dn.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false

  - name: worker_node_cpu
    instance_type: m5.2xlarge
    min_workers: 0
    max_workers: 10
    use_spot: false
21 changes: 0 additions & 21 deletions release/nightly_tests/dataset/compute_hetero_10x10_aws.yaml

This file was deleted.

161 changes: 15 additions & 146 deletions release/release_data_tests.yaml
@@ -7,8 +7,9 @@

cluster:
byod:
# 'type: gpu' means: use the 'ray-ml' image.
type: gpu
cluster_compute: multi_node_autoscaling_compute.yaml
cluster_compute: autoscaling_cpu_compute.yaml

###############
# Reading tests
@@ -545,177 +546,45 @@
cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml


############################
# Batch Inference Benchmarks
############################
#######################
# Batch inference tests
#######################

# 10 GB image classification raw images with 1 GPU.
# 1 g4dn.4xlarge
- name: torch_batch_inference_1_gpu_10gb_raw
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_1_cpu_16_aws.yaml

run:
timeout: 500
script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_1_cpu_16_gce.yaml
# 300 GB image classification parquet data with 0-10 GPUs
# 0-10 g4dn.12xlarge.
- name: batch_inference

# 10 GB image classification parquet with 1 GPU.
# 1 g4dn.4xlarge
- name: torch_batch_inference_1_gpu_10gb_parquet
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_1_cpu_16_aws.yaml

run:
timeout: 500
script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_1_cpu_16_gce.yaml


# 300 GB image classification raw images with 16 GPUs
# 4 g4dn.12xlarge
- name: torch_batch_inference_16_gpu_300gb_raw
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_4x4_aws.yaml
cluster_compute: autoscaling_gpu_compute.yaml

run:
timeout: 1000
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_4x4_gce.yaml
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
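Because the copied diff interleaves removed and added lines, the consolidated entry is easier to read in one piece. Reconstructed from the added lines above — indentation and the exact set of retained fields are assumptions, since the diff view drops the +/- markers:

- name: batch_inference
  group: data-tests
  working_dir: nightly_tests/dataset
  frequency: nightly
  team: data
  cluster:
    byod:
      type: gpu
    cluster_compute: autoscaling_gpu_compute.yaml
  run:
    timeout: 1000
    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

The batch_inference_chaos and batch_inference_hetero entries below follow the same shape, adding the chaos prepare step and swapping in the heterogeneous compute config, respectively.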

- name: batch_inference_chaos

- name: chaos_torch_batch_inference_16_gpu_300gb_raw
group: data-tests
working_dir: nightly_tests
stable: false

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: dataset/compute_gpu_4x4_aws.yaml
cluster_compute: dataset/autoscaling_gpu_compute.yaml

run:
timeout: 1000
prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: dataset/compute_gpu_4x4_gce.yaml


# 300 GB image classification parquet data with 16 GPUs
# 4 g4dn.12xlarge
- name: torch_batch_inference_16_gpu_300gb_parquet
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data

cluster:
byod:
type: gpu
cluster_compute: compute_gpu_4x4_aws.yaml

run:
timeout: 1000
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_4x4_gce.yaml
script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

# 10 TB image classification parquet data with heterogeneous cluster
# 10 g4dn.12xlarge, 10 m5.16xlarge
- name: torch_batch_inference_hetero_10tb_parquet
group: data-tests
working_dir: nightly_tests/dataset

- name: batch_inference_hetero
frequency: weekly
team: data

cluster:
byod:
type: gpu
cluster_compute: compute_hetero_10x10_aws.yaml
cluster_compute: autoscaling_hetero_compute.yaml

run:
timeout: 2000
script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet

wait_for_nodes:
num_nodes: 20

alert: default