Add v6e TPU Head Resource Autoscaling Support #48201

Merged: 9 commits, Dec 3, 2024
7 changes: 4 additions & 3 deletions python/ray/autoscaler/_private/kuberay/utils.py
@@ -11,6 +11,7 @@
     "tpu-v5-lite-device": "v5e",
     "tpu-v5-lite-podslice": "v5e",
     "tpu-v5p-slice": "v5p",
+    "tpu-v6e-slice": "v6e",
 }


@@ -102,9 +103,9 @@ def tpu_node_selectors_to_type(topology: str, accelerator: str) -> Optional[str]
         # Reduce e.g. "2x2x2" to 8
         chip_dimensions = [int(chip_count) for chip_count in topology.split("x")]
         num_chips = reduce(lambda x, y: x * y, chip_dimensions)
-        default_num_cores_per_chip = 2
-        if generation == "v5e":
-            default_num_cores_per_chip = 1
+        default_num_cores_per_chip = 1
Member:
How can I determine the exact value of default_num_cores_per_chip for each generation, so I can verify that this logic is correct? I searched https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run for the relevant keywords but couldn't find that information.

Contributor Author:
The relation between chips and cores for each TPU generation is documented in the Cloud TPU documentation for each version: under "System Architecture" it lists the number of TensorCores per TPU chip. A short illustrative sketch of the resulting mapping follows this hunk.

if generation == "v4" or generation == "v5p":
default_num_cores_per_chip = 2
num_cores = num_chips * default_num_cores_per_chip
return f"{generation}-{num_cores}"
return None
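To make the core-count logic concrete, here is a small illustrative sketch (not part of the diff) of how tpu_node_selectors_to_type is expected to behave, using the per-generation TensorCore counts described above; the expected strings mirror the parametrized test cases added below.

from ray.autoscaler._private.kuberay.utils import tpu_node_selectors_to_type

# TensorCores per chip, per the Cloud TPU "System Architecture" docs:
# v4 and v5p have 2 cores per chip; v5e and v6e have 1.
assert tpu_node_selectors_to_type("2x2x2", "tpu-v4-podslice") == "v4-16"  # 8 chips x 2 cores
assert tpu_node_selectors_to_type("2x4", "tpu-v5-lite-podslice") == "v5e-8"  # 8 chips x 1 core
assert tpu_node_selectors_to_type("16x16", "tpu-v6e-slice") == "v6e-256"  # 256 chips x 1 core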
71 changes: 71 additions & 0 deletions python/ray/tests/kuberay/test_autoscaling_config.py
@@ -16,6 +16,8 @@
    _get_custom_resources,
)

from ray.autoscaler._private.kuberay.utils import tpu_node_selectors_to_type

AUTOSCALING_CONFIG_MODULE_PATH = "ray.autoscaler._private.kuberay.autoscaling_config"


@@ -402,6 +404,75 @@ def _fetch_ray_cr_from_k8s(self) -> Dict[str, Any]:
assert out == {"ok-key": "ok-value"}


TPU_TYPES_ARGS = ",".join(
[
"accelerator",
"topology",
"expected_tpu_type",
]
)
TPU_TYPES_DATA = (
[]
if platform.system() == "Windows"
else [
pytest.param(
"tpu-v4-podslice",
None,
None,
id="tpu-none-topology",
),
pytest.param(
None,
"2x2x2",
None,
id="tpu-none-accelerator",
),
pytest.param(
"tpu-v4-podslice",
"2x2x2",
"v4-16",
id="tpu-v4-test",
),
pytest.param(
"tpu-v5-lite-device",
"2x2",
"v5e-4",
id="tpu-v5e-device-test",
),
pytest.param(
"tpu-v5-lite-podslice",
"2x4",
"v5e-8",
id="tpu-v5e-podslice-test",
),
pytest.param(
"tpu-v5p-slice",
"2x2x4",
"v5p-32",
id="tpu-v5p-test",
),
pytest.param(
"tpu-v6e-slice",
"16x16",
"v6e-256",
id="tpu-v6e-test",
),
]
)


@pytest.mark.skipif(platform.system() == "Windows", reason="Not relevant.")
@pytest.mark.parametrize(TPU_TYPES_ARGS, TPU_TYPES_DATA)
def test_tpu_node_selectors_to_type(
    accelerator: str, topology: str, expected_tpu_type: str
):
    """Verify that tpu_node_selectors_to_type correctly returns TPU type from
    TPU nodeSelectors.
    """
    tpu_type = tpu_node_selectors_to_type(topology, accelerator)
    assert expected_tpu_type == tpu_type
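These parametrized cases cover each accelerator-to-generation mapping in utils.py, including the new tpu-v6e-slice entry; the v6e case, for instance, resolves the 16x16 topology to 256 chips and, at one TensorCore per chip, to "v6e-256". As a usage note, the cases can be run in isolation from a Ray development checkout with pytest python/ray/tests/kuberay/test_autoscaling_config.py -k test_tpu_node_selectors_to_type.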


TPU_PARAM_ARGS = ",".join(
[
"ray_cr_in",