chore: make final updates to include Ray v2.42.0 support on SDK, make 2.42.0 the default Ray version over 2.33.0

FUTURE_COPYBARA_INTEGRATE_REVIEW=#4990 from googleapis:release-please--branches--main dc59b4d PiperOrigin-RevId: 734649560
googleapis · Mar 7, 2025 · 98f5403 · 98f5403
1 parent 80cfc2f
commit 98f5403
Show file tree

Hide file tree

Showing 10 changed files with 109 additions and 62 deletions.
diff --git a/.kokoro/presubmit/unit_ray_2-42.cfg b/.kokoro/presubmit/unit_ray_2-42.cfg
@@ -0,0 +1,13 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Run unit tests for Ray 2.42.0 on Python 3.10
+env_vars: {
+    key: "NOX_SESSION"
+    value: "unit_ray(ray='2.42.0')"
+}
+
+# Run unit tests in parallel, splitting up by file
+env_vars: {
+    key: "PYTEST_ADDOPTS"
+    value: "-n=auto --dist=loadscope"
+}
diff --git a/google/cloud/aiplatform/vertex_ray/client_builder.py b/google/cloud/aiplatform/vertex_ray/client_builder.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2024 Google LLC
+# Copyright 2025 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@ def __init__(
                 persistent_resource_id,
                 " failed to start Head node properly.",
             )
-        if ray.__version__ == "2.33.0":
+        if ray.__version__ in ("2.42.0", "2.33.0"):
             super().__init__(
                 dashboard_url=dashboard_uri,
                 python_version=ray_client_context.python_version,
@@ -69,7 +69,7 @@ def __init__(
         else:
             raise ImportError(
                 f"[Ray on Vertex AI]: Unsupported version {ray.__version__}."
-                + "Only 2.33.0 and 2.9.3 are supported."
+                + "Only 2.42.0, 2.33.0, and 2.9.3 are supported."
             )
         self.persistent_resource_id = persistent_resource_id
         self.vertex_sdk_version = str(VERTEX_SDK_VERSION)

diff --git a/google/cloud/aiplatform/vertex_ray/cluster_init.py b/google/cloud/aiplatform/vertex_ray/cluster_init.py
@@ -54,7 +54,7 @@
 def create_ray_cluster(
     head_node_type: Optional[resources.Resources] = resources.Resources(),
     python_version: Optional[str] = "3.10",
-    ray_version: Optional[str] = "2.33",
+    ray_version: Optional[str] = "2.42",
     network: Optional[str] = None,
     service_account: Optional[str] = None,
     cluster_name: Optional[str] = None,
@@ -76,17 +76,17 @@ def create_ray_cluster(
     head_node_type = Resources(
         machine_type="n1-standard-8",
         node_count=1,
-        accelerator_type="NVIDIA_TESLA_K80",
+        accelerator_type="NVIDIA_TESLA_T4",
         accelerator_count=1,
-        custom_image="us-docker.pkg.dev/my-project/ray-cpu-image.2.9:latest",  # Optional
+        custom_image="us-docker.pkg.dev/my-project/ray-cpu-image.2.33:latest",  # Optional
     )
 
     worker_node_types = [Resources(
         machine_type="n1-standard-8",
         node_count=2,
-        accelerator_type="NVIDIA_TESLA_K80",
+        accelerator_type="NVIDIA_TESLA_T4",
         accelerator_count=1,
-        custom_image="us-docker.pkg.dev/my-project/ray-gpu-image.2.9:latest",  # Optional
+        custom_image="us-docker.pkg.dev/my-project/ray-gpu-image.2.33:latest",  # Optional
     )]
 
     cluster_resource_name = vertex_ray.create_ray_cluster(
@@ -95,7 +95,7 @@ def create_ray_cluster(
         service_account="[email protected]",  # Optional
         cluster_name="my-cluster-name",  # Optional
         worker_node_types=worker_node_types,
-        ray_version="2.9",
+        ray_version="2.33",
     )
 
     After a ray cluster is set up, you can call
@@ -109,7 +109,7 @@ def create_ray_cluster(
         head_node_type: The head node resource. Resources.node_count must be 1.
             If not set, default value of Resources() class will be used.
         python_version: Python version for the ray cluster.
-        ray_version: Ray version for the ray cluster. Default is 2.33.0.
+        ray_version: Ray version for the ray cluster. Default is 2.42.0.
         network: Virtual private cloud (VPC) network. For Ray Client, VPC
             peering is required to connect to the Ray Cluster managed in the
             Vertex API service. For Ray Job API, VPC network is not required
@@ -162,7 +162,7 @@ def create_ray_cluster(
     local_ray_verion = _validation_utils.get_local_ray_version()
     if ray_version != local_ray_verion:
         if custom_images is None and head_node_type.custom_image is None:
-            install_ray_version = "2.33.0"
+            install_ray_version = "2.42.0"
             logging.info(
                 "[Ray on Vertex]: Local runtime has Ray version %s"
                 ", but the requested cluster runtime has %s. Please "

diff --git a/google/cloud/aiplatform/vertex_ray/dashboard_sdk.py b/google/cloud/aiplatform/vertex_ray/dashboard_sdk.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2024 Google LLC
+# Copyright 2025 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,15 +29,15 @@ def get_job_submission_client_cluster_info(
     """A vertex_ray implementation of get_job_submission_client_cluster_info().
 
     Implements
-    https://github.com/ray-project/ray/blob/ray-2.33.0/python/ray/dashboard/modules/dashboard_sdk.py#L84
+    https://github.com/ray-project/ray/blob/ray-2.42.0/python/ray/dashboard/modules/dashboard_sdk.py#L84
     This will be called in from Ray Job API Python client.
 
     Args:
         address: Address without the module prefix `vertex_ray` but otherwise
             the same format as passed to ray.init(address="vertex_ray://...").
-        *args: Reminder of positional args that might be passed down from
+        *args: Remainder of positional args that might be passed down from
             the framework.
-        **kwargs: Reminder of keyword args that might be passed down from
+        **kwargs: Remainder of keyword args that might be passed down from
             the framework.
 
     Returns:

diff --git a/google/cloud/aiplatform/vertex_ray/util/resources.py b/google/cloud/aiplatform/vertex_ray/util/resources.py
@@ -69,17 +69,22 @@ class Resources:
 class NodeImages:
     """Custom images for a ray cluster.
 
-    We currently support Ray v2.9 and v2.33 and python v3.10.
+    We currently support Ray v2.9, v2.33, and v2.42 on Python v3.10.
+    Ray v2.42 is additionally supported on Python v3.11.
     The custom images must be extended from the following base images:
     "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest",
     "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-9.py310:latest",
-    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-33.py310:latest", or
-    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-33.py310:latest". In
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-33.py310:latest",
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-33.py310:latest",
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py310:latest",
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py310:latest",
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py311:latest", or
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py311:latest". In
     order to use custom images, need to specify both head and worker images.
 
     Attributes:
-        head: image for head node (eg. us-docker.pkg.dev/my-project/ray-cpu.2-9.py310-tf:latest).
-        worker: image for all worker nodes (eg. us-docker.pkg.dev/my-project/ray-gpu.2-9.py310-tf:latest).
+        head: image for head node (e.g. us-docker.pkg.dev/my-project/ray-cpu.2-33.py310-tf:latest).
+        worker: image for all worker nodes (e.g. us-docker.pkg.dev/my-project/ray-gpu.2-33.py310-tf:latest).
     """
 
     head: str = None
@@ -147,7 +152,7 @@ class Cluster:
             the cluster.
         state: Describes the cluster state (defined in PersistentResource.State).
         python_version: Python version for the ray cluster (e.g. "3.10").
-        ray_version: Ray version for the ray cluster (e.g. "2.9").
+        ray_version: Ray version for the ray cluster (e.g. "2.33").
         head_node_type: The head node resource. Resources.node_count must be 1.
             If not set, by default it is a CPU node with machine_type of n1-standard-8.
         worker_node_types: The list of Resources of the worker nodes. Should not

diff --git a/noxfile.py b/noxfile.py
@@ -253,7 +253,7 @@ def unit_genai_minimal_dependencies(session):
 
 
 @nox.session(python="3.10")
-@nox.parametrize("ray", ["2.9.3", "2.33.0"])
+@nox.parametrize("ray", ["2.9.3", "2.33.0", "2.42.0"])
 def unit_ray(session, ray):
     # Install all test dependencies, then install this package in-place.
 

diff --git a/setup.py b/setup.py
@@ -101,19 +101,18 @@
 preview_extra_require = []
 
 ray_extra_require = [
-    # Cluster only supports 2.9.3 and 2.33.0. Keep 2.4.0 for our testing environment.
+    # Cluster only supports 2.9.3, 2.33.0, and 2.42.0. Keep 2.4.0 for our
+    # testing environment.
     # Note that testing is submiting a job in a cluster with Ray 2.9.3 remotely.
     (
-        "ray[default] >= 2.4, <= 2.33.0,!= 2.5.*,!= 2.6.*,!= 2.7.*,!="
-        " 2.8.*,!=2.9.0,!=2.9.1,!=2.9.2, !=2.10.*, !=2.11.*, !=2.12.*, !=2.13.*, !="
-        " 2.14.*, !=2.15.*, !=2.16.*, !=2.17.*, !=2.18.*, !=2.19.*, !=2.20.*, !="
-        " 2.21.*, !=2.22.*, !=2.23.*, !=2.24.*, !=2.25.*, !=2.26.*, !=2.27.*, !="
-        " 2.28.*, !=2.29.*, !=2.30.*, !=2.31.*, !=2.32.*; python_version<'3.11'"
+        "ray[default] >= 2.9.3, <= 2.42.0,!=2.10.*, !=2.11.*, !=2.12.*, !="
+        " 2.13.*, !=2.14.*, !=2.15.*, !=2.16.*, !=2.17.*, !=2.18.*, !=2.19.*, !="
+        " 2.20.*, !=2.21.*, !=2.22.*, !=2.23.*, !=2.24.*, !=2.25.*, !=2.26.*, !="
+        " 2.27.*, !=2.28.*, !=2.29.*, !=2.30.*, !=2.31.*, !=2.32.*, !=2.34.*, !="
+        " 2.35.*, !=2.36.*, !=2.37.*, !=2.38.*, !=2.39.*, !=2.40.*, !=2.41.*"
     ),
     # To avoid  ImportError: cannot import name 'packaging' from 'pkg_resources'
     "setuptools < 70.0.0",
-    # Ray Data v2.4 in Python 3.11 is broken, but got fixed in Ray v2.5.
-    "ray[default] >= 2.5, <= 2.33.0; python_version=='3.11'",
     "google-cloud-bigquery-storage",
     "google-cloud-bigquery",
     "pandas >= 1.0.0",

diff --git a/testing/constraints-ray-2.42.0.txt b/testing/constraints-ray-2.42.0.txt
@@ -0,0 +1,13 @@
+ray==2.42.0
+# Below constraints are inherited from constraints-3.10.txt
+google-api-core
+proto-plus==1.22.3
+protobuf
+mock==4.0.2
+google-cloud-storage==2.2.1 # Increased for kfp 2.0 compatibility
+packaging==24.1 # Increased to unbreak canonicalize_version error (b/377774673)
+grpcio-testing==1.34.0
+mlflow==1.30.1 # Pinned to speed up installation
+pytest-xdist==3.3.1 # Pinned to unbreak unit tests
+IPython # Added to test supernova rich html buttons
+
diff --git a/tests/unit/vertex_ray/test_cluster_init.py b/tests/unit/vertex_ray/test_cluster_init.py
@@ -290,7 +290,7 @@ def cluster_eq(returned_cluster, expected_cluster):
     assert returned_cluster.state == expected_cluster.state
 
 
-@pytest.mark.parametrize("ray_version", ["2.9", "2.33"])
+@pytest.mark.parametrize("ray_version", ["2.9", "2.33", "2.42"])
 @pytest.mark.usefixtures("google_auth_mock", "get_project_number_mock")
 class TestClusterManagement:
     def setup_method(self, ray_version):
@@ -317,14 +317,16 @@ def test_create_ray_cluster_1_pool_gpu_success(
         assert tc.ClusterConstants.TEST_VERTEX_RAY_PR_ADDRESS == cluster_name
 
         test_persistent_resource = tc.ClusterConstants.TEST_REQUEST_RUNNING_1_POOL
+
         if ray_version == "2.9":
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+        elif ray_version == "2.33":
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
         else:
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_42
+        test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
+            "head-node"
+        ] = head_node_image
 
         request = persistent_resource_service.CreatePersistentResourceRequest(
             parent=tc.ProjectConstants.TEST_PARENT,
@@ -388,14 +390,17 @@ def test_create_ray_cluster_1_pool_gpu_with_labels_success(
         test_persistent_resource = (
             tc.ClusterConstants.TEST_REQUEST_RUNNING_1_POOL_WITH_LABELS
         )
+
         if ray_version == "2.9":
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+        elif ray_version == "2.33":
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
         else:
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_42
+
+        test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
+            "head-node"
+        ] = head_node_image
 
         request = persistent_resource_service.CreatePersistentResourceRequest(
             parent=tc.ProjectConstants.TEST_PARENT,
@@ -449,20 +454,23 @@ def test_create_ray_cluster_2_pools_success(
         )
 
         test_persistent_resource = tc.ClusterConstants.TEST_REQUEST_RUNNING_2_POOLS
+
         if ray_version == "2.9":
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_CPU_IMAGE_2_9
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "worker-pool1"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+            head_node_image = tc.ClusterConstants.TEST_CPU_IMAGE_2_9
+            worker_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+        elif ray_version == "2.33":
+            head_node_image = tc.ClusterConstants.TEST_CPU_IMAGE_2_33
+            worker_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
         else:
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_CPU_IMAGE_2_33
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "worker-pool1"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
+            head_node_image = tc.ClusterConstants.TEST_CPU_IMAGE_2_42
+            worker_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_42
+
+        test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
+            "head-node"
+        ] = head_node_image
+        test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
+            "worker-pool1"
+        ] = worker_node_image
 
         assert tc.ClusterConstants.TEST_VERTEX_RAY_PR_ADDRESS == cluster_name
         request = persistent_resource_service.CreatePersistentResourceRequest(
@@ -526,14 +534,17 @@ def test_create_ray_cluster_byosa_success(
         assert tc.ClusterConstants.TEST_VERTEX_RAY_PR_ADDRESS == cluster_name
 
         test_persistent_resource = tc.ClusterConstants.TEST_REQUEST_RUNNING_1_POOL_BYOSA
+
         if ray_version == "2.9":
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_9
+        elif ray_version == "2.33":
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
         else:
-            test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-                "head-node"
-            ] = tc.ClusterConstants.TEST_GPU_IMAGE_2_33
+            head_node_image = tc.ClusterConstants.TEST_GPU_IMAGE_2_42
+
+        test_persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
+            "head-node"
+        ] = head_node_image
 
         request = persistent_resource_service.CreatePersistentResourceRequest(
             parent=tc.ProjectConstants.TEST_PARENT,

diff --git a/tests/unit/vertex_ray/test_constants.py b/tests/unit/vertex_ray/test_constants.py
@@ -68,7 +68,7 @@
 )
 
 predictionrayversion = pytest.mark.skipif(
-    ray.__version__ != "2.9.3", reason="Not currently supported on Ray 2.33"
+    ray.__version__ != "2.9.3", reason="Not currently supported on Ray 2.33 or Ray 2.42"
 )
 
 
@@ -123,6 +123,12 @@ class ClusterConstants:
     TEST_GPU_IMAGE_2_33 = (
         "us-docker.pkg.dev/vertex-ai/training/ray-gpu.2-33.py310:latest"
     )
+    TEST_CPU_IMAGE_2_42 = (
+        "us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-42.py310:latest"
+    )
+    TEST_GPU_IMAGE_2_42 = (
+        "us-docker.pkg.dev/vertex-ai/training/ray-gpu.2-42.py310:latest"
+    )
     TEST_CUSTOM_IMAGE = "us-docker.pkg.dev/my-project/ray-custom-image.2.9:latest"
     TEST_PSC_NETWORK_ATTACHMENT = "my-network-attachment"
     # RUNNING Persistent Cluster w/o Ray