diff --git a/.buildkite/core.rayci.yml b/.buildkite/core.rayci.yml index 09005593c319..9c474bb66276 100644 --- a/.buildkite/core.rayci.yml +++ b/.buildkite/core.rayci.yml @@ -321,7 +321,7 @@ steps: commands: - bazel run //ci/ray_ci:test_in_docker -- //... core --run-flaky-tests --build-type clang - --parallelism-per-worker 2 --gpus 2 + --gpus 4 --build-name coregpubuild --only-tags multi_gpu depends_on: coregpubuild diff --git a/.buildkite/others.rayci.yml b/.buildkite/others.rayci.yml index 16508b6f7864..790266fbf4d2 100644 --- a/.buildkite/others.rayci.yml +++ b/.buildkite/others.rayci.yml @@ -1,12 +1,8 @@ group: others depends_on: - forge - - oss-ci-base_build steps: - #build - - name: doctestbuild - wanda: ci/docker/doctest.build.wanda.yaml - + # dependencies - label: ":tapioca: build: pip-compile dependencies" key: pip_compile_dependencies instance_type: small @@ -19,10 +15,13 @@ steps: - cp -f ./python/requirements_compiled.txt /artifact-mount/ soft_fail: true job_env: oss-ci-base_test-py3.11 - depends_on: - - oss-ci-base_test-multipy + depends_on: oss-ci-base_test-multipy + + # docs + - name: doctestbuild + wanda: ci/docker/doctest.build.wanda.yaml + depends_on: oss-ci-base_build - # test - label: doc tests instance_type: large commands: @@ -40,6 +39,7 @@ steps: --skip-ray-installation depends_on: doctestbuild + # java - label: ":java: java tests" tags: java instance_type: medium @@ -48,7 +48,7 @@ steps: - docker run -i --rm --volume /tmp/artifacts:/artifact-mount --shm-size=2.5gb "$${RAYCI_WORK_REPO}":"$${RAYCI_BUILD_ID}"-corebuild /bin/bash -iecuo pipefail "./java/test.sh" - depends_on: [ "corebuild", "forge" ] + depends_on: corebuild # bot - label: ":robot_face: CI weekly green metric" diff --git a/.vale/styles/config/vocabularies/Data/accept.txt b/.vale/styles/config/vocabularies/Data/accept.txt index 8ec78bd70bce..1104d6f3cd41 100644 --- a/.vale/styles/config/vocabularies/Data/accept.txt +++ b/.vale/styles/config/vocabularies/Data/accept.txt @@ -7,6 +7,7 @@ Data('s)? [Dd]iscretizer(s)? dtype [Gg]roupby +[Hh]udi [Ii]ndexable [Ii]ngest [Ii]nqueue(s)? 
diff --git a/BUILD.bazel b/BUILD.bazel index 03c002fc1256..f30037472d39 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1627,7 +1627,7 @@ ray_cc_test( deps = [ ":gcs_server_lib", ":gcs_test_util_lib", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -1649,7 +1649,7 @@ ray_cc_test( deps = [ ":gcs_server_lib", ":gcs_test_util_lib", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -1883,7 +1883,7 @@ ray_cc_test( ":gcs_table_storage_test_lib", ":gcs_test_util_lib", ":store_client_test_lib", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -2403,11 +2403,43 @@ ray_cc_test( ) ray_cc_test( - name = "gcs_export_event_test", + name = "gcs_job_manager_export_event_test", size = "small", - srcs = glob([ - "src/ray/gcs/gcs_server/test/export_api/*.cc", - ]), + srcs = ["src/ray/gcs/gcs_server/test/export_api/gcs_job_manager_export_event_test.cc"], + tags = [ + "no_windows", + "team:core" + ], + deps = [ + ":gcs_server_lib", + ":gcs_server_test_util", + ":gcs_test_util_lib", + ":ray_mock", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "gcs_actor_manager_export_event_test", + size = "small", + srcs = ["src/ray/gcs/gcs_server/test/export_api/gcs_actor_manager_export_event_test.cc"], + tags = [ + "no_windows", + "team:core" + ], + deps = [ + ":gcs_server_lib", + ":gcs_server_test_util", + ":gcs_test_util_lib", + ":ray_mock", + "@com_google_googletest//:gtest_main", + ], +) + +ray_cc_test( + name = "gcs_node_manager_export_event_test", + size = "small", + srcs = ["src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc"], tags = [ "no_windows", "team:core" diff --git a/ci/docker/ray-ml.cpu.base.wanda.yaml b/ci/docker/ray-ml.cpu.base.wanda.yaml index 53dc0700a973..29838a2a3c98 100644 --- a/ci/docker/ray-ml.cpu.base.wanda.yaml +++ b/ci/docker/ray-ml.cpu.base.wanda.yaml @@ -3,7 +3,6 @@ froms: ["cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cpu-base"] dockerfile: docker/ray-ml/Dockerfile srcs: - python/requirements.txt - - python/requirements_compiled.txt - python/requirements/ml/dl-cpu-requirements.txt - python/requirements/ml/dl-gpu-requirements.txt - python/requirements/ml/core-requirements.txt diff --git a/ci/docker/ray-ml.cuda.base.wanda.yaml b/ci/docker/ray-ml.cuda.base.wanda.yaml index 723374e90210..b3aa908c4b5f 100644 --- a/ci/docker/ray-ml.cuda.base.wanda.yaml +++ b/ci/docker/ray-ml.cuda.base.wanda.yaml @@ -3,7 +3,6 @@ froms: ["cr.ray.io/rayproject/ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base"] dockerfile: docker/ray-ml/Dockerfile srcs: - python/requirements.txt - - python/requirements_compiled.txt - python/requirements/ml/dl-cpu-requirements.txt - python/requirements/ml/dl-gpu-requirements.txt - python/requirements/ml/core-requirements.txt diff --git a/ci/docker/ray.cpu.base.aarch64.wanda.yaml b/ci/docker/ray.cpu.base.aarch64.wanda.yaml index 43321ccb7ba5..1726fb261825 100644 --- a/ci/docker/ray.cpu.base.aarch64.wanda.yaml +++ b/ci/docker/ray.cpu.base.aarch64.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cpu-base-aarch64" froms: ["ubuntu:22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=ubuntu:22.04 diff --git a/ci/docker/ray.cpu.base.wanda.yaml b/ci/docker/ray.cpu.base.wanda.yaml index 4310a1820957..895605ed8f71 100644 --- a/ci/docker/ray.cpu.base.wanda.yaml +++ b/ci/docker/ray.cpu.base.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cpu-base" froms: 
["ubuntu:22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=ubuntu:22.04 diff --git a/ci/docker/ray.cuda.base.aarch64.wanda.yaml b/ci/docker/ray.cuda.base.aarch64.wanda.yaml index 51fe8a870814..1d1d6df12787 100644 --- a/ci/docker/ray.cuda.base.aarch64.wanda.yaml +++ b/ci/docker/ray.cuda.base.aarch64.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base-aarch64" froms: ["nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04 diff --git a/ci/docker/ray.cuda.base.wanda.yaml b/ci/docker/ray.cuda.base.wanda.yaml index 3b2cbf4c3081..0bcd7611c921 100644 --- a/ci/docker/ray.cuda.base.wanda.yaml +++ b/ci/docker/ray.cuda.base.wanda.yaml @@ -1,6 +1,8 @@ name: "ray-py$PYTHON_VERSION-cu$CUDA_VERSION-base" froms: ["nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04"] dockerfile: docker/base-deps/Dockerfile +srcs: + - python/requirements_compiled.txt build_args: - PYTHON_VERSION - BASE_IMAGE=nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04 diff --git a/ci/env/install-core-prerelease-dependencies.sh b/ci/env/install-core-prerelease-dependencies.sh index 498ecd024581..55ba3b1e55f9 100755 --- a/ci/env/install-core-prerelease-dependencies.sh +++ b/ci/env/install-core-prerelease-dependencies.sh @@ -5,7 +5,5 @@ set -e # install all unbounded dependencies in setup.py for ray core # TOOD(scv119) reenable grpcio once https://github.com/grpc/grpc/issues/31885 is fixed. # TOOD(scv119) reenable jsonschema once https://github.com/ray-project/ray/issues/33411 is fixed. -for dependency in aiosignal frozenlist requests protobuf -do - python -m pip install -U --pre --upgrade-strategy=eager $dependency -done +DEPS=(aiosignal frozenlist requests protobuf) +python -m pip install -U --pre --upgrade-strategy=eager "${DEPS[@]}" diff --git a/doc/source/cluster/configure-manage-dashboard.md b/doc/source/cluster/configure-manage-dashboard.md index ce8eb9c9e941..885357ce38b2 100644 --- a/doc/source/cluster/configure-manage-dashboard.md +++ b/doc/source/cluster/configure-manage-dashboard.md @@ -5,7 +5,7 @@ Dashboard configurations may differ depending on how you launch Ray Clusters (e.g., local Ray Cluster v.s. KubeRay). Integrations with Prometheus and Grafana are optional for enhanced Dashboard experience. :::{note} -Ray Dashboard is only intended for interactive development and debugging because the Dashboard UI and the underlying data are not accessible after Clusters are terminated. For production monitoring and debugging, users should rely on [persisted logs](../cluster/kubernetes/user-guides/logging.md), [persisted metrics](./metrics.md), [persisted Ray states](../ray-observability/user-guides/cli-sdk.rst), and other observability tools. +Ray Dashboard is useful for interactive development and debugging because when clusters terminate, the dashboard UI and the underlying data are no longer accessible. For production monitoring and debugging, you should rely on [persisted logs](../cluster/kubernetes/user-guides/persist-kuberay-custom-resource-logs.md), [persisted metrics](./metrics.md), [persisted Ray states](../ray-observability/user-guides/cli-sdk.rst), and other observability tools. 
::: ## Changing the Ray Dashboard port diff --git a/doc/source/cluster/kubernetes/configs/loki.log.yaml b/doc/source/cluster/kubernetes/configs/loki.log.yaml new file mode 100644 index 000000000000..07ab28d13344 --- /dev/null +++ b/doc/source/cluster/kubernetes/configs/loki.log.yaml @@ -0,0 +1,46 @@ +# Fluent Bit Config +config: + inputs: | + [INPUT] + Name tail + Path /var/log/containers/*.log + multiline.parser docker, cri + Tag kube.* + Mem_Buf_Limit 5MB + Skip_Long_Lines On + + filters: | + [FILTER] + Name kubernetes + Match kube.* + Merge_Log On + Keep_Log Off + K8S-Logging.Parser On + K8S-Logging.Exclude On + + outputs: | + [OUTPUT] + Name loki + Match * + Host loki-gateway + Port 80 + Labels job=fluent-bit,namespace=$kubernetes['namespace_name'],pod=$kubernetes['pod_name'],container=$kubernetes['container_name'] + Auto_Kubernetes_Labels Off + tenant_id test +--- +# Grafana Datasource Config +datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Loki + type: loki + access: proxy + editable: true + url: http://loki-gateway.default + jsonData: + timeout: 60 + maxLines: 1000 + httpHeaderName1: "X-Scope-OrgID" + secureJsonData: + httpHeaderValue1: "test" diff --git a/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml b/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml index 5a2d01839e9b..b42a7cf10a06 100644 --- a/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml +++ b/doc/source/cluster/kubernetes/configs/ray-cluster.gpu.yaml @@ -12,7 +12,7 @@ spec: ######################headGroupSpec################################# # head group template and specs, (perhaps 'group' is not needed in the name) headGroupSpec: - # logical group name, for this called head-group, also can be functional + # logical group name, for this called headgroup, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup # the following params are used to complete the ray start: ray start --head --block ... diff --git a/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md b/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md index 1915081b0717..6275564a9ea7 100644 --- a/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md +++ b/doc/source/cluster/kubernetes/getting-started/raycluster-quick-start.md @@ -35,11 +35,12 @@ kubectl get pods # kuberay-operator-7fbdbf8c89-pt8bk 1/1 Running 0 27s ``` -KubeRay offers multiple options for operator installations, such as Helm, Kustomize, and a single-namespaced operator. For further information, please refer to [the installation instructions in the KubeRay documentation](https://ray-project.github.io/kuberay/deploy/installation/). +KubeRay offers multiple options for operator installations, such as Helm, Kustomize, and a single-namespaced operator. For further information, see [the installation instructions in the KubeRay documentation](https://ray-project.github.io/kuberay/deploy/installation/). +(raycluster-deploy)= ## Step 3: Deploy a RayCluster custom resource -Once the KubeRay operator is running, we are ready to deploy a RayCluster. To do so, we create a RayCluster Custom Resource (CR) in the `default` namespace. +Once the KubeRay operator is running, you're ready to deploy a RayCluster. Create a RayCluster Custom Resource (CR) in the `default` namespace. 
::::{tab-set} diff --git a/doc/source/cluster/kubernetes/user-guides.md b/doc/source/cluster/kubernetes/user-guides.md index bb8713a51822..0b2b49639949 100644 --- a/doc/source/cluster/kubernetes/user-guides.md +++ b/doc/source/cluster/kubernetes/user-guides.md @@ -15,7 +15,8 @@ user-guides/config user-guides/configuring-autoscaling user-guides/kuberay-gcs-ft user-guides/gke-gcs-bucket -user-guides/logging +user-guides/persist-kuberay-custom-resource-logs +user-guides/persist-kuberay-operator-logs user-guides/gpu user-guides/tpu user-guides/rayserve-dev-doc @@ -45,7 +46,8 @@ at the {ref}`introductory guide ` first. * {ref}`kuberay-gpu` * {ref}`kuberay-tpu` * {ref}`kuberay-gcs-ft` -* {ref}`kuberay-logging` +* {ref}`persist-kuberay-custom-resource-logs` +* {ref}`persist-kuberay-operator-logs` * {ref}`kuberay-dev-serve` * {ref}`kuberay-pod-command` * {ref}`kuberay-pod-security` diff --git a/doc/source/cluster/kubernetes/user-guides/config.md b/doc/source/cluster/kubernetes/user-guides/config.md index 5ca8df8f1c73..e4b18aaa4dea 100644 --- a/doc/source/cluster/kubernetes/user-guides/config.md +++ b/doc/source/cluster/kubernetes/user-guides/config.md @@ -126,7 +126,7 @@ Here are some of the subfields of the pod `template` to pay attention to: #### containers A Ray pod template specifies at minimum one container, namely the container that runs the Ray processes. A Ray pod template may also specify additional sidecar -containers, for purposes such as {ref}`log processing `. However, the KubeRay operator assumes that +containers, for purposes such as {ref}`log processing `. However, the KubeRay operator assumes that the first container in the containers list is the main Ray container. Therefore, make sure to specify any sidecar containers **after** the main Ray container. In other words, the Ray container should be the **first** diff --git a/doc/source/cluster/kubernetes/user-guides/images/loki-logs.png b/doc/source/cluster/kubernetes/user-guides/images/loki-logs.png new file mode 100644 index 000000000000..2419cf7ca8f0 Binary files /dev/null and b/doc/source/cluster/kubernetes/user-guides/images/loki-logs.png differ diff --git a/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md b/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md index dd0000049140..a54161faf82c 100644 --- a/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md +++ b/doc/source/cluster/kubernetes/user-guides/kuberay-gcs-ft.md @@ -27,7 +27,7 @@ See {ref}`Ray Serve end-to-end fault tolerance documentation ` where `<PID>` is the PID of the Prometheus process that was printed out when you ran the command. To find the PID, you can also run `ps aux | grep prometheus`. +To stop Prometheus, run the following commands: + +```sh +# case 1: Ray > 2.40 +ray metrics shutdown-prometheus + +# case 2: Otherwise +# Run `ps aux | grep prometheus` to find the PID of the Prometheus process. Then, kill the process. +kill <PID> +``` + ### [Optional] Manual: Running Prometheus locally diff --git a/doc/source/conf.py b/doc/source/conf.py index 98bed502ee8c..d8ae19629647 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -74,6 +74,8 @@ ] # Configuration for algolia +# Note: This API key grants read access to our indexes and is intended to be public. +# See https://www.algolia.com/doc/guides/security/api-keys/ for more information.
docsearch_app_id = "LBHF0PABBL" docsearch_api_key = "6c42f30d9669d8e42f6fc92f44028596" docsearch_index_name = "docs-ray" diff --git a/doc/source/custom_directives.py b/doc/source/custom_directives.py index 6e81d401c833..2683160332d7 100644 --- a/doc/source/custom_directives.py +++ b/doc/source/custom_directives.py @@ -481,6 +481,7 @@ def key(cls: type) -> str: class Framework(ExampleEnum): """Framework type for example metadata.""" + AWSNEURON = "AWS Neuron" PYTORCH = "PyTorch" LIGHTNING = "Lightning" TRANSFORMERS = "Transformers" diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst index bb8d791d98b2..51bd7ecedb13 100644 --- a/doc/source/data/api/input_output.rst +++ b/doc/source/data/api/input_output.rst @@ -186,6 +186,15 @@ Delta Sharing read_delta_sharing_tables +Hudi +---- + +.. autosummary:: + :nosignatures: + :toctree: doc/ + + read_hudi + Iceberg ------- diff --git a/doc/source/ray-more-libs/dask-on-ray.rst b/doc/source/ray-more-libs/dask-on-ray.rst index 3e130bfcaa35..1e6ae2f39129 100644 --- a/doc/source/ray-more-libs/dask-on-ray.rst +++ b/doc/source/ray-more-libs/dask-on-ray.rst @@ -31,7 +31,10 @@ workload. Using the Dask-on-Ray scheduler, the entire Dask ecosystem can be exec * - Ray Version - Dask Version - * - ``2.8.0`` or above + * - ``2.34.0`` or above + - | ``2022.10.1 (Python version < 3.12)`` + | ``2024.6.0 (Python version >= 3.12)`` + * - ``2.8.0`` to ``2.33.x`` - ``2022.10.1`` * - ``2.5.0`` to ``2.7.x`` - | ``2022.2.0 (Python version < 3.8)`` diff --git a/doc/source/ray-observability/user-guides/configure-logging.md b/doc/source/ray-observability/user-guides/configure-logging.md index 3be1af34cbff..358691ed5584 100644 --- a/doc/source/ray-observability/user-guides/configure-logging.md +++ b/doc/source/ray-observability/user-guides/configure-logging.md @@ -28,7 +28,7 @@ A new Ray session creates a new folder to the temp directory. The latest session Usually, temp directories are cleared up whenever the machines reboot. As a result, log files may get lost whenever your cluster or some of the nodes are stopped or terminated. -If you need to inspect logs after the clusters are stopped or terminated, you need to store and persist the logs. View the instructions for how to process and export logs for {ref}`clusters on VMs ` and {ref}`KubeRay Clusters `. +If you need to inspect logs after the clusters stop or terminate, you need to store and persist the logs. See the instructions for how to process and export logs for {ref}`Log persistence ` and {ref}`KubeRay Clusters `. (logging-directory-structure)= ## Log files in logging directory @@ -131,12 +131,12 @@ ray.get([task.remote() for _ in range(100)]) The output is as follows: ```bash -2023-03-27 15:08:34,195 INFO worker.py:1603 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 +2023-03-27 15:08:34,195 INFO worker.py:1603 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 (task pid=534172) Hello there, I am a task 0.20583517821231412 (task pid=534174) Hello there, I am a task 0.17536720316370757 [repeated 99x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication) ``` -This feature is useful when importing libraries such as `tensorflow` or `numpy`, which may emit many verbose warning messages when you import them. +This feature is useful when importing libraries such as `tensorflow` or `numpy`, which may emit many verbose warning messages when you import them. 
Configure the following environment variables on the driver process **before importing Ray** to customize log deduplication: @@ -247,8 +247,8 @@ ray_tune_logger.addHandler(logging.FileHandler("extra_ray_tune_log.log")) Implement structured logging to enable downstream users and applications to consume the logs efficiently. ### Application logs -A Ray applications include both driver and worker processes. For Python applications, use Python loggers to format and structure your logs. -As a result, Python loggers need to be set up for both driver and worker processes. +A Ray app includes both driver and worker processes. For Python apps, use Python loggers to format and structure your logs. +As a result, you need to set up Python loggers for both driver and worker processes. ::::{tab-set} @@ -472,4 +472,4 @@ The max size of a log file, including its backup, is `RAY_ROTATION_MAX_BYTES * R ## Log persistence -To process and export logs to external stroage or management systems, view {ref}`log persistence on Kubernetes ` and {ref}`log persistence on VMs ` for more details. +To process and export logs to external storage or management systems, see {ref}`log persistence on Kubernetes ` and {ref}`log persistence on VMs ` for more details. diff --git a/doc/source/ray-overview/installation.rst b/doc/source/ray-overview/installation.rst index 74fde96e48e6..97ff6a53e85a 100644 --- a/doc/source/ray-overview/installation.rst +++ b/doc/source/ray-overview/installation.rst @@ -441,8 +441,8 @@ Install Ray Java with Maven --------------------------- .. note:: - - All Ray Java APIs are experimental and only supported by the community. + + All Ray Java APIs are experimental and only supported by the community. Before installing Ray Java with Maven, you should install Ray Python with `pip install -U ray` . Note that the versions of Ray Java and Ray Python must match. Note that nightly Ray python wheels are also required if you want to install Ray Java snapshot version. @@ -506,7 +506,7 @@ Install Ray C++ .. note:: - All Ray C++ APIs are experimental and only supported by the community. + All Ray C++ APIs are experimental and only supported by the community. You can install and use Ray C++ API as follows. diff --git a/doc/source/serve/getting_started.md b/doc/source/serve/getting_started.md index ff2620cc8052..0bbe4084f3e5 100644 --- a/doc/source/serve/getting_started.md +++ b/doc/source/serve/getting_started.md @@ -101,6 +101,7 @@ parameters in the `@serve.deployment` decorator. The example configures a few co * `ray_actor_options`: a dictionary containing configuration options for each replica. * `num_cpus`: a float representing the logical number of CPUs each replica should reserve. You can make this a fraction to pack multiple replicas together on a machine with fewer CPUs than replicas. * `num_gpus`: a float representing the logical number of GPUs each replica should reserve. You can make this a fraction to pack multiple replicas together on a machine with fewer GPUs than replicas. + * `resources`: a dictionary containing other resource requirements for each replica, such as non-GPU accelerators like HPUs or TPUs (see the sketch after this list).
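For illustration, a minimal sketch (not part of this diff) combining the options documented above; the `"HPU"` resource key is a placeholder for whatever custom resource a cluster actually advertises:

```python
from ray import serve


# Hypothetical deployment combining the documented options.
@serve.deployment(
    ray_actor_options={
        "num_cpus": 0.2,          # a fraction of a logical CPU per replica
        "num_gpus": 0,            # reserve no GPUs
        "resources": {"HPU": 1},  # placeholder custom accelerator resource
    }
)
class Translator:
    def __call__(self, text: str) -> str:
        return text.lower()
```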
All these parameters are optional, so feel free to omit them: diff --git a/doc/source/serve/production-guide/kubernetes.md b/doc/source/serve/production-guide/kubernetes.md index f8b55a1f116f..5a4914699772 100644 --- a/doc/source/serve/production-guide/kubernetes.md +++ b/doc/source/serve/production-guide/kubernetes.md @@ -72,7 +72,7 @@ rayservice-sample 7s $ kubectl get pods NAME READY STATUS RESTARTS AGE -ervice-sample-raycluster-454c4-worker-small-group-b6mmg 1/1 Running 0 XXs +service-sample-raycluster-454c4-worker-small-group-b6mmg 1/1 Running 0 XXs kuberay-operator-7fbdbf8c89-4lrnr 1/1 Running 0 XXs rayservice-sample-raycluster-454c4-head-krk9d 1/1 Running 0 XXs @@ -238,7 +238,7 @@ Monitor your Serve application using the Ray Dashboard. - Learn more about how to configure and manage Dashboard [here](observability-configure-manage-dashboard). - Learn about the Ray Serve Dashboard [here](serve-monitoring). - Learn how to set up [Prometheus](prometheus-setup) and [Grafana](grafana) for Dashboard. -- Learn about the [Ray Serve logs](serve-logging) and how to [persistent logs](kuberay-logging) on Kubernetes. +- Learn about the [Ray Serve logs](serve-logging) and how to [persistent logs](persist-kuberay-custom-resource-logs) on Kubernetes. :::{note} - To troubleshoot application deployment failures in Serve, you can check the KubeRay operator logs by running `kubectl logs -f ` (e.g., `kubectl logs -f kuberay-operator-7447d85d58-lv7pf`). The KubeRay operator logs contain information about the Serve application deployment event and Serve application health checks. diff --git a/doc/source/serve/resource-allocation.md b/doc/source/serve/resource-allocation.md index 57f580f2c370..18df5a8181a4 100644 --- a/doc/source/serve/resource-allocation.md +++ b/doc/source/serve/resource-allocation.md @@ -6,14 +6,14 @@ This guide helps you configure Ray Serve to: - Scale your deployments horizontally by specifying a number of replicas - Scale up and down automatically to react to changing traffic -- Allocate hardware resources (CPUs, GPUs, etc) for each deployment +- Allocate hardware resources (CPUs, GPUs, other accelerators, etc) for each deployment (serve-cpus-gpus)= -## Resource management (CPUs, GPUs) +## Resource management (CPUs, GPUs, accelerators) -You may want to specify a deployment's resource requirements to reserve cluster resources like GPUs. To assign hardware resources per replica, you can pass resource requirements to +You may want to specify a deployment's resource requirements to reserve cluster resources like GPUs or other accelerators. To assign hardware resources per replica, you can pass resource requirements to `ray_actor_options`. By default, each replica reserves one CPU. To learn about options to pass in, take a look at the [Resources with Actors guide](actor-resource-guide). 
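As a sketch of the default noted above (one CPU per replica), the reservation can be overridden through the same `ray_actor_options` mechanism; this example mirrors the placeholder style of the GPU and HPU snippets in this hunk and is not part of the diff:

```python
from ray import serve


# Each replica of this deployment reserves two logical CPUs
# instead of the default one.
@serve.deployment(ray_actor_options={"num_cpus": 2})
def func(*args):
    return do_something_with_my_cpus()
```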
@@ -27,6 +27,14 @@ def func(*args): return do_something_with_my_gpu() ``` +Or if you want to create a deployment where each replica uses another type of accelerator such as an HPU, follow the example below: + +```python +@serve.deployment(ray_actor_options={"resources": {"HPU": 1}}) +def func(*args): + return do_something_with_my_hpu() +``` + (serve-fractional-resources-guide)= ### Fractional CPUs and fractional GPUs diff --git a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml index 4d553edf1500..d82552cb5c15 100644 --- a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml +++ b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v1.yaml @@ -4,7 +4,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -14,7 +14,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml index 975c81fe5f3a..985939a018d5 100644 --- a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml +++ b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_70b_v2.yaml @@ -4,7 +4,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -20,7 +20,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml index a8fee017a51d..0231e4aa53bc 100644 --- a/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml +++ b/doc/source/templates/04_finetuning_llms_with_deepspeed/compute_configs/aws_7b_or_13b.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 16 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py b/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py index 6c7e1e52aa48..bba46f34208b 100644 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py @@ -88,7 +88,9 @@ def transform_image( transform_image, fn_kwargs={"output_column_name": "instance_image"} ) .drop_columns(["image"]) - .add_column("instance_prompt_ids", lambda df: [instance_prompt_ids] * len(df)) + .add_column( + "instance_prompt_ids", lambda df: pd.Series([instance_prompt_ids] * len(df)) + ) ) # END: Apply preprocessing steps as Ray Dataset operations @@ -97,7 +99,9 @@ def transform_image( transform_image, fn_kwargs={"output_column_name": "class_image"} ) .drop_columns(["image"]) - .add_column("class_prompt_ids", lambda df: [class_prompt_ids] * len(df)) + .add_column( + "class_prompt_ids", lambda df: pd.Series([class_prompt_ids] * len(df)) + ) ) # --- Ray Data diff --git 
a/doc/source/templates/README.md b/doc/source/templates/README.md index 912d3174c75f..306b31bc3dc8 100644 --- a/doc/source/templates/README.md +++ b/doc/source/templates/README.md @@ -32,7 +32,7 @@ To add a template: Your template does not need to be a Jupyter notebook. It can also be presented as a Python script with `README` instructions of how to run. -2. Add a release test for the template in `release/release_tests.yaml` (for both AWS and GCE). +2. Add a release test for the template in `release/release_tests.yaml` (for both AWS and GCE). For Data tests, use `release/release_data_tests.yaml` instead. See the section on workspace templates for an example. Note that the cluster env and compute config are a little different for release tests. Use the files in the diff --git a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml index 97441ceca4f7..57fa332f53c7 100644 --- a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml +++ b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v1.yaml @@ -5,7 +5,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -15,7 +15,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml index 9ff1c7d09aae..d176e25d9051 100644 --- a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml +++ b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_70b_v2.yaml @@ -5,7 +5,7 @@ head_node_type: name: head_node_type instance_type: g5.48xlarge resources: - custom_resources: + custom_resources: large_cpu_mem: 1 worker_node_types: @@ -21,7 +21,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml index d8923e7ccad0..af1d8e1fa02c 100644 --- a/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml +++ b/doc/source/templates/testing/compute_configs/04_finetuning_llms_with_deepspeed/aws_7b.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 16 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/cpu/aws.yaml b/doc/source/templates/testing/compute_configs/cpu/aws.yaml index 28b9115d2755..251368c99d42 100644 --- a/doc/source/templates/testing/compute_configs/cpu/aws.yaml +++ b/doc/source/templates/testing/compute_configs/cpu/aws.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 7 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/templates/testing/compute_configs/gpu/aws.yaml b/doc/source/templates/testing/compute_configs/gpu/aws.yaml index 240dbea0e19e..d27020b4af2e 100644 --- 
a/doc/source/templates/testing/compute_configs/gpu/aws.yaml +++ b/doc/source/templates/testing/compute_configs/gpu/aws.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/doc/source/train/examples.yml b/doc/source/train/examples.yml index 8b4f1c7cf8f2..0e7f6725e100 100644 --- a/doc/source/train/examples.yml +++ b/doc/source/train/examples.yml @@ -119,7 +119,17 @@ examples: contributor: community link: examples/intel_gaudi/llama_pretrain - - title: Fine-tune a Llama-2 text generation models with DeepSpeed and Hugging Face Accelerate + - title: Fine-tune Llama3.1 with AWS Trainium + frameworks: + - pytorch + - aws neuron + skill_level: advanced + use_cases: + - natural language processing + - large language models + contributor: community + link: examples/aws-trainium/llama3 + - title: Fine-tune a Llama-2 text generation model with DeepSpeed and Hugging Face Accelerate frameworks: - accelerate - deepspeed diff --git a/doc/source/train/examples/aws-trainium/llama3.rst b/doc/source/train/examples/aws-trainium/llama3.rst new file mode 100644 index 000000000000..ee7b89faf39e --- /dev/null +++ b/doc/source/train/examples/aws-trainium/llama3.rst @@ -0,0 +1,103 @@ +:orphan: + +Distributed fine-tuning of Llama 3.1 8B on AWS Trainium with Ray and PyTorch Lightning +====================================================================================== + + +This example demonstrates how to fine-tune the `Llama 3.1 8B `__ model on `AWS +Trainium `__ instances using Ray Train, PyTorch Lightning, and AWS Neuron SDK. + +AWS Trainium is the machine learning (ML) chip that AWS built for deep +learning (DL) training of 100B+ parameter models. `AWS Neuron +SDK `__ helps +developers train models on Trainium accelerators. + +Prepare the environment +----------------------- + +See `Setup EKS cluster and tools `__ for setting up an Amazon EKS cluster leveraging AWS Trainium instances. + +Create a Docker image +--------------------- +When the EKS cluster is ready, create an Amazon ECR repository for building and uploading the Docker image containing artifacts for fine-tuning a Llama3.1 8B model: + +1. Clone the repo. + +:: + + git clone https://github.com/aws-neuron/aws-neuron-eks-samples.git + +2. Go to the ``llama3.1_8B_finetune_ray_ptl_neuron`` directory. + +:: + + cd aws-neuron-eks-samples/llama3.1_8B_finetune_ray_ptl_neuron + +3. Trigger the script. + +:: + + chmod +x 0-kuberay-trn1-llama3-finetune-build-image.sh + ./0-kuberay-trn1-llama3-finetune-build-image.sh + +4. Enter the zone your cluster is running in, for example: us-east-2. + +5. Verify in the AWS console that the Amazon ECR service has the newly + created ``kuberay_trn1_llama3.1_pytorch2`` repository. + +6. Update the ECR image ARN in the manifest file used for creating the Ray cluster. + +Replace the ``<AWS_ACCOUNT_ID>`` and ``<REGION>`` placeholders with actual values in the ``1-llama3-finetune-trn1-create-raycluster.yaml`` file using commands below to reflect the ECR image ARN created above: + +:: + + export AWS_ACCOUNT_ID= # for ex: 111222333444 + export REGION= # for ex: us-east-2 + sed -i "s/<AWS_ACCOUNT_ID>/$AWS_ACCOUNT_ID/g" 1-llama3-finetune-trn1-create-raycluster.yaml + sed -i "s/<REGION>/$REGION/g" 1-llama3-finetune-trn1-create-raycluster.yaml + +Configuring Ray Cluster +----------------------- + +The ``llama3.1_8B_finetune_ray_ptl_neuron`` directory in the AWS Neuron samples repository simplifies the +Ray configuration.
KubeRay provides a manifest that you can apply +to the cluster to set up the head and worker pods. + +Run the following command to set up the Ray cluster: + +:: + + kubectl apply -f 1-llama3-finetune-trn1-create-raycluster.yaml + + +Accessing Ray Dashboard +----------------------- +Port forward from the cluster to see the state of the Ray dashboard and +then view it on `http://localhost:8265 <http://localhost:8265>`__. +Run it in the background with the following command: + +:: + + kubectl port-forward service/kuberay-trn1-head-svc 8265:8265 & + +Launching Ray Jobs +------------------ + +The Ray cluster is now ready to handle workloads. Initiate the data preparation and fine-tuning Ray jobs: + +1. Launch the Ray job for downloading the dolly-15k dataset and the Llama3.1 8B model artifacts: + +:: + + kubectl apply -f 2-llama3-finetune-trn1-rayjob-create-data.yaml + +2. When the job has executed successfully, run the following fine-tuning job: + +:: + + kubectl apply -f 3-llama3-finetune-trn1-rayjob-submit-finetuning-job.yaml + +3. Monitor the jobs via the Ray Dashboard. + + +For detailed information on each of the steps above, see the `AWS documentation link `__. \ No newline at end of file diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index eec1e564d8c0..ee7d7872dfb0 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -1,3 +1,5 @@ +# syntax=docker/dockerfile:1.3-labs + # The base-deps Docker image installs main libraries needed to run Ray # The GPU options are NVIDIA CUDA developer images. @@ -13,82 +15,115 @@ ENV LANG=C.UTF-8 # TODO(ilr) $HOME seems to point to result in "" instead of "/home/ray" ENV PATH "/home/ray/anaconda3/bin:$PATH" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTHON_VERSION=3.8.16 +ARG PYTHON_VERSION=3.9 ARG HOSTTYPE=${HOSTTYPE:-x86_64} ARG RAY_UID=1000 ARG RAY_GID=100 -RUN apt-get update -y \ - && apt-get install -y sudo tzdata \ - && useradd -ms /bin/bash -d /home/ray ray --uid $RAY_UID --gid $RAY_GID \ - && usermod -aG sudo ray \ - && echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean +RUN <<EOF +#!/bin/bash + +apt-get update -y +apt-get install -y sudo tzdata +rm -rf /var/lib/apt/lists/* +apt-get clean + +useradd -ms /bin/bash -d /home/ray ray --uid $RAY_UID --gid $RAY_GID +usermod -aG sudo ray +echo 'ray ALL=NOPASSWD: ALL' >> /etc/sudoers + +EOF USER $RAY_UID ENV HOME=/home/ray +COPY python/requirements_compiled.txt /home/ray/requirements_compiled.txt + SHELL ["/bin/bash", "-c"] -RUN sudo apt-get update -y && sudo apt-get upgrade -y \ - && sudo apt-get install -y \ - git \ - libjemalloc-dev \ - wget \ - cmake \ - g++ \ - zlib1g-dev \ - $(if [ "$AUTOSCALER" = "autoscaler" ]; then echo \ - tmux \ - screen \ - rsync \ - netbase \ - openssh-client \ - gnupg; fi) \ - && wget --quiet \ - "https://repo.anaconda.com/miniconda/Miniconda3-py311_24.4.0-0-Linux-${HOSTTYPE}.sh" \ - -O /tmp/miniconda.sh \ - && /bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 \ - && $HOME/anaconda3/bin/conda init \ - && echo 'export PATH=$HOME/anaconda3/bin:$PATH' >> /home/ray/.bashrc \ - && rm /tmp/miniconda.sh \ - && $HOME/anaconda3/bin/conda install -y libgcc-ng python=$PYTHON_VERSION \ - && $HOME/anaconda3/bin/conda install -y -c conda-forge libffi=3.4.2 \ - && $HOME/anaconda3/bin/conda clean -y --all \ - && $HOME/anaconda3/bin/pip install --no-cache-dir \ - flatbuffers \ - cython==0.29.37 \ - # Necessary for Dataset to work properly. - numpy\>=1.20 \ - psutil \ - # Required a recent version of setuptools to be compatible with python 3.12+.
- setuptools==71.1.0 \ - # To avoid the following error on Jenkins: - # AttributeError: 'numpy.ufunc' object has no attribute '__module__' - && $HOME/anaconda3/bin/pip uninstall -y dask \ - # We install cmake temporarily to get psutil - && sudo apt-get autoremove -y cmake zlib1g-dev \ - # We keep g++ on GPU images, because uninstalling removes CUDA Devel tooling - $(if [[ "$BASE_IMAGE" == "ubuntu:22.04" && "$HOSTTYPE" == "x86_64" ]]; then echo \ - g++; fi) \ - && sudo rm -rf /var/lib/apt/lists/* \ - && sudo apt-get clean \ - && (if [ "$AUTOSCALER" = "autoscaler" ]; \ then $HOME/anaconda3/bin/pip --no-cache-dir install \ "redis>=3.5.0,<4.0.0" \ "six==1.13.0" \ "boto3==1.26.76" \ "pyOpenSSL==22.1.0" \ "cryptography==38.0.1" \ "google-api-python-client==1.7.8" \ "google-oauth" \ "azure-cli-core==2.40.0" \ "azure-identity==1.10.0" \ "azure-mgmt-compute==23.1.0" \ "azure-mgmt-network==19.0.0" \ "azure-mgmt-resource==20.0.0" \ "msrestazure==0.6.4"; \ fi;) + +RUN <<EOF +#!/bin/bash + +APT_PKGS=( + git + libjemalloc-dev + wget + cmake + g++ + zlib1g-dev +) +if [[ "$AUTOSCALER" == "autoscaler" ]]; then + APT_PKGS+=( + tmux + screen + rsync + netbase + openssh-client + gnupg + ) +fi + +sudo apt-get update -y +sudo apt-get upgrade -y +sudo apt-get install -y "${APT_PKGS[@]}" + +wget --quiet "https://repo.anaconda.com/miniconda/Miniconda3-py311_24.4.0-0-Linux-${HOSTTYPE}.sh" -O /tmp/miniconda.sh +/bin/bash /tmp/miniconda.sh -b -u -p $HOME/anaconda3 +$HOME/anaconda3/bin/conda init +echo 'export PATH=$HOME/anaconda3/bin:$PATH' >> /home/ray/.bashrc +rm /tmp/miniconda.sh +$HOME/anaconda3/bin/conda install -y libgcc-ng python=$PYTHON_VERSION +$HOME/anaconda3/bin/conda install -y -c conda-forge libffi=3.4.2 +$HOME/anaconda3/bin/conda clean -y --all + +PIP_PKGS=( + # Required a recent version of setuptools to be compatible with python 3.12+. + setuptools==71.1.0 + + flatbuffers + cython + numpy # Necessary for Dataset to work properly. + psutil +) +if [[ "$AUTOSCALER" == "autoscaler" ]]; then + PIP_PKGS+=( + redis + six + boto3 + pyopenssl + cryptography + google-api-python-client + google-oauth + ) +fi + +$HOME/anaconda3/bin/pip install --no-cache-dir \ + -c $HOME/requirements_compiled.txt \ + "${PIP_PKGS[@]}" + +# To avoid the following error on Jenkins: +# AttributeError: 'numpy.ufunc' object has no attribute '__module__' +$HOME/anaconda3/bin/pip uninstall -y dask + +# We install cmake temporarily to get psutil +sudo apt-get autoremove -y cmake zlib1g-dev + +# We keep g++ on GPU images, because uninstalling removes CUDA Devel tooling +if [[ "$BASE_IMAGE" == "ubuntu:22.04" && "$HOSTTYPE" == "x86_64" ]]; then + sudo apt-get autoremove -y g++ +fi + +sudo rm -rf /var/lib/apt/lists/* +sudo apt-get clean + +EOF WORKDIR $HOME diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 67ee790389a6..42743924a118 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -1,19 +1,13 @@ +# syntax=docker/dockerfile:1.3-labs + ARG BASE_IMAGE ARG FULL_BASE_IMAGE=rayproject/ray:nightly"$BASE_IMAGE" FROM "$FULL_BASE_IMAGE" -# The python/* paths only exist in civ2, so we put them as non-first arguments. Docker -# will ignore non-existent paths if they are non-first arguments. -# -# TODO(can): simplify this once civ1 is completely deprecated.
-COPY *requirements.txt \ - python/*requirements.txt \ +COPY python/*requirements.txt \ python/requirements/ml/*requirements.txt \ python/requirements/docker/*requirements.txt ./ -COPY *requirements_compiled.txt \ - python/*requirements_compiled.txt ./ -COPY *install-ml-docker-requirements.sh \ - docker/ray-ml/*install-ml-docker-requirements.sh ./ +COPY docker/ray-ml/install-ml-docker-requirements.sh ./ RUN sudo chmod +x install-ml-docker-requirements.sh \ && ./install-ml-docker-requirements.sh diff --git a/docker/ray-ml/install-ml-docker-requirements.sh b/docker/ray-ml/install-ml-docker-requirements.sh index d6744a13d2aa..0763b4d9589e 100755 --- a/docker/ray-ml/install-ml-docker-requirements.sh +++ b/docker/ray-ml/install-ml-docker-requirements.sh @@ -52,7 +52,8 @@ pip --no-cache-dir install \ sudo apt-get clean -sudo rm ./*requirements*.txt +# requirements_compiled.txt will be kept. +sudo rm ./*requirements.txt requirements_compiled_gpu.txt # MuJoCo Installation. export MUJOCO_GL=osmesa diff --git a/docker/ray/Dockerfile b/docker/ray/Dockerfile index 888183223609..9e54302603c9 100644 --- a/docker/ray/Dockerfile +++ b/docker/ray/Dockerfile @@ -1,3 +1,5 @@ +# syntax=docker/dockerfile:1.3-labs + ARG BASE_IMAGE ARG FULL_BASE_IMAGE=rayproject/ray-deps:nightly"$BASE_IMAGE" FROM $FULL_BASE_IMAGE @@ -6,7 +8,6 @@ ARG WHEEL_PATH ARG FIND_LINKS_PATH=".whl" ARG CONSTRAINTS_FILE="requirements_compiled.txt" -COPY requirements_compiled.txt ./ COPY $WHEEL_PATH . COPY $FIND_LINKS_PATH $FIND_LINKS_PATH diff --git a/python/ray/_private/ray_logging/constants.py b/python/ray/_private/ray_logging/constants.py index de84d510c16c..54552bdfe1d7 100644 --- a/python/ray/_private/ray_logging/constants.py +++ b/python/ray/_private/ray_logging/constants.py @@ -41,6 +41,9 @@ class LogKey(str, Enum): NODE_ID = "node_id" ACTOR_ID = "actor_id" TASK_ID = "task_id" + ACTOR_NAME = "actor_name" + TASK_NAME = "task_name" + TASK_FUNCTION_NAME = "task_func_name" # Logger built-in context ASCTIME = "asctime" diff --git a/python/ray/_private/ray_logging/filters.py b/python/ray/_private/ray_logging/filters.py index e7003022040a..91233a2b11c6 100644 --- a/python/ray/_private/ray_logging/filters.py +++ b/python/ray/_private/ray_logging/filters.py @@ -20,4 +20,13 @@ def filter(self, record): task_id = runtime_context.get_task_id() if task_id is not None: setattr(record, LogKey.TASK_ID.value, task_id) + task_name = runtime_context.get_task_name() + if task_name is not None: + setattr(record, LogKey.TASK_NAME.value, task_name) + task_function_name = runtime_context.get_task_function_name() + if task_function_name is not None: + setattr(record, LogKey.TASK_FUNCTION_NAME.value, task_function_name) + actor_name = runtime_context.get_actor_name() + if actor_name is not None: + setattr(record, LogKey.ACTOR_NAME.value, actor_name) return True diff --git a/python/ray/_private/ray_option_utils.py b/python/ray/_private/ray_option_utils.py index 91345e536446..61c898aff8c4 100644 --- a/python/ray/_private/ray_option_utils.py +++ b/python/ray/_private/ray_option_utils.py @@ -147,6 +147,7 @@ def _validate_resources(resources: Optional[Dict[str, float]]) -> Optional[str]: ), "_metadata": Option((dict, type(None))), "enable_task_events": Option(bool, default_value=True), + "_labels": Option((dict, type(None))), } diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 0de0ba78405f..23461e950286 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -834,8 +834,8 @@ def start_ray_process( 
use_valgrind_profiler: bool = False, use_perftools_profiler: bool = False, use_tmux: bool = False, - stdout_file: Optional[str] = None, - stderr_file: Optional[str] = None, + stdout_file: Optional[IO[AnyStr]] = None, + stderr_file: Optional[IO[AnyStr]] = None, pipe_stdin: bool = False, ): """Start one of the Ray processes. @@ -1443,8 +1443,8 @@ def start_gcs_server( redis_address: str, log_dir: str, session_name: str, - stdout_file: Optional[str] = None, - stderr_file: Optional[str] = None, + stdout_file: Optional[IO[AnyStr]] = None, + stderr_file: Optional[IO[AnyStr]] = None, redis_password: Optional[str] = None, config: Optional[dict] = None, fate_share: Optional[bool] = None, diff --git a/python/ray/_private/state.py b/python/ray/_private/state.py index bebca03c0044..b8af96491b08 100644 --- a/python/ray/_private/state.py +++ b/python/ray/_private/state.py @@ -525,7 +525,7 @@ def chrome_tracing_object_transfer_dump(self, filename=None): """Return a list of transfer events that can viewed as a timeline. To view this information as a timeline, simply dump it as a json file - by passing in "filename" or using using json.dump, and then load go to + by passing in "filename" or using json.dump, and then go to chrome://tracing in the Chrome web browser and load the dumped file. Make sure to enable "Flow events" in the "View Options" menu. @@ -748,7 +748,7 @@ def _live_node_ids(self): return set(self.total_resources_per_node().keys()) def available_resources_per_node(self): - """Returns a dictionary mapping node id to avaiable resources.""" + """Returns a dictionary mapping node id to available resources.""" self._check_connected() available_resources_by_id = {} @@ -952,7 +952,7 @@ def timeline(filename=None): variable prior to starting Ray, and set RAY_task_events_report_interval_ms=0 To view this information as a timeline, simply dump it as a json file by - passing in "filename" or using using json.dump, and then load go to + passing in "filename" or using json.dump, and then go to chrome://tracing in the Chrome web browser and load the dumped file. Args: @@ -970,7 +970,7 @@ def object_transfer_timeline(filename=None): """Return a list of transfer events that can viewed as a timeline. To view this information as a timeline, simply dump it as a json file by - passing in "filename" or using using json.dump, and then load go to + passing in "filename" or using json.dump, and then go to chrome://tracing in the Chrome web browser and load the dumped file. Make sure to enable "Flow events" in the "View Options" menu. diff --git a/python/ray/_private/test_utils.py b/python/ray/_private/test_utils.py index 1eb26e0fad25..7bf0de943269 100644 --- a/python/ray/_private/test_utils.py +++ b/python/ray/_private/test_utils.py @@ -98,6 +98,12 @@ def redis_replicas(): return int(os.environ.get("TEST_EXTERNAL_REDIS_REPLICAS", "1")) +def redis_sentinel_replicas(): + import os + + return int(os.environ.get("TEST_EXTERNAL_REDIS_SENTINEL_REPLICAS", "2")) + + def get_redis_cli(port, enable_tls): try: # If there is no redis libs installed, skip the check.
@@ -122,6 +128,63 @@ def get_redis_cli(port, enable_tls): return redis.Redis("localhost", str(port), **params) +def start_redis_sentinel_instance( + session_dir_path: str, + port: int, + redis_master_port: int, + password: Optional[str] = None, + enable_tls: bool = False, + db_dir=None, + free_port=0, +): + config_file = os.path.join( + session_dir_path, "redis-sentinel-" + uuid.uuid4().hex + ".conf" + ) + config_lines = [] + # Port for this Sentinel instance + if enable_tls: + config_lines.append(f"port {free_port}") + else: + config_lines.append(f"port {port}") + + # Monitor the Redis master + config_lines.append(f"sentinel monitor redis-test 127.0.0.1 {redis_master_port} 1") + config_lines.append( + "sentinel down-after-milliseconds redis-test 1000" + ) # failover after 1 second + config_lines.append("sentinel failover-timeout redis-test 5000") # + config_lines.append("sentinel parallel-syncs redis-test 1") + + if password: + config_lines.append(f"sentinel auth-pass redis-test {password}") + + if enable_tls: + config_lines.append(f"tls-port {port}") + if Config.REDIS_CA_CERT(): + config_lines.append(f"tls-ca-cert-file {Config.REDIS_CA_CERT()}") + # Check and add TLS client certificate file + if Config.REDIS_CLIENT_CERT(): + config_lines.append(f"tls-cert-file {Config.REDIS_CLIENT_CERT()}") + # Check and add TLS client key file + if Config.REDIS_CLIENT_KEY(): + config_lines.append(f"tls-key-file {Config.REDIS_CLIENT_KEY()}") + config_lines.append("tls-auth-clients no") + config_lines.append("sentinel tls-auth-clients redis-test no") + if db_dir: + config_lines.append(f"dir {db_dir}") + + with open(config_file, "w") as f: + f.write("\n".join(config_lines)) + + command = [REDIS_EXECUTABLE, config_file, "--sentinel"] + process_info = ray._private.services.start_ray_process( + command, + ray_constants.PROCESS_TYPE_REDIS_SERVER, + fate_share=False, + ) + return process_info + + def start_redis_instance( session_dir_path: str, port: int, diff --git a/python/ray/_private/usage/usage_lib.py b/python/ray/_private/usage/usage_lib.py index e980703ed3eb..558f56c602ef 100644 --- a/python/ray/_private/usage/usage_lib.py +++ b/python/ray/_private/usage/usage_lib.py @@ -634,8 +634,8 @@ def _get_cluster_status_to_report_v2(gcs_client) -> ClusterStatusToReport: try: cluster_status = get_cluster_status(gcs_client.address) total_resources = cluster_status.total_resources() - result.total_num_cpus = total_resources.get("CPU", 0) - result.total_num_gpus = total_resources.get("GPU", 0) + result.total_num_cpus = int(total_resources.get("CPU", 0)) + result.total_num_gpus = int(total_resources.get("GPU", 0)) to_GiB = 1 / 2**30 result.total_memory_gb = total_resources.get("memory", 0) * to_GiB diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py index 0eb5bf09c997..bd23131bebdf 100644 --- a/python/ray/_private/utils.py +++ b/python/ray/_private/utils.py @@ -1608,7 +1608,8 @@ def get_runtime_env_info( In the user interface, the argument `runtime_env` contains some fields which not contained in `ProtoRuntimeEnv` but in `ProtoRuntimeEnvInfo`, such as `eager_install`. This function will extract those fields from - `RuntimeEnv` and create a new `ProtoRuntimeEnvInfo`, and serialize it. + `RuntimeEnv` and create a new `ProtoRuntimeEnvInfo`, and serialize it + into json format. 
""" from ray.runtime_env import RuntimeEnvConfig diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 118c556ec966..d2b0cf3b013d 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -531,6 +531,14 @@ def actor_name(self): def current_task_id(self): return self.core_worker.get_current_task_id() + @property + def current_task_name(self): + return self.core_worker.get_current_task_name() + + @property + def current_task_function_name(self): + return self.core_worker.get_current_task_function_name() + @property def current_node_id(self): return self.core_worker.get_current_node_id() @@ -3549,7 +3557,7 @@ def method(self): for more details. _metadata: Extended options for Ray libraries. For example, _metadata={"workflows.io/options": } for Ray workflows. - + _labels: The key-value labels of a task or actor. """ # "callable" returns true for both function and class. if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 3ddf101189dc..f3d93dce33ba 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -261,6 +261,9 @@ cdef optional[ObjectIDIndexType] NULL_PUT_INDEX = nullopt # https://docs.python.org/3/library/contextvars.html#contextvars.ContextVar # It is thread-safe. async_task_id = contextvars.ContextVar('async_task_id', default=None) +async_task_name = contextvars.ContextVar('async_task_name', default=None) +async_task_function_name = contextvars.ContextVar('async_task_function_name', + default=None) class DynamicObjectRefGenerator: @@ -737,11 +740,26 @@ cdef class Language: JAVA = Language.from_native(LANGUAGE_JAVA) +cdef int prepare_labels( + dict label_dict, + unordered_map[c_string, c_string] *label_map) except -1: + + if label_dict is None: + return 0 + + for key, value in label_dict.items(): + if not isinstance(key, str): + raise ValueError(f"Label key must be string, but got {type(key)}") + if not isinstance(value, str): + raise ValueError(f"Label value must be string, but got {type(value)}") + label_map[0][key.encode("utf-8")] = value.encode("utf-8") + + return 0 + cdef int prepare_resources( dict resource_dict, unordered_map[c_string, double] *resource_map) except -1: cdef: - unordered_map[c_string, double] out c_string resource_name list unit_resources @@ -1800,7 +1818,8 @@ cdef void execute_task( return core_worker.run_async_func_or_coro_in_event_loop( async_function, function_descriptor, name_of_concurrency_group_to_execute, task_id=task_id, - func_args=(actor, *arguments), func_kwargs=kwarguments) + task_name=task_name, func_args=(actor, *arguments), + func_kwargs=kwarguments) return function(actor, *arguments, **kwarguments) @@ -1912,7 +1931,8 @@ cdef void execute_task( execute_streaming_generator_async(context), function_descriptor, name_of_concurrency_group_to_execute, - task_id=task_id) + task_id=task_id, + task_name=task_name) else: execute_streaming_generator_sync(context) @@ -3400,6 +3420,48 @@ cdef class CoreWorker: with nogil: CCoreWorkerProcess.GetCoreWorker().Exit(c_exit_type, detail, null_ptr) + def get_current_task_name(self) -> str: + """Return the current task name. + + If it is a normal task, it returns the task name from the main thread. + If it is a threaded actor, it returns the task name for the current thread. + If it is async actor, it returns the task name stored in contextVar for + the current asyncio task. + """ + # We can only obtain the correct task name within asyncio task + # via async_task_name contextvar. 
We try this first. + # It is needed because the core worker's GetCurrentTask API + # doesn't have asyncio context, thus it cannot return the + # correct task name. + task_name = async_task_name.get() + if task_name is None: + # if it is not within asyncio context, fallback to TaskName + # obtainable from core worker. + task_name = CCoreWorkerProcess.GetCoreWorker().GetCurrentTaskName() \ + .decode("utf-8") + return task_name + + def get_current_task_function_name(self) -> str: + """Return the current task function. + + If it is a normal task, it returns the task function from the main thread. + If it is a threaded actor, it returns the task function for the current thread. + If it is async actor, it returns the task function stored in contextVar for + the current asyncio task. + """ + # We can only obtain the correct task function within asyncio task + # via async_task_function_name contextvar. We try this first. + # It is needed because the core Worker's GetCurrentTask API + # doesn't have asyncio context, thus it cannot return the + # correct task function. + task_function_name = async_task_function_name.get() + if task_function_name is None: + # if it is not within asyncio context, fallback to TaskName + # obtainable from core worker. + task_function_name = CCoreWorkerProcess.GetCoreWorker() \ + .GetCurrentTaskFunctionName().decode("utf-8") + return task_function_name + def get_current_task_id(self) -> TaskID: """Return the current task ID. @@ -4009,10 +4071,12 @@ cdef class CoreWorker: c_string debugger_breakpoint, c_string serialized_runtime_env_info, int64_t generator_backpressure_num_objects, - c_bool enable_task_events + c_bool enable_task_events, + labels, ): cdef: unordered_map[c_string, double] c_resources + unordered_map[c_string, c_string] c_labels CRayFunction ray_function CTaskOptions task_options c_vector[unique_ptr[CTaskArg]] args_vector @@ -4032,6 +4096,7 @@ cdef class CoreWorker: with self.profile_event(b"submit_task"): prepare_resources(resources, &c_resources) + prepare_labels(labels, &c_labels) ray_function = CRayFunction( language.lang, function_descriptor.descriptor) prepare_args_and_increment_put_refs( @@ -4043,7 +4108,9 @@ cdef class CoreWorker: b"", generator_backpressure_num_objects, serialized_runtime_env_info, - enable_task_events) + enable_task_events, + c_labels, + ) current_c_task_id = current_task.native() @@ -4089,6 +4156,7 @@ cdef class CoreWorker: int32_t max_pending_calls, scheduling_strategy, c_bool enable_task_events, + labels, ): cdef: CRayFunction ray_function @@ -4101,6 +4169,7 @@ cdef class CoreWorker: CSchedulingStrategy c_scheduling_strategy c_vector[CObjectID] incremented_put_arg_ids optional[c_bool] is_detached_optional = nullopt + unordered_map[c_string, c_string] c_labels self.python_scheduling_strategy_to_c( scheduling_strategy, &c_scheduling_strategy) @@ -4108,6 +4177,7 @@ cdef class CoreWorker: with self.profile_event(b"submit_task"): prepare_resources(resources, &c_resources) prepare_resources(placement_resources, &c_placement_resources) + prepare_labels(labels, &c_labels) ray_function = CRayFunction( language.lang, function_descriptor.descriptor) prepare_args_and_increment_put_refs( @@ -4136,7 +4206,8 @@ cdef class CoreWorker: # async or threaded actors. 
is_asyncio or max_concurrency > 1, max_pending_calls, - enable_task_events), + enable_task_events, + c_labels), extension_data, &c_actor_id) @@ -4247,6 +4318,7 @@ cdef class CoreWorker: TaskID current_task = self.get_current_task_id() c_string serialized_retry_exception_allowlist c_string serialized_runtime_env = b"{}" + unordered_map[c_string, c_string] c_labels serialized_retry_exception_allowlist = serialize_retry_exception_allowlist( retry_exception_allowlist, @@ -4275,7 +4347,8 @@ cdef class CoreWorker: concurrency_group_name, generator_backpressure_num_objects, serialized_runtime_env, - enable_task_events), + enable_task_events, + c_labels), max_retries, retry_exceptions, serialized_retry_exception_allowlist, @@ -4796,6 +4869,7 @@ cdef class CoreWorker: specified_cgname: str, *, task_id: Optional[TaskID] = None, + task_name: Optional[str] = None, func_args: Optional[Tuple] = None, func_kwargs: Optional[Dict] = None, ): @@ -4842,6 +4916,9 @@ cdef class CoreWorker: try: if task_id: async_task_id.set(task_id) + if task_name is not None: + async_task_name.set(task_name) + async_task_function_name.set(function_descriptor.repr) if inspect.isawaitable(func_or_coro): coroutine = func_or_coro diff --git a/python/ray/actor.py b/python/ray/actor.py index 222f52c24b5f..824de9efad73 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -924,6 +924,7 @@ def _remote(self, args=None, kwargs=None, **actor_options): scheduling_strategy: Strategy about how to schedule this actor. enable_task_events: True if tracing is enabled, i.e., task events from the actor should be reported. Defaults to True. + _labels: The key-value labels of the actor. Returns: A handle to the newly created actor. @@ -1197,6 +1198,7 @@ def _remote(self, args=None, kwargs=None, **actor_options): max_pending_calls=max_pending_calls, scheduling_strategy=scheduling_strategy, enable_task_events=enable_task_events, + labels=actor_options.get("_labels"), ) if _actor_launch_hook: diff --git a/python/ray/air/BUILD b/python/ray/air/BUILD index 0799ef871078..58951ee92889 100644 --- a/python/ray/air/BUILD +++ b/python/ray/air/BUILD @@ -46,6 +46,14 @@ py_test( deps = [":ml_lib"] ) +py_test( + name = "test_arrow", + size = "small", + srcs = ["tests/test_arrow.py"], + tags = ["team:ml", "team:data", "ray_data", "exclusive"], + deps = [":ml_lib"] +) + py_test( name = "test_air_usage", size = "small", diff --git a/python/ray/air/data_batch_type.py b/python/ray/air/data_batch_type.py index a6fad4591d35..5d5d09b3218e 100644 --- a/python/ray/air/data_batch_type.py +++ b/python/ray/air/data_batch_type.py @@ -2,6 +2,10 @@ if TYPE_CHECKING: import numpy - import pandas + import pandas # noqa: F401 + import pyarrow -DataBatchType = Union["numpy.ndarray", "pandas.DataFrame", Dict[str, "numpy.ndarray"]] +# TODO de-dup with ray.data.block.DataBatch +DataBatchType = Union[ + "numpy.ndarray", "pyarrow.Table" "pandas.DataFrame", Dict[str, "numpy.ndarray"] +] diff --git a/python/ray/air/tests/test_arrow.py b/python/ray/air/tests/test_arrow.py new file mode 100644 index 000000000000..efe68937836b --- /dev/null +++ b/python/ray/air/tests/test_arrow.py @@ -0,0 +1,71 @@ +from dataclasses import dataclass, field + +import pyarrow as pa +import pytest + +from ray.air.util.tensor_extensions.arrow import ( + ArrowConversionError, + _convert_to_pyarrow_native_array, + _infer_pyarrow_type, + convert_to_pyarrow_array, +) +from ray.air.util.tensor_extensions.utils import create_ragged_ndarray + + +@dataclass +class UserObj: + i: int = field() + + +def 
test_pa_infer_type_failing_to_infer(): + # Represent a single column that will be using `ArrowPythonObjectExtension` type + # to ser/de native Python objects into bytes + column_vals = create_ragged_ndarray( + [ + "hi", + 1, + None, + [[[[]]]], + {"a": [[{"b": 2, "c": UserObj(i=123)}]]}, + UserObj(i=456), + ] + ) + + inferred_dtype = _infer_pyarrow_type(column_vals) + + # Arrow (17.0) seem to fallback to assume the dtype of the first element + assert pa.string().equals(inferred_dtype) + + +def test_convert_to_pyarrow_array_object_ext_type_fallback(): + column_values = create_ragged_ndarray( + [ + "hi", + 1, + None, + [[[[]]]], + {"a": [[{"b": 2, "c": UserObj(i=123)}]]}, + UserObj(i=456), + ] + ) + column_name = "py_object_column" + + # First, assert that straightforward conversion into Arrow native types fails + with pytest.raises(ArrowConversionError) as exc_info: + _convert_to_pyarrow_native_array(column_values, column_name) + + assert ( + str(exc_info.value) + == "Error converting data to Arrow: ['hi' 1 None list([[[[]]]]) {'a': [[{'b': 2, 'c': UserObj(i=123)}]]}\n UserObj(i=456)]" # noqa: E501 + ) + + # Subsequently, assert that fallback to `ArrowObjectExtensionType` succeeds + pa_array = convert_to_pyarrow_array(column_values, column_name) + + assert pa_array.to_pylist() == column_values.tolist() + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/air/tests/test_object_extension.py b/python/ray/air/tests/test_object_extension.py index b1479dbc4ac0..64600bafc69c 100644 --- a/python/ray/air/tests/test_object_extension.py +++ b/python/ray/air/tests/test_object_extension.py @@ -7,13 +7,13 @@ from ray.air.util.object_extensions.arrow import ( ArrowPythonObjectArray, ArrowPythonObjectType, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.air.util.object_extensions.pandas import PythonObjectArray @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." ) def test_object_array_validation(): # Test unknown input type raises TypeError. @@ -25,7 +25,7 @@ def test_object_array_validation(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." ) def test_arrow_scalar_object_array_roundtrip(): arr = np.array( @@ -41,7 +41,7 @@ def test_arrow_scalar_object_array_roundtrip(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." ) def test_arrow_python_object_array_slice(): arr = np.array(["test", 20, "test2", 40, "test3", 60], dtype=object) @@ -51,7 +51,7 @@ def test_arrow_python_object_array_slice(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension not supported." + not _object_extension_type_allowed(), reason="Object extension not supported." 
) def test_arrow_pandas_roundtrip(): obj = types.SimpleNamespace(a=1, b="test") diff --git a/python/ray/air/util/data_batch_conversion.py b/python/ray/air/util/data_batch_conversion.py index 4fe7a8ab2ea9..1bf69b4b9398 100644 --- a/python/ray/air/util/data_batch_conversion.py +++ b/python/ray/air/util/data_batch_conversion.py @@ -6,9 +6,6 @@ from ray.air.constants import TENSOR_COLUMN_NAME from ray.air.data_batch_type import DataBatchType -from ray.air.util.tensor_extensions.arrow import ( - get_arrow_extension_fixed_shape_tensor_types, -) from ray.util.annotations import Deprecated, DeveloperAPI if TYPE_CHECKING: @@ -220,37 +217,31 @@ def _convert_batch_type_to_numpy( ) return data elif pyarrow is not None and isinstance(data, pyarrow.Table): - from ray.air.util.transform_pyarrow import ( - _concatenate_extension_column, - _is_column_extension_type, + from ray.air.util.tensor_extensions.arrow import ( + get_arrow_extension_fixed_shape_tensor_types, ) + from ray.data._internal.arrow_ops import transform_pyarrow - if data.column_names == [TENSOR_COLUMN_NAME] and ( - isinstance( - data.schema.types[0], get_arrow_extension_fixed_shape_tensor_types() + column_values_ndarrays = [] + + for col in data.columns: + # Combine columnar values arrays to make these contiguous + # (making them compatible with numpy format) + combined_array = transform_pyarrow.combine_chunked_array(col) + + column_values_ndarrays.append( + transform_pyarrow.to_numpy(combined_array, zero_copy_only=False) ) + + arrow_fixed_shape_tensor_types = get_arrow_extension_fixed_shape_tensor_types() + + # NOTE: This branch is here for backwards-compatibility + if data.column_names == [TENSOR_COLUMN_NAME] and ( + isinstance(data.schema.types[0], arrow_fixed_shape_tensor_types) ): - # If representing a tensor dataset, return as a single numpy array. 
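The rewritten branch above funnels every column through the same combine-then-convert path. A self-contained sketch of the idea using only public pyarrow calls (the real code goes through the internal transform_pyarrow helpers, which additionally handle extension types):

import pyarrow as pa


def table_to_numpy_columns(table: pa.Table) -> dict:
    out = {}
    for name in table.column_names:
        col = table[name]
        # Chunked columns must be made contiguous before NumPy conversion.
        if col.num_chunks == 0:
            combined = pa.array([], type=col.type)
        else:
            combined = col.combine_chunks()
        out[name] = combined.to_numpy(zero_copy_only=False)
    return out


t = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
print(table_to_numpy_columns(t))  # {'a': array([1, 2, 3]), 'b': array(...)}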
- # Example: ray.data.from_numpy(np.arange(12).reshape((3, 2, 2))) - # Arrow’s incorrect concatenation of extension arrays: - # https://issues.apache.org/jira/browse/ARROW-16503 - return _concatenate_extension_column(data[TENSOR_COLUMN_NAME]).to_numpy( - zero_copy_only=False - ) - else: - output_dict = {} - for col_name in data.column_names: - col = data[col_name] - if col.num_chunks == 0: - col = pyarrow.array([], type=col.type) - elif _is_column_extension_type(col): - # Arrow’s incorrect concatenation of extension arrays: - # https://issues.apache.org/jira/browse/ARROW-16503 - col = _concatenate_extension_column(col) - else: - col = col.combine_chunks() - output_dict[col_name] = col.to_numpy(zero_copy_only=False) - return output_dict + return column_values_ndarrays[0] + + return dict(zip(data.column_names, column_values_ndarrays)) elif isinstance(data, pd.DataFrame): return _convert_pandas_to_batch_type(data, BatchFormat.NUMPY) else: diff --git a/python/ray/air/util/object_extensions/arrow.py b/python/ray/air/util/object_extensions/arrow.py index c3158bbff68b..a56a04869855 100644 --- a/python/ray/air/util/object_extensions/arrow.py +++ b/python/ray/air/util/object_extensions/arrow.py @@ -16,7 +16,7 @@ PYARROW_VERSION = None if _VER is None else parse_version(_VER) -def object_extension_type_allowed() -> bool: +def _object_extension_type_allowed() -> bool: return ( PYARROW_VERSION is not None and PYARROW_VERSION >= MIN_PYARROW_VERSION_SCALAR_SUBCLASS @@ -104,7 +104,9 @@ def from_objects( arr = pa.array(all_dumped_bytes, type=type_.storage_type) return ArrowPythonObjectArray.from_storage(type_, arr) - def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: + def to_numpy( + self, zero_copy_only: bool = False, writable: bool = False + ) -> np.ndarray: arr = np.empty(len(self), dtype=object) arr[:] = self.to_pylist() return arr diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/air/util/tensor_extensions/arrow.py index bef940c136f9..ebe01c792458 100644 --- a/python/ray/air/util/tensor_extensions/arrow.py +++ b/python/ray/air/util/tensor_extensions/arrow.py @@ -10,10 +10,14 @@ from packaging.version import parse as parse_version from ray._private.utils import _get_pyarrow_version +from ray.air.constants import TENSOR_COLUMN_NAME from ray.air.util.tensor_extensions.utils import ( + _is_ndarray_tensor, _is_ndarray_variable_shaped_tensor, create_ragged_ndarray, ) +from ray.data._internal.util import GiB +from ray.util import log_once from ray.util.annotations import DeveloperAPI, PublicAPI PYARROW_VERSION = _get_pyarrow_version() @@ -25,9 +29,15 @@ # Minimum version of Arrow that supports subclassable ExtensionScalars. # TODO(Clark): Remove conditional definition once we only support Arrow 9.0.0+. 
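The renamed `_object_extension_type_allowed` gate reduces to a version comparison against the constant defined just below. A standalone equivalent (assuming pyarrow is importable; the real helper also tolerates pyarrow being absent):

import pyarrow
from packaging.version import parse as parse_version

# Subclassable ExtensionScalars, needed by ArrowPythonObjectType, require Arrow 9+.
MIN_PYARROW_VERSION_SCALAR_SUBCLASS = parse_version("9.0.0")


def object_extension_allowed() -> bool:
    return parse_version(pyarrow.__version__) >= MIN_PYARROW_VERSION_SCALAR_SUBCLASS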
MIN_PYARROW_VERSION_SCALAR_SUBCLASS = parse_version("9.0.0") +# Minimum version supporting `zero_copy_only` flag in `ChunkedArray.to_numpy` +MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY = parse_version("13.0.0") NUM_BYTES_PER_UNICODE_CHAR = 4 +# NOTE: Overflow threshold in bytes for most Arrow types using int32 as +# its offsets +INT32_OVERFLOW_THRESHOLD = 2 * GiB + logger = logging.getLogger(__name__) @@ -85,14 +95,152 @@ def pyarrow_table_from_pydict( raise ArrowConversionError(str(pydict)) from e -@DeveloperAPI -def convert_list_to_pyarrow_array( - val: List[Any], enclosing_dict: Dict[str, Any] +@DeveloperAPI(stability="alpha") +def convert_to_pyarrow_array(column_values: np.ndarray, column_name: str) -> pa.Array: + """Converts provided NumPy `ndarray` into PyArrow's `array` while utilizing + both Arrow's natively supported types as well as custom extension types: + + - ArrowTensorArray (for tensors) + - ArrowPythonObjectArray (for user-defined python class objects, as well as + any python object that aren't represented by a corresponding Arrow's native + scalar type) + """ + + try: + # Since Arrow does NOT support tensors (aka multidimensional arrays) natively, + # we have to make sure that we handle this case utilizing `ArrowTensorArray` + # extension type + if column_name == TENSOR_COLUMN_NAME or _is_ndarray_tensor(column_values): + from ray.data.extensions.tensor_extension import ArrowTensorArray + + return ArrowTensorArray.from_numpy(column_values, column_name) + else: + return _convert_to_pyarrow_native_array(column_values, column_name) + + except ArrowConversionError as ace: + from ray.data.extensions.object_extension import ( + ArrowPythonObjectArray, + _object_extension_type_allowed, + ) + + if not _object_extension_type_allowed(): + should_serialize_as_object_ext_type = False + object_ext_type_detail = ( + "skipping fallback to serialize as pickled python" + f" objects (due to unsupported Arrow version {PYARROW_VERSION}, " + f"min required version is {MIN_PYARROW_VERSION_SCALAR_SUBCLASS})" + ) + else: + from ray.data import DataContext + + if not DataContext.get_current().enable_fallback_to_arrow_object_ext_type: + should_serialize_as_object_ext_type = False + object_ext_type_detail = ( + "skipping fallback to serialize as pickled python objects " + "(due to DataContext.enable_fallback_to_arrow_object_ext_type " + "= False)" + ) + else: + should_serialize_as_object_ext_type = True + object_ext_type_detail = ( + "falling back to serialize as pickled python objects" + ) + + # NOTE: To avoid logging following warning for every block it's + # only going to be logged in following cases + # - When fallback is disabled, or + # - It's being logged for the first time + if not should_serialize_as_object_ext_type or log_once( + "_fallback_to_arrow_object_extension_type_warning" + ): + logger.warning( + f"Failed to convert column '{column_name}' into pyarrow " + f"array due to: {ace}; {object_ext_type_detail}", + exc_info=ace, + ) + + # If `ArrowPythonObjectType` is not supported raise original exception + if not should_serialize_as_object_ext_type: + raise + + # Otherwise, attempt to fall back to serialize as python objects + return ArrowPythonObjectArray.from_objects(column_values) + + +def _convert_to_pyarrow_native_array( + column_values: np.ndarray, column_name: str ) -> pa.Array: + """Converts provided NumPy `ndarray` into PyArrow's `array` while only utilizing + Arrow's natively supported types (ie no custom extension types)""" + try: - return pa.array(val) + # NOTE: 
We explicitly infer PyArrow `DataType` so that + # we can perform upcasting to be able to accommodate + # blocks that are larger than 2Gb in size (limited + # by int32 offsets used by Arrow internally) + dtype = _infer_pyarrow_type(column_values) + + logger.log( + logging.getLevelName("TRACE"), + f"Inferred dtype of '{dtype}' for column '{column_name}'", + ) + + return pa.array(column_values, type=dtype) except Exception as e: - raise ArrowConversionError(str(enclosing_dict)) from e + raise ArrowConversionError(str(column_values)) from e + + +def _infer_pyarrow_type(column_values: np.ndarray) -> Optional[pa.DataType]: + """Infers target Pyarrow `DataType` based on the provided + columnar values. + + NOTE: This is a wrapper on top of `pa.infer_type(...)` utility + performing up-casting of `binary` and `string` types to + corresponding `large_binary` and `large_string` types in case + any of the array elements exceeds 2Gb in size therefore + making it impossible for original types to accommodate such + values. + + Unfortunately, for unknown reasons PA doesn't perform + that upcasting itself henceforth we have to do perform + it manually + + Args: + column_values: List of columnar values + + Returns: + Instance of PyArrow's `DataType` based on the provided + column values + """ + + if len(column_values) == 0: + return None + + inferred_pa_dtype = pa.infer_type(column_values) + + def _len_gt_overflow_threshold(obj: Any) -> bool: + # NOTE: This utility could be seeing objects other than strings or bytes in + # cases when column contains non-scalar non-homogeneous object types as + # column values, therefore making Arrow unable to infer corresponding + # column type appropriately, therefore falling back to assume the type + # of the first element in the list. + # + # Check out test cases for this method for an additional context. + if isinstance(obj, (str, bytes)): + return len(obj) > INT32_OVERFLOW_THRESHOLD + + return False + + if pa.types.is_binary(inferred_pa_dtype) and any( + [_len_gt_overflow_threshold(v) for v in column_values] + ): + return pa.large_binary() + elif pa.types.is_string(inferred_pa_dtype) and any( + [_len_gt_overflow_threshold(v) for v in column_values] + ): + return pa.large_string() + + return inferred_pa_dtype @DeveloperAPI @@ -427,7 +575,13 @@ def from_numpy( # Stack ndarrays and pass through to ndarray handling logic below. try: arr = np.stack(arr, axis=0) - except ValueError: + except ValueError as ve: + logger.warning( + f"Failed to stack lists due to: {ve}; " + f"falling back to using np.array(..., dtype=object)", + exc_info=ve, + ) + # ndarray stacking may fail if the arrays are heterogeneously-shaped. arr = np.array(arr, dtype=object) if not isinstance(arr, np.ndarray): diff --git a/python/ray/air/util/tensor_extensions/utils.py b/python/ray/air/util/tensor_extensions/utils.py index be250d89a04d..dfaa95a0acab 100644 --- a/python/ray/air/util/tensor_extensions/utils.py +++ b/python/ray/air/util/tensor_extensions/utils.py @@ -9,9 +9,28 @@ from pandas.core.dtypes.generic import ABCSeries +def _is_ndarray_tensor(arr: np.ndarray) -> bool: + """Return whether the provided NumPy ndarray is comprised of tensors. 
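The upcasting performed by `_infer_pyarrow_type` above can be exercised with a toy threshold; the real cutoff is `INT32_OVERFLOW_THRESHOLD` (2 GiB), which is impractical to demonstrate, so the constant here is shrunk for illustration:

import pyarrow as pa

TOY_THRESHOLD = 8  # stands in for INT32_OVERFLOW_THRESHOLD


def infer_with_upcast(values):
    inferred = pa.infer_type(values)
    oversized = any(
        isinstance(v, (str, bytes)) and len(v) > TOY_THRESHOLD for v in values
    )
    if pa.types.is_binary(inferred) and oversized:
        return pa.large_binary()
    if pa.types.is_string(inferred) and oversized:
        return pa.large_string()
    return inferred


print(infer_with_upcast(["short", "x" * 16]))  # large_string
print(infer_with_upcast(["short", "tiny"]))    # string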
+ + NOTE: Tensor is defined as a NumPy array such that `len(arr.shape) > 1` + """ + + # Case of uniform-shaped (ie non-ragged) tensor + if arr.ndim > 1: + return True + + # Case of ragged tensor (as produced by `create_ragged_ndarray` utility) + elif ( + arr.dtype.type is np.object_ and len(arr) > 0 and isinstance(arr[0], np.ndarray) + ): + return True + + return False + + def _is_ndarray_variable_shaped_tensor(arr: np.ndarray) -> bool: - """Return whether the provided NumPy ndarray is representing a variable-shaped - tensor. + """Return whether the provided NumPy ndarray is comprised of variable-shaped + tensors. NOTE: This is an O(rows) check. """ @@ -69,7 +88,7 @@ def _create_possibly_ragged_ndarray( @PublicAPI(stability="alpha") -def create_ragged_ndarray(values: Sequence[np.ndarray]) -> np.ndarray: +def create_ragged_ndarray(values: Sequence[Any]) -> np.ndarray: """Create an array that contains arrays of different length If you're working with variable-length arrays like images, use this function to diff --git a/python/ray/autoscaler/_private/cli_logger.py b/python/ray/autoscaler/_private/cli_logger.py index 01083be23eff..5172891d3119 100644 --- a/python/ray/autoscaler/_private/cli_logger.py +++ b/python/ray/autoscaler/_private/cli_logger.py @@ -113,39 +113,6 @@ def __getattr__(self, name): colorama.init(strip=False) -def _patched_makeRecord( - self, name, level, fn, lno, msg, args, exc_info, func=None, extra=None, sinfo=None -): - """Monkey-patched version of logging.Logger.makeRecord - We have to patch default loggers so they use the proper frame for - line numbers and function names (otherwise everything shows up as - e.g. cli_logger:info() instead of as where it was called from). - - In Python 3.8 we could just use stacklevel=2, but we have to support - Python 3.6 and 3.7 as well. - - The solution is this Python magic superhack. - - The default makeRecord will deliberately check that we don't override - any existing property on the LogRecord using `extra`, - so we remove that check. - - This patched version is otherwise identical to the one in the standard - library. - - TODO: Remove this magic superhack. Find a more responsible workaround. - """ - rv = logging._logRecordFactory( - name, level, fn, lno, msg, args, exc_info, func, sinfo - ) - if extra is not None: - rv.__dict__.update(extra) - return rv - - -logging.Logger.makeRecord = _patched_makeRecord - - def _external_caller_info(): """Get the info from the caller frame. 
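A quick check of the two `_is_ndarray_tensor` branches defined above, a uniform ndarray with `ndim > 1` versus a ragged object array of ndarrays (both helpers live in a private module, so this is illustrative only):

import numpy as np

from ray.air.util.tensor_extensions.utils import (
    _is_ndarray_tensor,
    create_ragged_ndarray,
)

uniform = np.zeros((3, 2, 2))  # ndim > 1: uniform tensor column
ragged = create_ragged_ndarray([np.zeros((2, 2)), np.zeros((3, 3))])
scalars = np.array([1, 2, 3])  # 1-D scalars: not a tensor column

assert _is_ndarray_tensor(uniform)
assert _is_ndarray_tensor(ragged)   # object dtype whose first element is an ndarray
assert not _is_ndarray_tensor(scalars)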
diff --git a/python/ray/autoscaler/_private/commands.py b/python/ray/autoscaler/_private/commands.py index 3c03738854f7..9a9b9d91cc2f 100644 --- a/python/ray/autoscaler/_private/commands.py +++ b/python/ray/autoscaler/_private/commands.py @@ -1153,16 +1153,15 @@ def exec_cluster( }, docker_config=config.get("docker"), ) - shutdown_after_run = False if cmd and stop: cmd = "; ".join( [ cmd, "ray stop", "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only", + "sudo shutdown -h now", ] ) - shutdown_after_run = True result = _exec( updater, @@ -1172,7 +1171,7 @@ def exec_cluster( port_forward=port_forward, with_output=with_output, run_env=run_env, - shutdown_after_run=shutdown_after_run, + shutdown_after_run=False, extra_screen_args=extra_screen_args, ) if tmux or screen: diff --git a/python/ray/autoscaler/_private/kuberay/autoscaling_config.py b/python/ray/autoscaler/_private/kuberay/autoscaling_config.py index d74bb253560a..0bf61b311128 100644 --- a/python/ray/autoscaler/_private/kuberay/autoscaling_config.py +++ b/python/ray/autoscaler/_private/kuberay/autoscaling_config.py @@ -30,7 +30,7 @@ # Logical group name for the KubeRay head group. # Used as the name of the "head node type" by the autoscaler. -_HEAD_GROUP_NAME = "head-group" +_HEAD_GROUP_NAME = "headgroup" class AutoscalingConfigProducer: @@ -219,7 +219,7 @@ def _node_type_from_group_spec( resources = _get_ray_resources_from_group_spec(group_spec, is_head) - return { + node_type = { "min_workers": min_workers, "max_workers": max_workers, # `node_config` is a legacy field required for compatibility. @@ -228,6 +228,12 @@ def _node_type_from_group_spec( "resources": resources, } + idle_timeout_s = group_spec.get(IDLE_SECONDS_KEY) + if idle_timeout_s is not None: + node_type["idle_timeout_s"] = float(idle_timeout_s) + + return node_type + def _get_ray_resources_from_group_spec( group_spec: Dict[str, Any], is_head: bool diff --git a/python/ray/autoscaler/_private/kuberay/node_provider.py b/python/ray/autoscaler/_private/kuberay/node_provider.py index 060e4794867d..5378347ba78a 100644 --- a/python/ray/autoscaler/_private/kuberay/node_provider.py +++ b/python/ray/autoscaler/_private/kuberay/node_provider.py @@ -38,8 +38,6 @@ # Kind label value indicating the pod is the worker. KUBERAY_KIND_WORKER = "worker" -# Group name (node type) to use for the head. -KUBERAY_TYPE_HEAD = "head-group" # KubeRay CRD version KUBERAY_CRD_VER = os.getenv("KUBERAY_CRD_VER", "v1alpha1") @@ -104,12 +102,12 @@ def kind_and_type(pod: Dict[str, Any]) -> Tuple[NodeKind, NodeType]: from a Ray pod's labels. """ labels = pod["metadata"]["labels"] - if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD: - kind = NODE_KIND_HEAD - type = KUBERAY_TYPE_HEAD - else: - kind = NODE_KIND_WORKER - type = labels[KUBERAY_LABEL_KEY_TYPE] + kind = ( + NODE_KIND_HEAD + if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD + else NODE_KIND_WORKER + ) + type = labels[KUBERAY_LABEL_KEY_TYPE] return kind, type diff --git a/python/ray/autoscaler/_private/kuberay/run_autoscaler.py b/python/ray/autoscaler/_private/kuberay/run_autoscaler.py index 452bfef66c3e..efafac6d8f37 100644 --- a/python/ray/autoscaler/_private/kuberay/run_autoscaler.py +++ b/python/ray/autoscaler/_private/kuberay/run_autoscaler.py @@ -44,12 +44,12 @@ def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str): "--skip-version-check", ] ) - # Logging is not ready yet. Print to stdout for now. - print("The Ray head is ready. Starting the autoscaler.") + logger.info("The Ray head is ready. 
Starting the autoscaler.") break except subprocess.CalledProcessError: - print("The Ray head is not yet ready.") - print(f"Will check again in {BACKOFF_S} seconds.") + logger.warning( + f"The Ray head is not ready. Will check again in {BACKOFF_S} seconds." + ) time.sleep(BACKOFF_S) # The Ray head container sets up the log directory. Thus, we set up logging diff --git a/python/ray/autoscaler/_private/monitor.py b/python/ray/autoscaler/_private/monitor.py index a641652615df..90b5610f59ae 100644 --- a/python/ray/autoscaler/_private/monitor.py +++ b/python/ray/autoscaler/_private/monitor.py @@ -604,16 +604,6 @@ def log_resource_batch_data_if_desired( parser.add_argument( "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS." ) - parser.add_argument( - "--redis-address", required=False, type=str, help="This is deprecated" - ) - parser.add_argument( - "--redis-password", - required=False, - type=str, - default=None, - help="This is deprecated", - ) parser.add_argument( "--autoscaling-config", required=False, diff --git a/python/ray/autoscaler/aws/tests/aws_compute.yaml b/python/ray/autoscaler/aws/tests/aws_compute.yaml index 1ef4e02ba1e8..8bf740d8eeed 100644 --- a/python/ray/autoscaler/aws/tests/aws_compute.yaml +++ b/python/ray/autoscaler/aws/tests/aws_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: IamInstanceProfile: {"Name": "ray-autoscaler-v1"} head_node_type: diff --git a/python/ray/autoscaler/azure/example-full.yaml b/python/ray/autoscaler/azure/example-full.yaml index a043603bfaaa..41d7fbfd60d1 100644 --- a/python/ray/autoscaler/azure/example-full.yaml +++ b/python/ray/autoscaler/azure/example-full.yaml @@ -164,9 +164,8 @@ setup_commands: [] # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl" # Custom commands that will be run on the head node after common setup. -# NOTE: rayproject/ray-ml:latest has azure packages bundled -head_setup_commands: [] - # - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0 +head_setup_commands: + - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4 # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/python/ray/autoscaler/azure/example-gpu-docker.yaml b/python/ray/autoscaler/azure/example-gpu-docker.yaml index 6faaed48fb64..3ebc763e7d26 100644 --- a/python/ray/autoscaler/azure/example-gpu-docker.yaml +++ b/python/ray/autoscaler/azure/example-gpu-docker.yaml @@ -117,9 +117,8 @@ setup_commands: [] # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl" # Custom commands that will be run on the head node after common setup. -# NOTE: rayproject/ray-ml:latest has azure packages bundled -head_setup_commands: [] - # - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0 +head_setup_commands: + - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4 # Custom commands that will be run on worker nodes after common setup. 
worker_setup_commands: [] diff --git a/python/ray/autoscaler/gcp/example-full.yaml b/python/ray/autoscaler/gcp/example-full.yaml index f5b30613aed9..2fa4c1211752 100644 --- a/python/ray/autoscaler/gcp/example-full.yaml +++ b/python/ray/autoscaler/gcp/example-full.yaml @@ -70,7 +70,7 @@ available_node_types: initializeParams: diskSizeGb: 50 # See https://cloud.google.com/compute/docs/images for more images - sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 # Additional options can be found in in the compute docs at # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert @@ -105,7 +105,7 @@ available_node_types: initializeParams: diskSizeGb: 50 # See https://cloud.google.com/compute/docs/images for more images - sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 # Run workers on preemtible instance by default. # Comment this out to use on-demand. scheduling: diff --git a/python/ray/autoscaler/gcp/example-minimal-pinned.yaml b/python/ray/autoscaler/gcp/example-minimal-pinned.yaml new file mode 100644 index 000000000000..ce42e6705854 --- /dev/null +++ b/python/ray/autoscaler/gcp/example-minimal-pinned.yaml @@ -0,0 +1,36 @@ +auth: + ssh_user: ubuntu +cluster_name: minimal +provider: + availability_zone: us-west1-a + project_id: null # TODO: set your GCP project ID here + region: us-west1 + type: gcp + +# Needs to pin the VM images for stability.. +available_node_types: + ray_head_default: + resources: {"CPU": 2} + node_config: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 + ray_worker_small: + min_workers: 0 + resources: {"CPU": 2} + node_config: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922 + scheduling: + - preemptible: true diff --git a/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml b/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml index c6d1a6729fa0..466d7fe8602c 100644 --- a/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml +++ b/python/ray/autoscaler/gcp/tests/single_node_32_cpu_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml b/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml index f9e1a6cef375..d57f5d6f23b9 100644 --- a/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml +++ b/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml @@ -18,7 +18,7 @@ spec: serviceType: ClusterIP # the pod replicas in this group typed head (assuming there could be more than 1 in the future) replicas: 1 - # logical group name, for this called head-group, also can be functional + # logical group name, for this called headgroup, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup # the following params are used to complete the ray start: ray start --head --block --port=6379 ... 
@@ -108,7 +108,7 @@ spec: workerGroupSpecs: # the pod replicas in this group typed worker - replicas: 1 - minReplicas: 1 + minReplicas: 0 maxReplicas: 300 # logical group name, for this called small-group, also can be functional groupName: small-group diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index ad5da68ea2a0..2e07dadac912 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -345,6 +345,7 @@ }, "min_workers": {"type": "integer"}, "max_workers": {"type": "integer"}, + "idle_timeout_s": {"type": "number", "nullable": true}, "resources": { "type": "object", "patternProperties": { diff --git a/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py b/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py index c685be58cf60..c1b8ddc2a31b 100644 --- a/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py +++ b/python/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py @@ -55,13 +55,12 @@ def __init__( """ Args: cluster_name: The name of the RayCluster resource. - namespace: The namespace of the RayCluster resource. + provider_config: The namespace of the RayCluster. k8s_api_client: The client to the Kubernetes API server. This could be used to mock the Kubernetes API server for testing. """ self._cluster_name = cluster_name self._namespace = provider_config["namespace"] - self._head_node_type = provider_config["head_node_type"] self._k8s_api_client = k8s_api_client or KubernetesHttpApiClient( namespace=self._namespace @@ -210,21 +209,25 @@ def _initialize_scale_request( cur_instances = self.instances # Get the worker groups that have pending deletes and the worker groups that - # have finished deletes. + # have finished deletes, and the set of workers included in the workersToDelete + # field of any worker group. ( worker_groups_with_pending_deletes, worker_groups_without_pending_deletes, - ) = self._get_workers_groups_with_deletes( - ray_cluster, set(cur_instances.keys()) - ) + worker_to_delete_set, + ) = self._get_workers_delete_info(ray_cluster, set(cur_instances.keys())) # Calculate the desired number of workers by type. num_workers_dict = defaultdict(int) - for _, cur_instance in cur_instances.items(): - if cur_instance.node_kind == NodeKind.HEAD: - # Only track workers. - continue - num_workers_dict[cur_instance.node_type] += 1 + worker_groups = ray_cluster["spec"].get("workerGroupSpecs", []) + for worker_group in worker_groups: + node_type = worker_group["groupName"] + # Handle the case where users manually increase `minReplicas` + # to scale up the number of worker Pods. In this scenario, + # `replicas` will be smaller than `minReplicas`. + num_workers_dict[node_type] = max( + worker_group["replicas"], worker_group["minReplicas"] + ) # Add to launch nodes. for node_type, count in to_launch.items(): @@ -243,6 +246,11 @@ def _initialize_scale_request( # Not possible to delete head node. continue + if to_delete_instance.cloud_instance_id in worker_to_delete_set: + # If the instance is already in the workersToDelete field of + # any worker group, skip it. + continue + num_workers_dict[to_delete_instance.node_type] -= 1 assert num_workers_dict[to_delete_instance.node_type] >= 0 to_delete_instances_by_type[to_delete_instance.node_type].append( @@ -322,6 +330,7 @@ def _submit_scale_request( # No patch required. 
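The `max(replicas, minReplicas)` accounting in `_initialize_scale_request` above is what makes a manual `minReplicas` bump scale the cluster up. A minimal sketch with a hypothetical group spec:

# A worker group where a user raised minReplicas while replicas lags behind.
ray_cluster = {
    "spec": {
        "workerGroupSpecs": [
            {"groupName": "small-group", "replicas": 0, "minReplicas": 3},
        ]
    }
}

num_workers = {
    g["groupName"]: max(g["replicas"], g["minReplicas"])
    for g in ray_cluster["spec"].get("workerGroupSpecs", [])
}
assert num_workers == {"small-group": 3}  # KubeRay will still run 3 Pods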
return + logger.info(f"Submitting a scale request: {scale_request}") self._patch(f"rayclusters/{self._cluster_name}", patch_payload) def _add_launch_errors( @@ -393,9 +402,9 @@ def instances(self) -> Dict[CloudInstanceId, CloudInstance]: return copy.deepcopy(self._cached_instances) @staticmethod - def _get_workers_groups_with_deletes( + def _get_workers_delete_info( ray_cluster_spec: Dict[str, Any], node_set: Set[CloudInstanceId] - ) -> Tuple[Set[NodeType], Set[NodeType]]: + ) -> Tuple[Set[NodeType], Set[NodeType], Set[CloudInstanceId]]: """ Gets the worker groups that have pending deletes and the worker groups that have finished deletes. @@ -405,10 +414,13 @@ def _get_workers_groups_with_deletes( deletes. worker_groups_with_finished_deletes: The worker groups that have finished deletes. + worker_to_delete_set: A set of Pods that are included in the workersToDelete + field of any worker group. """ worker_groups_with_pending_deletes = set() worker_groups_with_deletes = set() + worker_to_delete_set = set() worker_groups = ray_cluster_spec["spec"].get("workerGroupSpecs", []) for worker_group in worker_groups: @@ -423,6 +435,7 @@ def _get_workers_groups_with_deletes( worker_groups_with_deletes.add(node_type) for worker in workersToDelete: + worker_to_delete_set.add(worker) if worker in node_set: worker_groups_with_pending_deletes.add(node_type) break @@ -430,7 +443,11 @@ def _get_workers_groups_with_deletes( worker_groups_with_finished_deletes = ( worker_groups_with_deletes - worker_groups_with_pending_deletes ) - return worker_groups_with_pending_deletes, worker_groups_with_finished_deletes + return ( + worker_groups_with_pending_deletes, + worker_groups_with_finished_deletes, + worker_to_delete_set, + ) def _fetch_instances(self) -> Dict[CloudInstanceId, CloudInstance]: """ @@ -478,26 +495,23 @@ def _fetch_instances(self) -> Dict[CloudInstanceId, CloudInstance]: # Ignore pods marked for termination. continue pod_name = pod["metadata"]["name"] - cloud_instance = self._cloud_instance_from_pod(pod, self._head_node_type) + cloud_instance = self._cloud_instance_from_pod(pod) if cloud_instance: cloud_instances[pod_name] = cloud_instance return cloud_instances @staticmethod - def _cloud_instance_from_pod( - pod: Dict[str, Any], head_node_type: NodeType - ) -> Optional[CloudInstance]: + def _cloud_instance_from_pod(pod: Dict[str, Any]) -> Optional[CloudInstance]: """ Convert a pod to a Ray CloudInstance. Args: pod: The pod resource dict. - head_node_type: The node type of the head node. """ labels = pod["metadata"]["labels"] if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD: kind = NodeKind.HEAD - type = head_node_type + type = labels[KUBERAY_LABEL_KEY_TYPE] elif labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_WORKER: kind = NodeKind.WORKER type = labels[KUBERAY_LABEL_KEY_TYPE] diff --git a/python/ray/autoscaler/v2/instance_manager/config.py b/python/ray/autoscaler/v2/instance_manager/config.py index c9597eef6c73..a7e582eacdbc 100644 --- a/python/ray/autoscaler/v2/instance_manager/config.py +++ b/python/ray/autoscaler/v2/instance_manager/config.py @@ -128,6 +128,8 @@ class NodeTypeConfig: min_worker_nodes: int # The maximal number of worker nodes can be launched for this node type. max_worker_nodes: int + # Idle timeout seconds for worker nodes of this node type. + idle_timeout_s: Optional[float] = None # The total resources on the node. resources: Dict[str, float] = field(default_factory=dict) # The labels on the node. 
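The new `idle_timeout_s` knob on `NodeTypeConfig` overrides the cluster-wide idle timeout, as the scheduler change below shows. A sketch of the precedence (field names follow the dataclass above; unspecified fields keep their defaults):

from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig

cfg = NodeTypeConfig(
    name="gpu-group",
    min_worker_nodes=0,
    max_worker_nodes=5,
    idle_timeout_s=120.0,  # reclaim idle nodes of this type after 2 minutes
    resources={"GPU": 1},
)

cluster_wide_idle_timeout_s = 600.0  # what ctx.get_idle_timeout_s() returns
effective = (
    cfg.idle_timeout_s
    if cfg.idle_timeout_s is not None
    else cluster_wide_idle_timeout_s
)
assert effective == 120.0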
@@ -346,6 +348,7 @@ def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]: name=node_type, min_worker_nodes=node_config.get("min_workers", 0), max_worker_nodes=max_workers_nodes, + idle_timeout_s=node_config.get("idle_timeout_s", None), resources=node_config.get("resources", {}), labels=node_config.get("labels", {}), launch_config_hash=launch_config_hash, diff --git a/python/ray/autoscaler/v2/instance_manager/instance_manager.py b/python/ray/autoscaler/v2/instance_manager/instance_manager.py index aa926ba92747..6a1f6e207408 100644 --- a/python/ray/autoscaler/v2/instance_manager/instance_manager.py +++ b/python/ray/autoscaler/v2/instance_manager/instance_manager.py @@ -58,7 +58,7 @@ def update_instance_manager_state( """ Updates the instance manager state. - If there's a any failure, no updates would be made and the reply + If there's any failure, no updates would be made and the reply would contain the latest version of the instance manager state, and the error info. @@ -80,7 +80,7 @@ def update_instance_manager_state( f"Version mismatch: expected: {request.expected_version}, " f"actual: {version}" ) - logger.warn(err_str) + logger.warning(err_str) return self._get_update_im_state_reply( StatusCode.VERSION_MISMATCH, version, @@ -110,7 +110,7 @@ def update_instance_manager_state( err_str = ( f"Version mismatch: expected: {version}, actual: {result.version}" ) - logger.warn(err_str) + logger.warning(err_str) return self._get_update_im_state_reply( StatusCode.VERSION_MISMATCH, result.version, err_str ) diff --git a/python/ray/autoscaler/v2/monitor.py b/python/ray/autoscaler/v2/monitor.py index 8277addc0017..558725f3e78f 100644 --- a/python/ray/autoscaler/v2/monitor.py +++ b/python/ray/autoscaler/v2/monitor.py @@ -17,6 +17,7 @@ from ray._private.event.event_logger import get_event_logger from ray._private.ray_logging import setup_component_logger from ray._private.usage.usage_lib import record_extra_usage_tag +from ray._private.worker import SCRIPT_MODE from ray._raylet import GcsClient from ray.autoscaler._private.constants import ( AUTOSCALER_METRIC_PORT, @@ -77,7 +78,7 @@ def __init__( ) self._session_name = self._get_session_name(self.gcs_client) logger.info(f"session_name: {self._session_name}") - worker.mode = 0 + worker.set_mode(SCRIPT_MODE) head_node_ip = self.gcs_address.split(":")[0] self.autoscaler = None @@ -197,16 +198,6 @@ def record_autoscaler_v2_usage(gcs_client: GcsClient) -> None: parser.add_argument( "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS." ) - parser.add_argument( - "--redis-address", required=False, type=str, help="This is deprecated" - ) - parser.add_argument( - "--redis-password", - required=False, - type=str, - default=None, - help="This is deprecated", - ) parser.add_argument( "--autoscaling-config", required=False, diff --git a/python/ray/autoscaler/v2/scheduler.py b/python/ray/autoscaler/v2/scheduler.py index 3732a6282632..2d5a70065066 100644 --- a/python/ray/autoscaler/v2/scheduler.py +++ b/python/ray/autoscaler/v2/scheduler.py @@ -1584,6 +1584,11 @@ def _enforce_idle_termination( continue idle_timeout_s = ctx.get_idle_timeout_s() + # Override the scheduler idle_timeout_s if set for this node_type. + node_type = node.node_type + if node_type in node_type_configs: + if node_type_configs[node_type].idle_timeout_s is not None: + idle_timeout_s = node_type_configs[node_type].idle_timeout_s if idle_timeout_s is None: # No idle timeout is set, skip the idle termination. 
continue @@ -1606,7 +1611,6 @@ def _enforce_idle_termination( # Honor the min_worker_nodes setting for the node type. min_count = 0 - node_type = node.node_type if node_type in node_type_configs: min_count = node_type_configs[node_type].min_worker_nodes if ( diff --git a/python/ray/autoscaler/v2/tests/test_node_provider.py b/python/ray/autoscaler/v2/tests/test_node_provider.py index 5141891c0a36..47483d3f61fa 100644 --- a/python/ray/autoscaler/v2/tests/test_node_provider.py +++ b/python/ray/autoscaler/v2/tests/test_node_provider.py @@ -19,10 +19,7 @@ AUTOSCALER_MAX_LAUNCH_BATCH, ) from ray.autoscaler._private.fake_multi_node.node_provider import FakeMultiNodeProvider -from ray.autoscaler._private.kuberay.node_provider import ( - KUBERAY_TYPE_HEAD, - IKubernetesHttpApiClient, -) +from ray.autoscaler._private.kuberay.node_provider import IKubernetesHttpApiClient from ray.autoscaler.v2.instance_manager.cloud_providers.kuberay.cloud_provider import ( KubeRayProvider, ) @@ -372,7 +369,7 @@ def setUp(self): cluster_name="test", provider_config={ "namespace": "default", - "head_node_type": KUBERAY_TYPE_HEAD, + "head_node_type": "headgroup", }, k8s_api_client=self.mock_client, ) @@ -389,7 +386,7 @@ def test_get_nodes(self): "raycluster-autoscaler-head-8zsc8": CloudInstance( cloud_instance_id="raycluster-autoscaler-head-8zsc8", node_kind=NodeKind.HEAD, - node_type="head-group", + node_type="headgroup", is_running=True, ), # up-to-date status because the Ray container is in running status "raycluster-autoscaler-worker-small-group-dkz2r": CloudInstance( @@ -495,6 +492,124 @@ def test_pending_deletes(self): }, ] + def test_increase_min_replicas_to_scale_up(self): + # Simulate the case where users manually increase the `minReplicas` field + # from 0 to $num_pods. KubeRay will create $num_pods worker Pods to meet the new + # `minReplicas`, even though the `replicas` field is still 0. + small_group = "small-group" + num_pods = 0 + assert ( + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] + == small_group + ) + for pod in self.mock_client._pod_list["items"]: + if pod["metadata"]["labels"]["ray.io/group"] == small_group: + num_pods += 1 + assert num_pods > 0 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["replicas"] = 0 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ + "minReplicas" + ] = num_pods + + # Launching a new node and `replicas` should be + # `max(replicas, minReplicas) + 1`. + self.provider.launch(shape={small_group: 1}, request_id="launch-1") + patches = self.mock_client.get_patches( + f"rayclusters/{self.provider._cluster_name}" + ) + assert len(patches) == 1 + assert patches[0] == { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/replicas", + "value": num_pods + 1, + } + + def test_inconsistent_pods_raycr_scale_up(self): + """ + Test the case where the cluster state has not yet reached the desired state. + Specifically, the replicas field in the RayCluster CR does not match the actual + number of Pods. + """ + # Check the assumptions of the test + small_group = "small-group" + num_pods = 0 + for pod in self.mock_client._pod_list["items"]: + if pod["metadata"]["labels"]["ray.io/group"] == small_group: + num_pods += 1 + + assert ( + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] + == small_group + ) + desired_replicas = num_pods + 1 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ + "replicas" + ] = desired_replicas + + # Launch a new node. 
The replicas field should be incremented by 1, even though + # the cluster state has not yet reached the goal state. + launch_request = {"small-group": 1} + self.provider.launch(shape=launch_request, request_id="launch-1") + + patches = self.mock_client.get_patches( + f"rayclusters/{self.provider._cluster_name}" + ) + assert len(patches) == 1 + assert patches[0] == { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/replicas", + "value": desired_replicas + 1, + } + + def test_inconsistent_pods_raycr_scale_down(self): + """ + Test the case where the cluster state has not yet reached the desired state. + Specifically, the replicas field in the RayCluster CR does not match the actual + number of Pods. + """ + # Check the assumptions of the test + small_group = "small-group" + num_pods = 0 + pod_to_delete = None + for pod in self.mock_client._pod_list["items"]: + if pod["metadata"]["labels"]["ray.io/group"] == small_group: + num_pods += 1 + pod_to_delete = pod["metadata"]["name"] + assert pod_to_delete is not None + + assert ( + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0]["groupName"] + == small_group + ) + desired_replicas = num_pods + 1 + self.mock_client._ray_cluster["spec"]["workerGroupSpecs"][0][ + "replicas" + ] = desired_replicas + + # Terminate a node. The replicas field should be decremented by 1, even though + # the cluster state has not yet reached the goal state. + self.provider.terminate(ids=[pod_to_delete], request_id="term-1") + patches = self.mock_client.get_patches( + f"rayclusters/{self.provider._cluster_name}" + ) + assert len(patches) == 2 + assert patches == [ + { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/replicas", + "value": desired_replicas - 1, + }, + { + "op": "replace", + "path": "/spec/workerGroupSpecs/0/scaleStrategy", + "value": { + "workersToDelete": [ + pod_to_delete, + ] + }, + }, + ] + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): diff --git a/python/ray/autoscaler/v2/tests/test_scheduler.py b/python/ray/autoscaler/v2/tests/test_scheduler.py index e6d6cb71978d..3a188bdaf2ce 100644 --- a/python/ray/autoscaler/v2/tests/test_scheduler.py +++ b/python/ray/autoscaler/v2/tests/test_scheduler.py @@ -1434,6 +1434,82 @@ def test_idle_termination_with_min_worker(min_workers): assert len(to_terminate) == 0 +@pytest.mark.parametrize("node_type_idle_timeout_s", [1, 2, 10]) +def test_idle_termination_with_node_type_idle_timeout(node_type_idle_timeout_s): + """ + Test that idle nodes are terminated when idle_timeout_s is set for node type. 
+ """ + scheduler = ResourceDemandScheduler(event_logger) + + node_type_configs = { + "type_cpu_with_idle_timeout": NodeTypeConfig( + name="type_cpu", + resources={"CPU": 1}, + min_worker_nodes=0, + max_worker_nodes=5, + idle_timeout_s=node_type_idle_timeout_s, + launch_config_hash="hash1", + ), + } + + idle_time_s = 5 + constraints = [] + + request = sched_request( + node_type_configs=node_type_configs, + instances=[ + make_autoscaler_instance( + im_instance=Instance( + instance_type="type_cpu_with_idle_timeout", + status=Instance.RAY_RUNNING, + launch_config_hash="hash1", + instance_id="i-1", + node_id="r-1", + ), + ray_node=NodeState( + node_id=b"r-1", + ray_node_type_name="type_cpu_with_idle_timeout", + available_resources={"CPU": 0}, + total_resources={"CPU": 1}, + idle_duration_ms=0, # Non idle + status=NodeStatus.RUNNING, + ), + cloud_instance_id="c-1", + ), + make_autoscaler_instance( + im_instance=Instance( + instance_id="i-2", + instance_type="type_cpu_with_idle_timeout", + status=Instance.RAY_RUNNING, + launch_config_hash="hash1", + node_id="r-2", + ), + ray_node=NodeState( + ray_node_type_name="type_cpu_with_idle_timeout", + node_id=b"r-2", + available_resources={"CPU": 1}, + total_resources={"CPU": 1}, + idle_duration_ms=idle_time_s * 1000, + status=NodeStatus.IDLE, + ), + cloud_instance_id="c-2", + ), + ], + # Set autoscaler idle_timeout_s to a value greater than + # node_type_idle_timeout_s and idle_time_s. + idle_timeout_s=idle_time_s * 1000, + cluster_resource_constraints=constraints, + ) + + reply = scheduler.schedule(request) + _, to_terminate = _launch_and_terminate(reply) + if node_type_idle_timeout_s <= idle_time_s: + assert len(to_terminate) == 1 + assert to_terminate == [("i-2", "r-2", TerminationRequest.Cause.IDLE)] + else: + assert len(to_terminate) == 0 + + def test_gang_scheduling(): """ Test that gang scheduling works. diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py index acec6c2672cd..7c6160d8937d 100644 --- a/python/ray/dag/compiled_dag_node.py +++ b/python/ray/dag/compiled_dag_node.py @@ -185,7 +185,7 @@ def do_profile_tasks( """ try: for task in tasks: - task.prepare() + task.prepare(overlap_gpu_communication=overlap_gpu_communication) if not hasattr(self, "__ray_adag_events"): self.__ray_adag_events = [] @@ -1880,7 +1880,7 @@ def wait_teardown(self, kill_actors: bool = False): from ray.dag import DAGContext ctx = DAGContext.get_current() - teardown_timeout = ctx.retrieval_timeout + teardown_timeout = ctx.teardown_timeout for actor, ref in outer.worker_task_refs.items(): timeout = False @@ -2443,7 +2443,14 @@ def teardown(self, kill_actors: bool = False): monitor = getattr(self, "_monitor", None) if monitor is not None: + from ray.dag import DAGContext + + ctx = DAGContext.get_current() monitor.teardown(kill_actors=kill_actors) + monitor.join(timeout=ctx.teardown_timeout) + # We do not log a warning here if the thread is still alive because + # wait_teardown already logs upon teardown_timeout. + self._is_teardown = True def __del__(self): diff --git a/python/ray/dag/context.py b/python/ray/dag/context.py index 29e1d5bf2c78..531785c50262 100644 --- a/python/ray/dag/context.py +++ b/python/ray/dag/context.py @@ -10,6 +10,7 @@ DEFAULT_EXECUTION_TIMEOUT_S = int(os.environ.get("RAY_DAG_execution_timeout", 10)) DEFAULT_RETRIEVAL_TIMEOUT_S = int(os.environ.get("RAY_DAG_retrieval_timeout", 10)) +DEFAULT_TEARDOWN_TIMEOUT_S = int(os.environ.get("RAY_DAG_teardown_timeout", 30)) # Default buffer size is 1MB. 
DEFAULT_BUFFER_SIZE_BYTES = int(os.environ.get("RAY_DAG_buffer_size_bytes", 1e6)) # Default asyncio_max_queue_size is 0, which means no limit. @@ -51,6 +52,8 @@ class DAGContext: calls. retrieval_timeout: The maximum time in seconds to wait to retrieve a result from the DAG. + teardown_timeout: The maximum time in seconds to wait for the DAG to + cleanly shut down. buffer_size_bytes: The maximum size of messages that can be passed between tasks in the DAG. asyncio_max_queue_size: The max queue size for the async execution. @@ -72,6 +75,7 @@ class DAGContext: execution_timeout: int = DEFAULT_EXECUTION_TIMEOUT_S retrieval_timeout: int = DEFAULT_RETRIEVAL_TIMEOUT_S + teardown_timeout: int = DEFAULT_TEARDOWN_TIMEOUT_S buffer_size_bytes: int = DEFAULT_BUFFER_SIZE_BYTES asyncio_max_queue_size: int = DEFAULT_ASYNCIO_MAX_QUEUE_SIZE max_buffered_results: int = DEFAULT_MAX_BUFFERED_RESULTS diff --git a/python/ray/dag/tests/experimental/test_accelerated_dag.py b/python/ray/dag/tests/experimental/test_accelerated_dag.py index d6176fd57dc6..cbec80a871c4 100644 --- a/python/ray/dag/tests/experimental/test_accelerated_dag.py +++ b/python/ray/dag/tests/experimental/test_accelerated_dag.py @@ -1078,6 +1078,12 @@ def test_dag_exception_chained(ray_start_regular, capsys): # Can use the DAG after exceptions are thrown. assert ray.get(compiled_dag.execute(1)) == 2 + # Note: somehow the auto triggered teardown() from ray.shutdown() + # does not finish in time for this test, leading to a segfault + # of the following test (likely due to a dangling monitor thread + # upon the new Ray init). + compiled_dag.teardown() + @pytest.mark.parametrize("single_fetch", [True, False]) def test_dag_exception_multi_output(ray_start_regular, single_fetch, capsys): diff --git a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py index d1ac1c68063f..1797068e7e2d 100644 --- a/python/ray/dag/tests/experimental/test_torch_tensor_dag.py +++ b/python/ray/dag/tests/experimental/test_torch_tensor_dag.py @@ -182,7 +182,11 @@ def test_torch_tensor_as_dag_input(ray_start_regular): @pytest.mark.parametrize("ray_start_regular", [{"num_cpus": 4}], indirect=True) -def test_torch_tensor_nccl(ray_start_regular): +@pytest.mark.parametrize("enable_profiling", [False, True]) +@pytest.mark.parametrize("overlap_gpu_communication", [False, True]) +def test_torch_tensor_nccl( + ray_start_regular, monkeypatch, enable_profiling, overlap_gpu_communication +): if not USE_GPU: pytest.skip("NCCL tests require GPUs") @@ -190,6 +194,10 @@ def test_torch_tensor_nccl(ray_start_regular): sum(node["Resources"].get("GPU", 0) for node in ray.nodes()) > 1 ), "This test requires at least 2 GPUs" + monkeypatch.setattr( + ray.dag.constants, "RAY_ADAG_ENABLE_PROFILING", enable_profiling + ) + actor_cls = TorchTensorWorker.options(num_cpus=0, num_gpus=1) sender = actor_cls.remote() @@ -204,7 +212,9 @@ def test_torch_tensor_nccl(ray_start_regular): dag = dag.with_type_hint(TorchTensorType(transport="nccl")) dag = receiver.recv.bind(dag) - compiled_dag = dag.experimental_compile() + compiled_dag = dag.experimental_compile( + _overlap_gpu_communication=overlap_gpu_communication + ) # Test that we can pass different shapes and data. 
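A condensed sketch of what the new parametrization exercises: compiling an NCCL-backed DAG with overlap enabled. The `Worker` actor here is a hypothetical stand-in for `TorchTensorWorker`, the import paths are assumed from this era of Ray, and running it needs two NCCL-capable GPUs:

import ray
import torch
from ray.dag import InputNode
from ray.experimental.channel.torch_tensor_type import TorchTensorType


@ray.remote(num_gpus=1)
class Worker:
    def send(self, shape):
        return torch.ones(shape, device="cuda")

    def recv(self, t):
        return t.sum().item()


sender, receiver = Worker.remote(), Worker.remote()
with InputNode() as inp:
    dag = sender.send.bind(inp)
    dag = dag.with_type_hint(TorchTensorType(transport="nccl"))
    dag = receiver.recv.bind(dag)

# Overlap of GPU communication with compute is opt-in (defaults to False).
compiled_dag = dag.experimental_compile(_overlap_gpu_communication=True)
print(ray.get(compiled_dag.execute((2, 2))))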
for i in range(3): diff --git a/python/ray/dashboard/client/src/App.tsx b/python/ray/dashboard/client/src/App.tsx index bc30e06ccc0c..84b4a2c88e7b 100644 --- a/python/ray/dashboard/client/src/App.tsx +++ b/python/ray/dashboard/client/src/App.tsx @@ -23,7 +23,12 @@ import { StateApiLogViewerPage, } from "./pages/log/Logs"; import { Metrics } from "./pages/metrics"; -import { DashboardUids, getMetricsInfo } from "./pages/metrics/utils"; +import { + DashboardUids, + getMetricsInfo, + getTimeZoneInfo, + TimezoneInfo, +} from "./pages/metrics/utils"; import Nodes, { ClusterMainPageLayout } from "./pages/node"; import { ClusterDetailInfoPage } from "./pages/node/ClusterDetailInfoPage"; import { ClusterLayout } from "./pages/node/ClusterLayout"; @@ -91,6 +96,14 @@ export type GlobalContextType = { * The name of the current selected datasource. */ dashboardDatasource: string | undefined; + /** + * The timezone set on the ray cluster. + */ + serverTimeZone: TimezoneInfo | null | undefined; + /** + * The globally selected current time zone. + */ + currentTimeZone: string | undefined; }; export const GlobalContext = React.createContext({ nodeMap: {}, @@ -102,10 +115,15 @@ export const GlobalContext = React.createContext({ prometheusHealth: undefined, sessionName: undefined, dashboardDatasource: undefined, + serverTimeZone: undefined, + currentTimeZone: undefined, }); const App = () => { - const [context, setContext] = useState({ + const [currentTimeZone, setCurrentTimeZone] = useState(); + const [context, setContext] = useState< + Omit + >({ nodeMap: {}, nodeMapByIp: {}, namespaceMap: {}, @@ -115,6 +133,7 @@ const App = () => { prometheusHealth: undefined, sessionName: undefined, dashboardDatasource: undefined, + serverTimeZone: undefined, }); useEffect(() => { getNodeList().then((res) => { @@ -158,11 +177,36 @@ const App = () => { doEffect(); }, []); + useEffect(() => { + const updateTimezone = async () => { + // Sets the intial timezone to localStorage value if it exists + const storedTimeZone = localStorage.getItem("timezone"); + if (storedTimeZone) { + setCurrentTimeZone(storedTimeZone); + } + + // Fetch the server time zone. 
+ const tzInfo = await getTimeZoneInfo(); + + const timeZone = + storedTimeZone || + tzInfo?.value || + Intl.DateTimeFormat().resolvedOptions().timeZone; + + setCurrentTimeZone(timeZone); + setContext((existingContext) => ({ + ...existingContext, + serverTimeZone: tzInfo, + })); + }; + updateTimezone(); + }, []); + return ( - + diff --git a/python/ray/dashboard/client/src/common/formatUtils.ts b/python/ray/dashboard/client/src/common/formatUtils.ts index 03610fd82553..4b0c9a24a7d1 100644 --- a/python/ray/dashboard/client/src/common/formatUtils.ts +++ b/python/ray/dashboard/client/src/common/formatUtils.ts @@ -1,4 +1,9 @@ import dayjs from "dayjs"; +import timezone from "dayjs/plugin/timezone"; +import utc from "dayjs/plugin/utc"; + +dayjs.extend(utc); +dayjs.extend(timezone); export const formatByteAmount = ( amount: number, @@ -60,5 +65,10 @@ export const formatValue = (rawFloat: number) => { } }; -export const formatDateFromTimeMs = (time: number) => - dayjs(time).format("YYYY/MM/DD HH:mm:ss"); +export const formatTimeZone = (UTC: string) => { + dayjs.tz.setDefault(UTC); +}; + +export const formatDateFromTimeMs = (time: number) => { + return dayjs.utc(time).tz().format("YYYY/MM/DD HH:mm:ss"); +}; diff --git a/python/ray/dashboard/client/src/common/timezone.ts b/python/ray/dashboard/client/src/common/timezone.ts new file mode 100644 index 000000000000..d02b785550fe --- /dev/null +++ b/python/ray/dashboard/client/src/common/timezone.ts @@ -0,0 +1,794 @@ +export const timezones = [ + { + utc: "GMT-12:00", + value: "Etc/GMT+12", + group: "Pacific", + country: "International Date Line West", + }, + { + utc: "GMT-11:00", + value: "Pacific/Pago_Pago", + group: "Pacific", + country: "American Samoa", + }, + { + utc: "GMT-11:00", + value: "Pacific/Midway", + group: "Pacific", + country: "Midway Island", + }, + { + utc: "GMT-10:00", + value: "Pacific/Honolulu", + group: "Pacific", + country: "Hawaii", + }, + { + utc: "GMT-09:00", + value: "America/Anchorage", + group: "America", + country: "Alaska", + }, + { + utc: "GMT-08:00", + value: "America/Los_Angeles", + group: "America", + country: "Pacific Time (US & Canada)", + }, + { + utc: "GMT-08:00", + value: "America/Tijuana", + group: "America", + country: "Tijuana", + }, + { + utc: "GMT-07:00", + value: "America/Phoenix", + group: "America", + country: "Arizona", + }, + { + utc: "GMT-07:00", + value: "America/Mazatlan", + group: "America", + country: "Mazatlan", + }, + { + utc: "GMT-07:00", + value: "America/Denver", + group: "America", + country: "Mountain Time (US & Canada)", + }, + { + utc: "GMT-06:00", + value: "America/Guatemala", + group: "America", + country: "Central America", + }, + { + utc: "GMT-06:00", + value: "America/Chicago", + group: "America", + country: "Central Time (US & Canada)", + }, + { + utc: "GMT-06:00", + value: "America/Chihuahua", + group: "America", + country: "Chihuahua", + }, + { + utc: "GMT-06:00", + value: "America/Guadalajara", + group: "America", + country: "Guadalajara", + }, + { + utc: "GMT-06:00", + value: "America/Mexico_City", + group: "America", + country: "Mexico City", + }, + { + utc: "GMT-06:00", + value: "America/Monterrey", + group: "America", + country: "Monterrey", + }, + { + utc: "GMT-06:00", + value: "America/Regina", + group: "America", + country: "Saskatchewan", + }, + { + utc: "GMT-05:00", + value: "America/Bogota", + group: "America", + country: "Bogota", + }, + { + utc: "GMT-05:00", + value: "America/New_York", + group: "America", + country: "Eastern Time (US & Canada)", + }, + { + utc: 
"GMT-05:00", + value: "America/Indiana/Indianapolis", + group: "America", + country: "Indiana (East)", + }, + { + utc: "GMT-05:00", + value: "America/Lima", + group: "America", + country: "Lima", + }, + { + utc: "GMT-05:00", + value: "America/Guayaquil", + group: "America", + country: "Quito", + }, + { + utc: "GMT-04:00", + value: "America/Halifax", + group: "America", + country: "Atlantic Time (Canada)", + }, + { + utc: "GMT-04:00", + value: "America/Caracas", + group: "America", + country: "Caracas", + }, + { + utc: "GMT-04:00", + value: "America/Guyana", + group: "America", + country: "Georgetown", + }, + { + utc: "GMT-04:00", + value: "America/La_Paz", + group: "America", + country: "La Paz", + }, + { + utc: "GMT-04:00", + value: "America/Puerto_Rico", + group: "America", + country: "Puerto Rico", + }, + { + utc: "GMT-04:00", + value: "America/Santiago", + group: "America", + country: "Santiago", + }, + { + utc: "GMT-03:30", + value: "America/St_Johns", + group: "America", + country: "Newfoundland", + }, + { + utc: "GMT-03:00", + value: "America/Sao_Paulo", + group: "America", + country: "Brasilia", + }, + { + utc: "GMT-03:00", + value: "America/Argentina/Buenos_Aires", + group: "America", + country: "Buenos Aires", + }, + { + utc: "GMT-03:00", + value: "America/Montevideo", + group: "America", + country: "Montevideo", + }, + { + utc: "GMT-02:00", + value: "America/Godthab", + group: "America", + country: "Greenland", + }, + { + utc: "GMT-02:00", + value: "Etc/GMT+2", + group: "Atlantic", + country: "Mid-Atlantic", + }, + { + utc: "GMT-01:00", + value: "Atlantic/Azores", + group: "Atlantic", + country: "Azores", + }, + { + utc: "GMT-01:00", + value: "Atlantic/Cape_Verde", + group: "Atlantic", + country: "Cape Verde Is.", + }, + { + utc: "GMT+00:00", + value: "Europe/London", + group: "Europe", + country: "Edinburgh", + }, + { + utc: "GMT+00:00", + value: "Europe/Lisbon", + group: "Europe", + country: "Lisbon", + }, + { + utc: "GMT+00:00", + value: "Europe/London", + group: "Europe", + country: "London", + }, + { + utc: "GMT+00:00", + value: "Africa/Monrovia", + group: "Africa", + country: "Monrovia", + }, + { + utc: "GMT+01:00", + value: "Europe/Amsterdam", + group: "Europe", + country: "Amsterdam", + }, + { + utc: "GMT+01:00", + value: "Europe/Belgrade", + group: "Europe", + country: "Belgrade", + }, + { + utc: "GMT+01:00", + value: "Europe/Berlin", + group: "Europe", + country: "Berlin", + }, + { + utc: "GMT+01:00", + value: "Europe/Brussels", + group: "Europe", + country: "Brussels", + }, + { + utc: "GMT+01:00", + value: "Europe/Budapest", + group: "Europe", + country: "Budapest", + }, + { + utc: "GMT+01:00", + value: "Europe/Copenhagen", + group: "Europe", + country: "Copenhagen", + }, + { + utc: "GMT+01:00", + value: "Europe/Madrid", + group: "Europe", + country: "Madrid", + }, + { + utc: "GMT+01:00", + value: "Europe/Paris", + group: "Europe", + country: "Paris", + }, + { + utc: "GMT+01:00", + value: "Europe/Prague", + group: "Europe", + country: "Prague", + }, + { + utc: "GMT+01:00", + value: "Europe/Rome", + group: "Europe", + country: "Rome", + }, + { + utc: "GMT+01:00", + value: "Europe/Sarajevo", + group: "Europe", + country: "Sarajevo", + }, + { + utc: "GMT+01:00", + value: "Europe/Stockholm", + group: "Europe", + country: "Stockholm", + }, + { + utc: "GMT+01:00", + value: "Europe/Vienna", + group: "Europe", + country: "Vienna", + }, + { + utc: "GMT+01:00", + value: "Europe/Warsaw", + group: "Europe", + country: "Warsaw", + }, + { + utc: "GMT+01:00", + value: 
"Africa/Lagos", + group: "Africa", + country: "West Central Africa", + }, + { + utc: "GMT+02:00", + value: "Asia/Amman", + group: "Asia", + country: "Amman", + }, + { + utc: "GMT+02:00", + value: "Europe/Athens", + group: "Europe", + country: "Athens", + }, + { + utc: "GMT+02:00", + value: "Asia/Beirut", + group: "Asia", + country: "Beirut", + }, + { + utc: "GMT+02:00", + value: "Europe/Bucharest", + group: "Europe", + country: "Bucharest", + }, + { + utc: "GMT+02:00", + value: "Africa/Cairo", + group: "Africa", + country: "Cairo", + }, + { + utc: "GMT+02:00", + value: "Africa/Harare", + group: "Africa", + country: "Harare", + }, + { + utc: "GMT+02:00", + value: "Europe/Helsinki", + group: "Europe", + country: "Helsinki", + }, + { + utc: "GMT+02:00", + value: "Europe/Istanbul", + group: "Europe", + country: "Istanbul", + }, + { + utc: "GMT+02:00", + value: "Asia/Jerusalem", + group: "Asia", + country: "Jerusalem", + }, + { + utc: "GMT+02:00", + value: "Europe/Kiev", + group: "Europe", + country: "Kyiv", + }, + { + utc: "GMT+02:00", + value: "Europe/Minsk", + group: "Europe", + country: "Minsk", + }, + { + utc: "GMT+02:00", + value: "Europe/Riga", + group: "Europe", + country: "Riga", + }, + { + utc: "GMT+02:00", + value: "Europe/Sofia", + group: "Europe", + country: "Sofia", + }, + { + utc: "GMT+02:00", + value: "Europe/Tallinn", + group: "Europe", + country: "Tallinn", + }, + { + utc: "GMT+02:00", + value: "Europe/Vilnius", + group: "Europe", + country: "Vilnius", + }, + { + utc: "GMT+03:00", + value: "Asia/Baghdad", + group: "Asia", + country: "Baghdad", + }, + { + utc: "GMT+03:00", + value: "Asia/Kuwait", + group: "Asia", + country: "Kuwait", + }, + { + utc: "GMT+03:00", + value: "Europe/Moscow", + group: "Europe", + country: "Moscow", + }, + { + utc: "GMT+03:00", + value: "Africa/Nairobi", + group: "Africa", + country: "Nairobi", + }, + { + utc: "GMT+03:00", + value: "Asia/Riyadh", + group: "Asia", + country: "Riyadh", + }, + { + utc: "GMT+03:30", + value: "Asia/Tehran", + group: "Asia", + country: "Tehran", + }, + { + utc: "GMT+04:00", + value: "Asia/Dubai", + group: "Asia", + country: "Abu Dhabi", + }, + { + utc: "GMT+04:00", + value: "Asia/Baku", + group: "Asia", + country: "Baku", + }, + { + utc: "GMT+04:00", + value: "Asia/Muscat", + group: "Asia", + country: "Muscat", + }, + { + utc: "GMT+04:00", + value: "Asia/Tbilisi", + group: "Asia", + country: "Tbilisi", + }, + { + utc: "GMT+04:00", + value: "Asia/Yerevan", + group: "Asia", + country: "Yerevan", + }, + { + utc: "GMT+04:30", + value: "Asia/Kabul", + group: "Asia", + country: "Kabul", + }, + { + utc: "GMT+05:00", + value: "Asia/Karachi", + group: "Asia", + country: "Islamabad", + }, + { + utc: "GMT+05:00", + value: "Asia/Tashkent", + group: "Asia", + country: "Tashkent", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "Chennai", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "Kolkata", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "Mumbai", + }, + { + utc: "GMT+05:30", + value: "Asia/Kolkata", + group: "Asia", + country: "New Delhi", + }, + { + utc: "GMT+05:45", + value: "Asia/Kathmandu", + group: "Asia", + country: "Kathmandu", + }, + { + utc: "GMT+06:00", + value: "Asia/Almaty", + group: "Asia", + country: "Almaty", + }, + { + utc: "GMT+06:00", + value: "Asia/Dhaka", + group: "Asia", + country: "Dhaka", + }, + { + utc: "GMT+06:00", + value: "Asia/Yekaterinburg", + group: "Asia", + country: "Yekaterinburg", + }, + { + utc: 
"GMT+06:30", + value: "Asia/Yangon", + group: "Asia", + country: "Yangon (Rangoon)", + }, + { + utc: "GMT+07:00", + value: "Asia/Bangkok", + group: "Asia", + country: "Bangkok", + }, + { + utc: "GMT+07:00", + value: "Asia/Hanoi", + group: "Asia", + country: "Hanoi", + }, + { + utc: "GMT+07:00", + value: "Asia/Jakarta", + group: "Asia", + country: "Jakarta", + }, + { + utc: "GMT+07:00", + value: "Asia/Novosibirsk", + group: "Asia", + country: "Novosibirsk", + }, + { + utc: "GMT+08:00", + value: "Asia/Shanghai", + group: "Asia", + country: "Beijing", + }, + { + utc: "GMT+08:00", + value: "Asia/Chongqing", + group: "Asia", + country: "Chongqing", + }, + { + utc: "GMT+08:00", + value: "Asia/Hong_Kong", + group: "Asia", + country: "Hong Kong", + }, + { + utc: "GMT+08:00", + value: "Asia/Krasnoyarsk", + group: "Asia", + country: "Krasnoyarsk", + }, + { + utc: "GMT+08:00", + value: "Asia/Kuala_Lumpur", + group: "Asia", + country: "Kuala Lumpur", + }, + { + utc: "GMT+08:00", + value: "Australia/Perth", + group: "Australia", + country: "Perth", + }, + { + utc: "GMT+08:00", + value: "Asia/Singapore", + group: "Asia", + country: "Singapore", + }, + { + utc: "GMT+08:00", + value: "Asia/Taipei", + group: "Asia", + country: "Taipei", + }, + { + utc: "GMT+08:00", + value: "Asia/Ulaanbaatar", + group: "Asia", + country: "Ulaan Bataar", + }, + { + utc: "GMT+08:00", + value: "Asia/Urumqi", + group: "Asia", + country: "Urumqi", + }, + { + utc: "GMT+09:00", + value: "Asia/Irkutsk", + group: "Asia", + country: "Irkutsk", + }, + { + utc: "GMT+09:00", + value: "Asia/Tokyo", + group: "Asia", + country: "Osaka", + }, + { + utc: "GMT+09:00", + value: "Asia/Tokyo", + group: "Asia", + country: "Sapporo", + }, + { + utc: "GMT+09:00", + value: "Asia/Seoul", + group: "Asia", + country: "Seoul", + }, + { + utc: "GMT+09:00", + value: "Asia/Tokyo", + group: "Asia", + country: "Tokyo", + }, + { + utc: "GMT+09:30", + value: "Australia/Adelaide", + group: "Australia", + country: "Adelaide", + }, + { + utc: "GMT+09:30", + value: "Australia/Darwin", + group: "Australia", + country: "Darwin", + }, + { + utc: "GMT+10:00", + value: "Australia/Brisbane", + group: "Australia", + country: "Brisbane", + }, + { + utc: "GMT+10:00", + value: "Australia/Sydney", + group: "Australia", + country: "Canberra", + }, + { + utc: "GMT+10:00", + value: "Pacific/Guam", + group: "Pacific", + country: "Guam", + }, + { + utc: "GMT+10:00", + value: "Australia/Hobart", + group: "Australia", + country: "Hobart", + }, + { + utc: "GMT+10:00", + value: "Australia/Melbourne", + group: "Australia", + country: "Melbourne", + }, + { + utc: "GMT+10:00", + value: "Pacific/Port_Moresby", + group: "Pacific", + country: "Port Moresby", + }, + { + utc: "GMT+10:00", + value: "Australia/Sydney", + group: "Australia", + country: "Sydney", + }, + { + utc: "GMT+11:00", + value: "Asia/Magadan", + group: "Asia", + country: "Magadan", + }, + { + utc: "GMT+11:00", + value: "Pacific/Noumea", + group: "Pacific", + country: "New Caledonia", + }, + { + utc: "GMT+11:00", + value: "Pacific/Guadalcanal", + group: "Pacific", + country: "Solomon Is.", + }, + { + utc: "GMT+12:00", + value: "Pacific/Auckland", + group: "Pacific", + country: "Auckland", + }, + { + utc: "GMT+12:00", + value: "Pacific/Fiji", + group: "Pacific", + country: "Fiji", + }, + { + utc: "GMT+12:00", + value: "Asia/Kamchatka", + group: "Asia", + country: "Kamchatka", + }, + { + utc: "GMT+12:00", + value: "Pacific/Majuro", + group: "Pacific", + country: "Marshall Is.", + }, + { + utc: "GMT+12:00", + value: 
"Pacific/Auckland", + group: "Pacific", + country: "Wellington", + }, + { + utc: "GMT+13:00", + value: "Pacific/Tongatapu", + group: "Pacific", + country: "Nuku'alofa", + }, + { + utc: "GMT+13:00", + value: "Pacific/Apia", + group: "Pacific", + country: "Samoa", + }, + { + utc: "GMT+13:00", + value: "Pacific/Fakaofo", + group: "Pacific", + country: "Tokelau Is.", + }, +]; diff --git a/python/ray/dashboard/client/src/components/DataOverviewTable.tsx b/python/ray/dashboard/client/src/components/DataOverviewTable.tsx index c52bdee0c28b..c3538ba9ccd4 100644 --- a/python/ray/dashboard/client/src/components/DataOverviewTable.tsx +++ b/python/ray/dashboard/client/src/components/DataOverviewTable.tsx @@ -193,7 +193,7 @@ const DataRow = ({ {isDatasetRow && datasetMetrics.dataset} - {isOperatorRow && operatorMetrics.operator} + {isOperatorRow && operatorMetrics.name} ); }; + +export const SearchTimezone = ({ + serverTimeZone, + currentTimeZone, +}: { + serverTimeZone?: TimezoneInfo | null; + currentTimeZone?: string; +}) => { + const [timezone, setTimezone] = useState(""); + + useEffect(() => { + if (currentTimeZone !== undefined) { + formatTimeZone(currentTimeZone); + setTimezone(currentTimeZone); + } + }, [currentTimeZone]); + + const handleTimezoneChange = (value: string) => { + localStorage.setItem("timezone", value); + window.location.reload(); + }; + + const options = timezones + .map((x) => x) // Create a copy + .sort((a, b) => a.group.localeCompare(b.group)); + options.unshift({ + value: "Etc/UTC", + utc: "GMT+00:00", + group: "System", + country: "Coordinated Universal Time", + }); + + const browserTimezone = Intl.DateTimeFormat().resolvedOptions().timeZone; + + const browserOffset = (() => { + const offset = new Date().getTimezoneOffset(); + const sign = offset < 0 ? "+" : "-"; + const hours = Math.abs(Math.floor(offset / 60)) + .toString() + .padStart(2, "0"); + const minutes = Math.abs(offset % 60) + .toString() + .padStart(2, "0"); + return `GMT${sign}${hours}:${minutes}`; + })(); + + if (browserOffset) { + options.unshift({ + value: browserTimezone, + utc: browserOffset, + group: "System", + country: "Browser Time", + }); + } + + const serverUtc = + serverTimeZone?.value && + timezones.find((t) => t.value === serverTimeZone.value)?.utc; + if (serverUtc) { + options.unshift({ + value: serverTimeZone.value, + utc: serverUtc, + group: "System", + country: "Dashboard Server Timezone", + }); + } + + const curUtc = timezones.find((t) => t.value === timezone)?.utc; + return ( + { + if (newValue) { + handleTimezoneChange(newValue.value); + } + }} + options={options} + getOptionLabel={(option) => option.value} + groupBy={(option) => option.group} + filterOptions={(options, { inputValue }) => + options.filter( + (item) => + item.value.includes(inputValue) || + item.utc.includes(inputValue) || + item.country.toLowerCase().includes(inputValue.toLowerCase()) || + item.group.toLowerCase().includes(inputValue.toLowerCase()), + ) + } + renderOption={(props, option) => ( + + + {option.country} + + + {option.value} + + + + {option.utc} + + + )} + renderInput={(params) => ( + + )} + renderGroup={(params) => ( +
+ + {params.group} + + + {params.children} + + +
  • + )} + slotProps={{ + paper: { + style: { + width: "400px", + }, + }, + popper: { + placement: "bottom-end", + style: { + width: "fit-content", + }, + }, + }} + /> + ); +}; diff --git a/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx b/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx index 60e8fc8ec8ac..22cb1e030351 100644 --- a/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx +++ b/python/ray/dashboard/client/src/pages/actor/ActorDetail.tsx @@ -133,6 +133,15 @@ const ActorDetailPage = () => { } : { value: "-" }, }, + { + label: "PID", + content: actorDetail.pid + ? { + value: `${actorDetail.pid}`, + copyableValue: `${actorDetail.pid}`, + } + : { value: "-" }, + }, { label: "Started at", content: { diff --git a/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx b/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx index 12cd19792928..107cd2037cfc 100644 --- a/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/data/DataOverview.component.test.tsx @@ -36,6 +36,7 @@ describe("DataOverview", () => { operators: [ { operator: "test_ds1_op1", + name: "test_ds1_op", state: "RUNNING", progress: 99, total: 101, @@ -104,11 +105,11 @@ describe("DataOverview", () => { expect(screen.getByText("70/80")).toBeVisible(); // Operator dropdown - expect(screen.queryByText("test_ds1_op1")).toBeNull(); + expect(screen.queryByText("test_ds1_op")).toBeNull(); await user.click(screen.getByTitle("Expand Dataset test_ds1")); - expect(screen.getByText("test_ds1_op1")).toBeVisible(); + expect(screen.getByText("test_ds1_op")).toBeVisible(); await user.click(screen.getByTitle("Collapse Dataset test_ds1")); - expect(screen.queryByText("test_ds1_op1")).toBeNull(); + expect(screen.queryByText("test_ds1_op")).toBeNull(); // Second Dataset expect(screen.getByText("test_ds2")).toBeVisible(); diff --git a/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx b/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx index 256c830cd29f..ea261a454b75 100644 --- a/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx +++ b/python/ray/dashboard/client/src/pages/layout/MainNavLayout.tsx @@ -3,6 +3,7 @@ import React, { useContext } from "react"; import { RiBookMarkLine, RiFeedbackLine } from "react-icons/ri/"; import { Outlet, Link as RouterLink } from "react-router-dom"; import { GlobalContext } from "../../App"; +import { SearchTimezone } from "../../components/SearchComponent"; import Logo from "../../logo.svg"; import { MainNavContext, useMainNavState } from "./mainNavContext"; @@ -106,7 +107,8 @@ const NAV_ITEMS = [ const MainNavBar = () => { const { mainNavPageHierarchy } = useContext(MainNavContext); const rootRouteId = mainNavPageHierarchy[0]?.id; - const { metricsContextLoaded, grafanaHost } = useContext(GlobalContext); + const { metricsContextLoaded, grafanaHost, serverTimeZone, currentTimeZone } = + useContext(GlobalContext); let navItems = NAV_ITEMS; if (!metricsContextLoaded || grafanaHost === "DISABLED") { @@ -179,6 +181,17 @@ const MainNavBar = () => {
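A note on the sign flip in the browserOffset helper of the SearchTimezone component above: JavaScript's Date.getTimezoneOffset() returns minutes behind UTC (west-positive), so a negative value means the zone is ahead of GMT. Python's time.timezone uses the same west-positive convention, which makes for a compact sketch of the equivalent label computation (illustrative only):

```python
import time

# time.timezone is the non-DST offset in seconds *west* of UTC, so the
# display sign has to be inverted, exactly as the TSX helper does.
west_seconds = time.timezone
sign = "+" if west_seconds <= 0 else "-"
hours, rem = divmod(abs(west_seconds), 3600)
print(f"GMT{sign}{hours:02d}:{rem // 60:02d}")
```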
    + + + + + ); }; diff --git a/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx b/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx index c2cf646b743d..a7e3e06f0cea 100644 --- a/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/metrics/Metrics.component.test.tsx @@ -22,6 +22,8 @@ const Wrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} @@ -47,6 +49,8 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx b/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx index fd542e534cd1..99d459a4fe54 100644 --- a/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx +++ b/python/ray/dashboard/client/src/pages/metrics/Metrics.tsx @@ -571,8 +571,8 @@ const MetricsSection = ({ dashboardUid, dashboardDatasource, }: MetricsSectionProps) => { - const { grafanaHost, sessionName } = useContext(GlobalContext); - + const { grafanaHost, sessionName, currentTimeZone } = + useContext(GlobalContext); return ( { const path = `/d-solo/${dashboardUid}?${pathParams}` + - `&${refreshParams}${timeRangeParams}&var-SessionName=${sessionName}&var-datasource=${dashboardDatasource}`; + `&${refreshParams}&timezone=${currentTimeZone}${timeRangeParams}&var-SessionName=${sessionName}&var-datasource=${dashboardDatasource}`; return ( { return await get(GRAFANA_HEALTHCHECK_URL); }; @@ -68,3 +74,21 @@ export const getMetricsInfo = async () => { return info; }; + +export type TimezoneInfo = { + offset: string; + value: string; +}; + +export const getTimeZoneInfo = async () => { + try { + const resp = await get(TIMEZONE_URL); + if (resp.data) { + return { + offset: resp.data.offset, + value: resp.data.value, + }; + } + } catch (e) {} + return null; +}; diff --git a/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx b/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx index a92ebb6f4701..9b0d432b9f92 100644 --- a/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/overview/OverviewPage.component.test.tsx @@ -84,6 +84,8 @@ const Wrapper = nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx b/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx index ab76f014f22f..a1c04c8047ab 100644 --- a/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx +++ b/python/ray/dashboard/client/src/pages/overview/cards/ClusterUtilizationCard.tsx @@ -20,11 +20,12 @@ export const ClusterUtilizationCard = ({ sessionName, dashboardUids, dashboardDatasource, + currentTimeZone, } = useContext(GlobalContext); const grafanaDefaultDashboardUid = dashboardUids?.default ?? 
"rayDefaultDashboard"; const path = `/d-solo/${grafanaDefaultDashboardUid}/default-dashboard?orgId=1&theme=light&panelId=41&var-datasource=${dashboardDatasource}`; - const timeRangeParams = "&from=now-30m&to=now"; + const timeRangeParams = "&from=now-1h&to=now"; if (!metricsContextLoaded || grafanaHost === "DISABLED") { return null; @@ -51,7 +52,7 @@ export const ClusterUtilizationCard = ({ component="iframe" title="Cluster Utilization" sx={{ flex: 1 }} - src={`${grafanaHost}${path}&refresh${timeRangeParams}&var-SessionName=${sessionName}`} + src={`${grafanaHost}${path}&refresh&timezone=${currentTimeZone}${timeRangeParams}&var-SessionName=${sessionName}`} frameBorder="0" /> { sessionName, dashboardUids, dashboardDatasource, + currentTimeZone, } = useContext(GlobalContext); const grafanaDefaultDashboardUid = dashboardUids?.default ?? "rayDefaultDashboard"; const path = `/d-solo/${grafanaDefaultDashboardUid}/default-dashboard?orgId=1&theme=light&panelId=24&var-datasource=${dashboardDatasource}`; - const timeRangeParams = "&from=now-30m&to=now"; + const timeRangeParams = "&from=now-1h&to=now"; if (!metricsContextLoaded || grafanaHost === "DISABLED") { return null; @@ -45,7 +46,7 @@ export const NodeCountCard = ({ className, sx }: NodeCountCardProps) => { component="iframe" title="Node Count" sx={{ flex: 1 }} - src={`${grafanaHost}${path}&refresh${timeRangeParams}&var-SessionName=${sessionName}`} + src={`${grafanaHost}${path}&refresh&timezone=${currentTimeZone}${timeRangeParams}&var-SessionName=${sessionName}`} frameBorder="0" /> )} diff --git a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx index d0629ed7832c..a1f552514043 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.component.test.tsx @@ -22,6 +22,8 @@ const Wrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} @@ -47,6 +49,8 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx index 33f62879a1d4..921f804e7fb6 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeDeploymentMetricsSection.tsx @@ -51,8 +51,13 @@ export const ServeReplicaMetricsSection = ({ className, sx, }: ServeDeploymentMetricsSectionProps) => { - const { grafanaHost, prometheusHealth, dashboardUids, dashboardDatasource } = - useContext(GlobalContext); + const { + grafanaHost, + prometheusHealth, + dashboardUids, + dashboardDatasource, + currentTimeZone, + } = useContext(GlobalContext); const grafanaServeDashboardUid = dashboardUids?.serveDeployment ?? 
"rayServeDashboard"; @@ -179,7 +184,7 @@ export const ServeReplicaMetricsSection = ({ {METRICS_CONFIG.map(({ title, pathParams }) => { const path = `/d-solo/${grafanaServeDashboardUid}?${pathParams}` + - `${refreshParams}${timeRangeParams}&var-Deployment=${encodeURIComponent( + `${refreshParams}&timezone=${currentTimeZone}${timeRangeParams}&var-Deployment=${encodeURIComponent( deploymentName, )}&var-Replica=${encodeURIComponent( replicaId, diff --git a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx index 6f5826087a63..c276d0fde417 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.component.test.tsx @@ -26,6 +26,8 @@ const Wrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} @@ -51,6 +53,8 @@ const MetricsDisabledWrapper = ({ children }: PropsWithChildren<{}>) => { nodeMapByIp: {}, namespaceMap: {}, dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }} > {children} diff --git a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx index efb3be529314..b03cfbcc56f8 100644 --- a/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx +++ b/python/ray/dashboard/client/src/pages/serve/ServeMetricsSection.tsx @@ -77,8 +77,13 @@ export const ServeMetricsSection = ({ metricsConfig, sx, }: ServeMetricsSectionProps) => { - const { grafanaHost, prometheusHealth, dashboardUids, dashboardDatasource } = - useContext(GlobalContext); + const { + grafanaHost, + prometheusHealth, + dashboardUids, + dashboardDatasource, + currentTimeZone, + } = useContext(GlobalContext); const grafanaServeDashboardUid = dashboardUids?.serve ?? 
"rayServeDashboard"; const [refreshOption, setRefreshOption] = useState( RefreshOptions.FIVE_SECONDS, @@ -196,7 +201,7 @@ export const ServeMetricsSection = ({ {metricsConfig.map(({ title, pathParams }) => { const path = `/d-solo/${grafanaServeDashboardUid}?${pathParams}` + - `${refreshParams}${timeRangeParams}&var-datasource=${dashboardDatasource}`; + `${refreshParams}&timezone=${currentTimeZone}${timeRangeParams}&var-datasource=${dashboardDatasource}`; return ( ) => { prometheusHealth: true, sessionName: "session-name", dashboardDatasource: "Prometheus", + serverTimeZone: undefined, + currentTimeZone: undefined, }; return ( diff --git a/python/ray/dashboard/http_server_head.py b/python/ray/dashboard/http_server_head.py index e1c427b1b288..078d4a97dd38 100644 --- a/python/ray/dashboard/http_server_head.py +++ b/python/ray/dashboard/http_server_head.py @@ -12,6 +12,7 @@ import ray import ray.dashboard.optional_utils as dashboard_optional_utils +import ray.dashboard.timezone_utils as timezone_utils import ray.dashboard.utils as dashboard_utils from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag from ray._private.utils import get_or_create_event_loop @@ -139,6 +140,18 @@ async def get_favicon(self, req) -> aiohttp.web.FileResponse: ) ) + @routes.get("/timezone") + async def get_timezone(self, req) -> aiohttp.web.Response: + try: + current_timezone = timezone_utils.get_current_timezone_info() + return aiohttp.web.json_response(current_timezone) + + except Exception as e: + logger.error(f"Error getting timezone: {e}") + return aiohttp.web.Response( + status=500, text="Internal Server Error:" + str(e) + ) + def get_address(self): assert self.http_host and self.http_port return self.http_host, self.http_port diff --git a/python/ray/dashboard/modules/data/tests/test_data_head.py b/python/ray/dashboard/modules/data/tests/test_data_head.py index 650079360a8b..c94a50878c95 100644 --- a/python/ray/dashboard/modules/data/tests/test_data_head.py +++ b/python/ray/dashboard/modules/data/tests/test_data_head.py @@ -32,6 +32,7 @@ ] + DATA_SCHEMA OPERATOR_SCHEMA = [ + "name", "operator", ] + DATA_SCHEMA @@ -64,12 +65,23 @@ def test_get_datasets(): operators = dataset["operators"] assert len(operators) == 2 op0 = operators[0] + op1 = operators[1] assert sorted(op0.keys()) == sorted(OPERATOR_SCHEMA) - assert op0["operator"] == "Input0" - assert op0["progress"] == 20 - assert op0["total"] == 20 - assert op0["state"] == "FINISHED" - assert operators[1]["operator"] == "ReadRange->MapBatches()1" + assert sorted(op1.keys()) == sorted(OPERATOR_SCHEMA) + assert { + "operator": "Input0", + "name": "Input", + "state": "FINISHED", + "progress": 20, + "total": 20, + }.items() <= op0.items() + assert { + "operator": "ReadRange->MapBatches()1", + "name": "ReadRange->MapBatches()", + "state": "FINISHED", + "progress": 20, + "total": 20, + }.items() <= op1.items() ds.map_batches(lambda x: x).materialize() data = requests.get(DATA_HEAD_URLS["GET"].format(job_id=job_id)).json() @@ -83,4 +95,4 @@ def test_get_datasets(): if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) + sys.exit(pytest.main(["-vv", __file__])) diff --git a/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py b/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py index a65050212950..ea0ff2459f65 100644 --- a/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py +++ b/python/ray/dashboard/modules/metrics/install_and_start_prometheus.py @@ -26,6 +26,9 @@ def get_system_info(): 
if architecture == "x86_64": # In the Prometheus filename, it's called amd64 architecture = "amd64" + elif architecture == "aarch64": + # In the Prometheus filename, it's called arm64 + architecture = "arm64" return os_type, architecture @@ -90,6 +93,7 @@ def start_prometheus(prometheus_dir): f"{prometheus_dir}/prometheus", "--config.file", str(config_file), + "--web.enable-lifecycle", ] try: process = subprocess.Popen(prometheus_cmd) @@ -104,6 +108,7 @@ def print_shutdown_message(process_id): message = ( f"Prometheus is running with PID {process_id}.\n" "To stop Prometheus, use the command: " + "`ray metrics shutdown-prometheus`, " f"'kill {process_id}', or if you need to force stop, " f"use 'kill -9 {process_id}'." ) diff --git a/python/ray/dashboard/modules/tests/test_metrics_integration.py b/python/ray/dashboard/modules/tests/test_metrics_integration.py index 7974b4a43cf9..0b125ca2dd5b 100644 --- a/python/ray/dashboard/modules/tests/test_metrics_integration.py +++ b/python/ray/dashboard/modules/tests/test_metrics_integration.py @@ -1,11 +1,14 @@ import subprocess import sys +import time import pytest +from click.testing import CliRunner from ray.dashboard.consts import PROMETHEUS_CONFIG_INPUT_PATH from ray.dashboard.modules.metrics import install_and_start_prometheus from ray.dashboard.modules.metrics.templates import PROMETHEUS_YML_TEMPLATE +from ray.scripts.scripts import metrics_group @pytest.mark.parametrize( @@ -40,6 +43,16 @@ def test_e2e(capsys): subprocess.run(["kill", str(pid)]) +def test_shutdown_prometheus(): + install_and_start_prometheus.main() + runner = CliRunner() + # Sleep for a few seconds to make sure Prometheus is running + # before we try to shut it down. + time.sleep(5) + result = runner.invoke(metrics_group, ["shutdown-prometheus"]) + assert result.exit_code == 0 + + def test_prometheus_config_content(): # Test to make sure the content in the hardcoded file # (python/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml) will diff --git a/python/ray/dashboard/modules/tests/test_utils.py b/python/ray/dashboard/modules/tests/test_utils.py index fc4fa85dfaed..38b24da02504 100644 --- a/python/ray/dashboard/modules/tests/test_utils.py +++ b/python/ray/dashboard/modules/tests/test_utils.py @@ -6,7 +6,7 @@ async def http_get(http_session, url, timeout_seconds=60): - with async_timeout.timeout(timeout_seconds): + async with async_timeout.timeout(timeout_seconds): async with http_session.get(url) as response: return await response.json() diff --git a/python/ray/dashboard/tests/test_dashboard.py b/python/ray/dashboard/tests/test_dashboard.py index 4b61accc53c3..19571d9f76b7 100644 --- a/python/ray/dashboard/tests/test_dashboard.py +++ b/python/ray/dashboard/tests/test_dashboard.py @@ -11,6 +11,7 @@ import time import warnings from unittest.mock import MagicMock +from urllib.parse import quote_plus import pytest import requests @@ -370,7 +371,9 @@ def test_http_get(enable_test_module, ray_start_with_dashboard): while True: time.sleep(3) try: - response = requests.get(webui_url + "/test/http_get?url=" + target_url) + response = requests.get( + webui_url + "/test/http_get?url=" + quote_plus(target_url) + ) response.raise_for_status() try: dump_info = response.json() @@ -385,7 +388,8 @@ def test_http_get(enable_test_module, ray_start_with_dashboard): http_port, grpc_port = ports response = requests.get( - f"http://{ip}:{http_port}" f"/test/http_get_from_agent?url={target_url}" + f"http://{ip}:{http_port}" + f"/test/http_get_from_agent?url={quote_plus(target_url)}" ) 
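Two small notes on the Prometheus changes above: the architecture branch maps platform.machine() names to the "amd64"/"arm64" spelling Prometheus uses in its release artifacts, and --web.enable-lifecycle turns on Prometheus's lifecycle HTTP endpoints (such as /-/quit), which is what a programmatic shutdown command can target. A sketch of the name normalization (the release version in the file name is illustrative):

```python
import platform

# platform.machine() reports "x86_64"/"aarch64"; Prometheus tarballs use
# "amd64"/"arm64" instead, so map and fall back to the raw value.
ARCH_ALIASES = {"x86_64": "amd64", "aarch64": "arm64"}

machine = platform.machine()
arch = ARCH_ALIASES.get(machine, machine)
os_type = platform.system().lower()
print(f"prometheus-2.48.0.{os_type}-{arch}.tar.gz")
```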
response.raise_for_status() try: diff --git a/python/ray/dashboard/timezone_utils.py b/python/ray/dashboard/timezone_utils.py new file mode 100644 index 000000000000..6a0d68b9c1a9 --- /dev/null +++ b/python/ray/dashboard/timezone_utils.py @@ -0,0 +1,56 @@ +import logging +from datetime import datetime + +logger = logging.getLogger(__name__) + +timezones = [ + {"offset": "-12:00", "value": "Etc/+12"}, + {"offset": "-11:00", "value": "Pacific/Pago_Pago"}, + {"offset": "-10:00", "value": "Pacific/Honolulu"}, + {"offset": "-09:00", "value": "America/Anchorage"}, + {"offset": "-08:00", "value": "America/Los_Angeles"}, + {"offset": "-07:00", "value": "America/Phoenix"}, + {"offset": "-06:00", "value": "America/Guatemala"}, + {"offset": "-05:00", "value": "America/Bogota"}, + {"offset": "-04:00", "value": "America/Halifax"}, + {"offset": "-03:30", "value": "America/St_Johns"}, + {"offset": "-03:00", "value": "America/Sao_Paulo"}, + {"offset": "-02:00", "value": "America/Godthab"}, + {"offset": "-01:00", "value": "Atlantic/Azores"}, + {"offset": "+00:00", "value": "Europe/London"}, + {"offset": "+01:00", "value": "Europe/Amsterdam"}, + {"offset": "+02:00", "value": "Asia/Amman"}, + {"offset": "+03:00", "value": "Asia/Baghdad"}, + {"offset": "+03:30", "value": "Asia/Tehran"}, + {"offset": "+04:00", "value": "Asia/Dubai"}, + {"offset": "+04:30", "value": "Asia/Kabul"}, + {"offset": "+05:00", "value": "Asia/Karachi"}, + {"offset": "+05:30", "value": "Asia/Kolkata"}, + {"offset": "+05:45", "value": "Asia/Kathmandu"}, + {"offset": "+06:00", "value": "Asia/Almaty"}, + {"offset": "+06:30", "value": "Asia/Yangon"}, + {"offset": "+07:00", "value": "Asia/Bangkok"}, + {"offset": "+08:00", "value": "Asia/Shanghai"}, + {"offset": "+09:00", "value": "Asia/Irkutsk"}, + {"offset": "+09:30", "value": "Australia/Adelaide"}, + {"offset": "+10:00", "value": "Australia/Brisbane"}, + {"offset": "+11:00", "value": "Asia/Magadan"}, + {"offset": "+12:00", "value": "Pacific/Auckland"}, + {"offset": "+13:00", "value": "Pacific/Tongatapu"}, +] + + +def get_current_timezone_info(): + current_tz = datetime.now().astimezone().tzinfo + offset = current_tz.utcoffset(None) + hours, remainder = divmod(offset.total_seconds(), 3600) + minutes = remainder // 60 + sign = "+" if hours >= 0 else "-" + current_offset = f"{sign}{abs(int(hours)):02d}:{abs(int(minutes)):02d}" + + current_timezone = next( + (tz for tz in timezones if tz["offset"] == current_offset), + {"offset": None, "value": None}, + ) + + return current_timezone diff --git a/python/ray/data/BUILD b/python/ray/data/BUILD index d232ab352ba0..d46db0940c6e 100644 --- a/python/ray/data/BUILD +++ b/python/ray/data/BUILD @@ -99,7 +99,7 @@ py_test( py_test( name = "test_arrow_block", - size = "small", + size = "medium", srcs = ["tests/test_arrow_block.py"], tags = ["team:data", "exclusive"], deps = ["//:ray_lib", ":conftest"], @@ -225,6 +225,14 @@ py_test( deps = ["//:ray_lib", ":conftest"], ) +py_test( + name = "test_hudi", + size = "small", + srcs = ["tests/test_hudi.py"], + tags = ["team:data", "exclusive"], + deps = ["//:ray_lib", ":conftest"], +) + py_test( name = "test_image", size = "small", diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 89d531aa2ee5..5883ae6c542c 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -48,6 +48,7 @@ read_databricks_tables, read_datasource, read_delta_sharing_tables, + read_hudi, read_iceberg, read_images, read_json, @@ -139,6 +140,7 @@ "read_csv", "read_datasource", 
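Two observations on timezone_utils.py above. First, "Etc/+12" is not a valid IANA identifier; the matching entry in the frontend's timezone.ts uses "Etc/GMT+12". Second, divmod floors toward negative infinity, so an offset of -03:30 would be labeled -04:30 (divmod(-12600, 3600) == (-4, 1800)). A sketch that formats the absolute value instead avoids this (the helper name is hypothetical):

```python
from datetime import datetime

def format_utc_offset(total_seconds: float) -> str:
    # Work on the absolute value so floor division can't shift the hour
    # for negative half-hour offsets.
    sign = "+" if total_seconds >= 0 else "-"
    hours, remainder = divmod(abs(int(total_seconds)), 3600)
    return f"{sign}{hours:02d}:{remainder // 60:02d}"

offset = datetime.now().astimezone().utcoffset()
print(format_utc_offset(offset.total_seconds()))
assert format_utc_offset(-12600) == "-03:30"
assert format_utc_offset(19800) == "+05:30"
```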
"read_delta_sharing_tables", + "read_hudi", "read_iceberg", "read_images", "read_json", diff --git a/python/ray/data/_internal/arrow_block.py b/python/ray/data/_internal/arrow_block.py index f12f89d8cceb..1473b8fb6e3b 100644 --- a/python/ray/data/_internal/arrow_block.py +++ b/python/ray/data/_internal/arrow_block.py @@ -21,15 +21,11 @@ from ray._private.utils import _get_pyarrow_version from ray.air.constants import TENSOR_COLUMN_NAME from ray.air.util.tensor_extensions.arrow import ( - ArrowConversionError, - convert_list_to_pyarrow_array, + convert_to_pyarrow_array, pyarrow_table_from_pydict, ) from ray.data._internal.arrow_ops import transform_polars, transform_pyarrow -from ray.data._internal.numpy_support import ( - convert_udf_returns_to_numpy, - validate_numpy_batch, -) +from ray.data._internal.numpy_support import convert_to_numpy from ray.data._internal.row import TableRow from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder from ray.data._internal.util import NULL_SENTINEL, find_partitions @@ -43,7 +39,6 @@ U, ) from ray.data.context import DataContext -from ray.util.debug import log_once try: import pyarrow @@ -61,17 +56,6 @@ T = TypeVar("T") logger = logging.getLogger(__name__) -ARROW_OBJECT_FIXABLE_ERRORS = ( - pyarrow.lib.ArrowTypeError, - pyarrow.lib.ArrowNotImplementedError, - pyarrow.lib.ArrowInvalid, -) - - -def is_object_fixable_error(e: ArrowConversionError) -> bool: - """Returns whether this error can be fixed by using an ArrowPythonObjectArray""" - return isinstance(e.__cause__, ARROW_OBJECT_FIXABLE_ERRORS) - # We offload some transformations to polars for performance. def get_sort_transform(context: DataContext) -> Callable: @@ -151,27 +135,14 @@ def __init__(self): @staticmethod def _table_from_pydict(columns: Dict[str, List[Any]]) -> Block: - for col_name, col in columns.items(): - try: - if col_name == TENSOR_COLUMN_NAME or isinstance( - next(iter(col), None), np.ndarray - ): - from ray.data.extensions.tensor_extension import ArrowTensorArray - - columns[col_name] = ArrowTensorArray.from_numpy(col, col_name) - else: - columns[col_name] = convert_list_to_pyarrow_array(col, columns) - except ArrowConversionError as e: - from ray.data.extensions.object_extension import ( - ArrowPythonObjectArray, - object_extension_type_allowed, - ) + pa_cols: Dict[str, pyarrow.Array] = dict() - if object_extension_type_allowed() and is_object_fixable_error(e): - columns[col_name] = ArrowPythonObjectArray.from_objects(col) - else: - raise - return pyarrow_table_from_pydict(columns) + for col_name, col_vals in columns.items(): + np_col_vals = convert_to_numpy(col_vals) + + pa_cols[col_name] = convert_to_pyarrow_array(np_col_vals, col_name) + + return pyarrow_table_from_pydict(pa_cols) @staticmethod def _concat_tables(tables: List[Block]) -> Block: @@ -216,40 +187,6 @@ def from_bytes(cls, data: bytes) -> "ArrowBlockAccessor": reader = pyarrow.ipc.open_stream(data) return cls(reader.read_all()) - @staticmethod - def numpy_to_block( - batch: Union[Dict[str, np.ndarray], Dict[str, list]], - ) -> "pyarrow.Table": - from ray.data.extensions.object_extension import ( - ArrowPythonObjectArray, - object_extension_type_allowed, - ) - from ray.data.extensions.tensor_extension import ArrowTensorArray - - validate_numpy_batch(batch) - - new_batch = {} - for col_name, col in batch.items(): - # Coerce to np.ndarray format if possible. - col = convert_udf_returns_to_numpy(col) - # Use Arrow's native *List types for 1-dimensional ndarrays. 
- if col.dtype.type is np.object_ or col.ndim > 1: - try: - col = ArrowTensorArray.from_numpy(col, col_name) - except ArrowConversionError as e: - if object_extension_type_allowed() and is_object_fixable_error(e): - if log_once(f"arrow_object_pickle_{col_name}"): - logger.debug( - f"Failed to interpret {col_name} as " - "multi-dimensional arrays. It will be pickled." - ) - col = ArrowPythonObjectArray.from_objects(col) - else: - raise - - new_batch[col_name] = col - return pyarrow_table_from_pydict(new_batch) - @staticmethod def _build_tensor_row( row: ArrowRow, col_name: str = TENSOR_COLUMN_NAME @@ -282,7 +219,7 @@ def _build_tensor_row( def slice(self, start: int, end: int, copy: bool = False) -> "pyarrow.Table": view = self._table.slice(start, end - start) if copy: - view = _copy_table(view) + view = transform_pyarrow.combine_chunks(view) return view def random_shuffle(self, random_seed: Optional[int]) -> "pyarrow.Table": @@ -308,11 +245,6 @@ def to_pandas(self) -> "pandas.DataFrame": def to_numpy( self, columns: Optional[Union[str, List[str]]] = None ) -> Union[np.ndarray, Dict[str, np.ndarray]]: - from ray.air.util.transform_pyarrow import ( - _concatenate_extension_column, - _is_column_extension_type, - ) - if columns is None: columns = self._table.column_names should_be_single_ndarray = False @@ -330,23 +262,24 @@ def to_numpy( f"{column_names_set}" ) - arrays = [] - for column in columns: - array = self._table[column] - if _is_column_extension_type(array): - array = _concatenate_extension_column(array) - elif array.num_chunks == 0: - array = pyarrow.array([], type=array.type) - else: - array = array.combine_chunks() - arrays.append(array.to_numpy(zero_copy_only=False)) + column_values_ndarrays = [] + + for col_name in columns: + col = self._table[col_name] + + # Combine columnar values arrays to make these contiguous + # (making them compatible with numpy format) + combined_array = transform_pyarrow.combine_chunked_array(col) + + column_values_ndarrays.append( + transform_pyarrow.to_numpy(combined_array, zero_copy_only=False) + ) if should_be_single_ndarray: assert len(columns) == 1 - arrays = arrays[0] + return column_values_ndarrays[0] else: - arrays = dict(zip(columns, arrays)) - return arrays + return dict(zip(columns, column_values_ndarrays)) def to_arrow(self) -> "pyarrow.Table": return self._table @@ -715,8 +648,3 @@ def gen(): def block_type(self) -> BlockType: return BlockType.ARROW - - -def _copy_table(table: "pyarrow.Table") -> "pyarrow.Table": - """Copy the provided Arrow table.""" - return transform_pyarrow.combine_chunks(table) diff --git a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py index 093588ca8f34..a71a1eae6f61 100644 --- a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py +++ b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py @@ -1,8 +1,14 @@ from typing import TYPE_CHECKING, List, Union +import numpy as np from packaging.version import parse as parse_version from ray._private.utils import _get_pyarrow_version +from ray.air.util.tensor_extensions.arrow import ( + INT32_OVERFLOW_THRESHOLD, + MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY, + PYARROW_VERSION, +) try: import pyarrow @@ -236,6 +242,7 @@ def concat(blocks: List["pyarrow.Table"]) -> "pyarrow.Table": schema = unify_schemas(schemas_to_unify) except Exception as e: raise ArrowConversionError(str(blocks)) from e + if ( any(isinstance(type_, pa.ExtensionType) for type_ in schema.types) or cols_with_null_list @@ 
-246,6 +253,7 @@ def concat(blocks: List["pyarrow.Table"]) -> "pyarrow.Table": col_chunked_arrays = [] for block in blocks: col_chunked_arrays.append(block.column(col_name)) + if isinstance(schema.field(col_name).type, tensor_types): # For our tensor extension types, manually construct a chunked array # containing chunks from all blocks. This is to handle @@ -326,24 +334,164 @@ def concat_and_sort( return take_table(ret, indices) +def to_numpy( + array: Union["pyarrow.Array", "pyarrow.ChunkedArray"], + *, + zero_copy_only: bool = True, +) -> np.ndarray: + """Wrapper for `Array`s and `ChunkedArray`s `to_numpy` API, + handling API divergence b/w Arrow versions""" + + import pyarrow as pa + + if isinstance(array, pa.Array): + return array.to_numpy(zero_copy_only=zero_copy_only) + elif isinstance(array, pa.ChunkedArray): + if PYARROW_VERSION >= MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY: + return array.to_numpy(zero_copy_only=zero_copy_only) + else: + return array.to_numpy() + else: + raise ValueError( + f"Either of `Array` or `ChunkedArray` was expected, got {type(array)}" + ) + + def combine_chunks(table: "pyarrow.Table") -> "pyarrow.Table": - """This is pyarrow.Table.combine_chunks() - with support for extension types. + """This is counterpart for Pyarrow's `Table.combine_chunks` that's using + extended `ChunkedArray` combination protocol. - This will create a new table by combining the chunks the input table has. + For more details check out `combine_chunked_array` py-doc """ + + new_column_values_arrays = [] + + for col in table.columns: + new_column_values_arrays.append(combine_chunked_array(col)) + + return pyarrow.Table.from_arrays(new_column_values_arrays, schema=table.schema) + + +def combine_chunked_array( + array: "pyarrow.ChunkedArray", +) -> Union["pyarrow.Array", "pyarrow.ChunkedArray"]: + """This is counterpart for Pyarrow's `ChunkedArray.combine_chunks` that additionally + + 1. Handles `ExtensionType`s (like ArrowTensorType, ArrowTensorTypeV2, + ArrowPythonObjectType, etc) + + 2. Making sure `ChunkedArray`s comprising provided `Table` are combined + safely, ie avoiding overflows of Arrow's internal offsets (using int32 for + most of its native types, other than "large" kind). + + For more details check py-doc of `_try_combine_chunks_safe` method. + """ + + import pyarrow as pa + from ray.air.util.transform_pyarrow import ( _concatenate_extension_column, _is_column_extension_type, ) - cols = table.columns - new_cols = [] - for col in cols: - if _is_column_extension_type(col): - # Extension arrays don't support concatenation. 
- arr = _concatenate_extension_column(col) - else: - arr = col.combine_chunks() - new_cols.append(arr) - return pyarrow.Table.from_arrays(new_cols, schema=table.schema) + assert isinstance( + array, pa.ChunkedArray + ), f"Expected `ChunkedArray`, got {type(array)}" + + if _is_column_extension_type(array): + # Arrow `ExtensionArray`s can't be concatenated via `combine_chunks`, + # hence require manual concatenation + return _concatenate_extension_column(array) + elif len(array.chunks) == 0: + # NOTE: In case there's no chunks, we need to explicitly create + # an empty array since calling into `combine_chunks` would fail + # due to it expecting at least 1 chunk to be present + return pa.array([], type=array.type) + else: + return _try_combine_chunks_safe(array) + + +def _try_combine_chunks_safe( + array: "pyarrow.ChunkedArray", max_chunk_size=INT32_OVERFLOW_THRESHOLD +) -> Union["pyarrow.Array", "pyarrow.ChunkedArray"]: + """This method provides a safe way of combining `ChunkedArray`s exceeding 2 GiB + in size, which aren't using "large_*" types (and therefore relying on int32 + offsets). + + When handling provided `ChunkedArray` this method will be either + + - Relying on PyArrow's default `combine_chunks` (therefore returning single + contiguous `Array`) in cases when + - Array's total size is < 2 GiB + - Array's underlying type is of "large" kind (ie using one of the + `large_*` type family) + - Safely combining subsets of tasks such that resulting `Array`s to not + exceed 2 GiB in size (therefore returning another `ChunkedArray` albeit + with potentially smaller number of chunks that have resulted from clumping + the original ones) + + Returns: + - pa.Array if it's possible to combine provided pa.ChunkedArray into single + contiguous array + - pa.ChunkedArray (albeit with chunks re-combined) if it's not possible to + produce single pa.Array + """ + + import pyarrow as pa + + from ray.air.util.transform_pyarrow import _is_column_extension_type + + assert not _is_column_extension_type( + array + ), f"Arrow `ExtensionType`s are not accepted (got {array.type})" + + int64_type_predicates = [ + pa.types.is_large_list, + pa.types.is_large_string, + pa.types.is_large_binary, + pa.types.is_large_unicode, + ] + + if array.nbytes < max_chunk_size or any( + p(array.type) for p in int64_type_predicates + ): + # It's safe to combine provided `ChunkedArray` in either of 2 cases: + # - It's cumulative size is < 2 GiB + # - It's of 'large' kind (ie one using int64 offsets internally) + return array.combine_chunks() + + # In this case it's actually *NOT* safe to try to directly combine + # Arrow's `ChunkedArray` and is impossible to produce single, contiguous + # `Array` since + # - It's estimated to hold > 2 GiB + # - Its type is not of the "large" kind (and hence is using int32 + # offsets internally, which would overflow) + # + # In this case instead of combining into single contiguous array, we + # instead just "clump" existing chunks into bigger ones, but no bigger + # than 2 GiB each. 
+ # + # NOTE: This branch actually returns `ChunkedArray` and not an `Array` + + # To stay under 2 GiB limit we are slicing provided list of chunks into + # slices no larger than 2 GiB (as compared to just directly using `concat_arrays`) + slices = [] + + cur_slice_start = 0 + cur_slice_size_bytes = 0 + + for i, chunk in enumerate(array.chunks): + chunk_size = chunk.nbytes + + if cur_slice_size_bytes + chunk_size > max_chunk_size: + slices.append(array.chunks[cur_slice_start:i]) + + cur_slice_start = i + cur_slice_size_bytes = 0 + + cur_slice_size_bytes += chunk_size + + # Add remaining chunks as last slice + slices.append(array.chunks[cur_slice_start:]) + + return pa.chunked_array([pa.concat_arrays(s) for s in slices]) diff --git a/python/ray/data/_internal/batcher.py b/python/ray/data/_internal/batcher.py index 104e3c7ae51d..d27ed089f03f 100644 --- a/python/ray/data/_internal/batcher.py +++ b/python/ray/data/_internal/batcher.py @@ -11,7 +11,7 @@ # See https://github.com/ray-project/ray/issues/31108 for more details. # TODO(jjyao): remove this once # https://github.com/apache/arrow/issues/35126 is resolved. -MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS = 2 +MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS = 10 # Delay compaction until the shuffle buffer has reached this ratio over the min # shuffle buffer size. Setting this to 1 minimizes memory usage, at the cost of @@ -130,10 +130,7 @@ def next_batch(self) -> Block: # the leftovers. leftover.append(block) elif accessor.num_rows() <= needed: - # We need this entire block to fill out a batch. - # We need to call `accessor.slice()` to ensure - # the subsequent block's type are the same. - output.add_block(accessor.slice(0, accessor.num_rows(), copy=False)) + output.add_block(accessor.to_block()) needed -= accessor.num_rows() else: if ( diff --git a/python/ray/data/_internal/datasource/hudi_datasource.py b/python/ray/data/_internal/datasource/hudi_datasource.py new file mode 100644 index 000000000000..828d9baada7f --- /dev/null +++ b/python/ray/data/_internal/datasource/hudi_datasource.py @@ -0,0 +1,91 @@ +import logging +import os +from typing import Dict, Iterator, List, Optional + +from ray.data._internal.util import _check_import +from ray.data.block import BlockMetadata +from ray.data.datasource.datasource import Datasource, ReadTask + +logger = logging.getLogger(__name__) + + +class HudiDatasource(Datasource): + """Hudi datasource, for reading Apache Hudi table.""" + + def __init__( + self, + table_uri: str, + storage_options: Optional[Dict[str, str]] = None, + ): + _check_import(self, module="hudi", package="hudi-python") + + self._table_uri = table_uri + self._storage_options = storage_options + + def get_read_tasks(self, parallelism: int) -> List["ReadTask"]: + import pyarrow + from hudi import HudiTable + + def _perform_read( + table_uri: str, + base_file_paths: List[str], + options: Dict[str, str], + ) -> Iterator["pyarrow.Table"]: + from hudi import HudiFileGroupReader + + for p in base_file_paths: + file_group_reader = HudiFileGroupReader(table_uri, options) + batch = file_group_reader.read_file_slice_by_base_file_path(p) + yield pyarrow.Table.from_batches([batch]) + + hudi_table = HudiTable(self._table_uri, self._storage_options) + + reader_options = { + **hudi_table.storage_options(), + **hudi_table.hudi_options(), + } + + schema = hudi_table.get_schema() + read_tasks = [] + for file_slices_split in hudi_table.split_file_slices(parallelism): + if len(file_slices_split) == 0: + # when the table is empty, this will be an empty split + 
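The clumping loop above never materializes a combined array past the int32 offset threshold; it packs consecutive chunks into slices that fit the byte budget and concatenates each slice on its own. A toy pyarrow illustration with the budget shrunk from 2 GiB to a couple dozen bytes (the helper is a simplified stand-in, not the production code):

```python
import pyarrow as pa

def clump(chunked: pa.ChunkedArray, max_bytes: int) -> pa.ChunkedArray:
    # Group consecutive chunks into slices whose combined size stays under
    # max_bytes, then concatenate each slice into one larger chunk.
    slices, start, size = [], 0, 0
    for i, chunk in enumerate(chunked.chunks):
        if size + chunk.nbytes > max_bytes and i > start:
            slices.append(chunked.chunks[start:i])
            start, size = i, 0
        size += chunk.nbytes
    slices.append(chunked.chunks[start:])
    return pa.chunked_array([pa.concat_arrays(s) for s in slices])

arr = pa.chunked_array([pa.array([1, 2]), pa.array([3]), pa.array([4, 5])])
print(clump(arr, max_bytes=24).num_chunks)  # 2: the first two chunks fit one slice
```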
continue + + num_rows = 0 + relative_paths = [] + input_files = [] + size_bytes = 0 + for file_slice in file_slices_split: + # A file slice in a Hudi table is a logical group of data files + # within a physical partition. Records stored in a file slice + # are associated with a commit on the Hudi table's timeline. + # For more info, see https://hudi.apache.org/docs/file_layouts + num_rows += file_slice.num_records + relative_path = file_slice.base_file_relative_path() + relative_paths.append(relative_path) + full_path = os.path.join(self._table_uri, relative_path) + input_files.append(full_path) + size_bytes += file_slice.base_file_size + + metadata = BlockMetadata( + num_rows=num_rows, + schema=schema, + input_files=input_files, + size_bytes=size_bytes, + exec_stats=None, + ) + + read_task = ReadTask( + read_fn=lambda paths=relative_paths: _perform_read( + self._table_uri, paths, reader_options + ), + metadata=metadata, + ) + read_tasks.append(read_task) + + return read_tasks + + def estimate_inmemory_data_size(self) -> Optional[int]: + # TODO(xushiyan) add APIs to provide estimated in-memory size + return None diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py index 238f6f9421cc..a4276e2bafe6 100644 --- a/python/ray/data/_internal/execution/streaming_executor.py +++ b/python/ray/data/_internal/execution/streaming_executor.py @@ -188,11 +188,9 @@ def shutdown(self, execution_completed: bool = True): state="FINISHED" if execution_completed else "FAILED", force_update=True, ) - # Clears metrics for this dataset so that they do - # not persist in the grafana dashboard after execution - StatsManager.clear_execution_metrics( - self._dataset_tag, self._get_operator_tags() - ) + # Once Dataset execution completes, mark it as complete + # and remove last cached execution stats. + StatsManager.clear_last_execution_stats(self._dataset_tag) # Freeze the stats and save it. self._final_stats = self._generate_stats() stats_summary_string = self._final_stats.to_summary().to_string( @@ -401,6 +399,7 @@ def _get_state_dict(self, state): "end_time": time.time() if state != "RUNNING" else None, "operators": { f"{op.name}{i}": { + "name": op.name, "progress": op_state.num_completed_tasks, "total": op.num_outputs_total(), "state": state, diff --git a/python/ray/data/_internal/numpy_support.py b/python/ray/data/_internal/numpy_support.py index 9e6a7c305dfb..d04060fc831e 100644 --- a/python/ray/data/_internal/numpy_support.py +++ b/python/ray/data/_internal/numpy_support.py @@ -1,4 +1,5 @@ import collections +import logging from datetime import datetime from typing import Any, Dict, List, Union @@ -7,6 +8,8 @@ from ray.air.util.tensor_extensions.utils import create_ragged_ndarray from ray.data._internal.util import _truncated_repr +logger = logging.getLogger(__name__) + def is_array_like(value: Any) -> bool: """Checks whether objects are array-like, excluding numpy scalars.""" @@ -66,7 +69,7 @@ def _convert_datetime_list_to_array(datetime_list: List[datetime]) -> np.ndarray ) -def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any: +def convert_to_numpy(column_values: Any) -> np.ndarray: """Convert UDF columns (output of map_batches) to numpy, if possible. This includes lists of scalars, objects supporting the array protocol, and lists @@ -80,36 +83,31 @@ def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any: ValueError if an input was array-like but we failed to convert it to an array. 
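With the HudiDatasource above wired up to the read_hudi API exported earlier in this diff, reading a table is a one-liner. A hedged usage sketch (the URI is a placeholder, and the hudi-python package must be installed):

```python
import ray

# Placeholder table URI; object-store credentials would go through the
# datasource's storage_options argument.
ds = ray.data.read_hudi("s3://my-bucket/path/to/hudi_table")
print(ds.schema())
print(ds.take(3))
```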
""" - if isinstance(udf_return_col, np.ndarray): + if isinstance(column_values, np.ndarray): # No copy/conversion needed, just keep it verbatim. - return udf_return_col + return column_values - if isinstance(udf_return_col, list): - if len(udf_return_col) == 1 and isinstance(udf_return_col[0], np.ndarray): + elif isinstance(column_values, list): + if len(column_values) == 1 and isinstance(column_values[0], np.ndarray): # Optimization to avoid conversion overhead from list to np.array. - udf_return_col = np.expand_dims(udf_return_col[0], axis=0) - return udf_return_col + return np.expand_dims(column_values[0], axis=0) - if all(isinstance(elem, datetime) for elem in udf_return_col): - return _convert_datetime_list_to_array(udf_return_col) + if all(isinstance(elem, datetime) for elem in column_values): + return _convert_datetime_list_to_array(column_values) # Try to convert list values into an numpy array via # np.array(), so users don't need to manually cast. # NOTE: we don't cast generic iterables, since types like # `str` are also Iterable. try: - # Try to cast the inner scalars to numpy as well, to avoid unnecessarily - # creating an inefficient array of array of object dtype. - # But don't convert if the list is nested. Because if sub-lists have - # heterogeneous shapes, we need to create a ragged ndarray. - if not is_nested_list(udf_return_col) and all( - is_valid_udf_return(e) for e in udf_return_col - ): + # Convert array-like objects (like torch.Tensor) to `np.ndarray`s + if all(is_array_like(e) for e in column_values): # Use np.asarray() instead of np.array() to avoid copying if possible. - udf_return_col = [np.asarray(e) for e in udf_return_col] + column_values = [np.asarray(e) for e in column_values] + shapes = set() has_object = False - for e in udf_return_col: + for e in column_values: if isinstance(e, np.ndarray): shapes.add((e.dtype, e.shape)) elif isinstance(e, bytes): @@ -122,24 +120,48 @@ def convert_udf_returns_to_numpy(udf_return_col: Any) -> Any: has_object = True elif not np.isscalar(e): has_object = True + + # When column values are + # - Arrays of heterogeneous shapes + # - Byte-strings (viewed as arrays of heterogeneous shapes) + # - Non-scalar objects (tuples, lists, arbitrary object types) + # + # Custom "ragged ndarray" is created, represented as an array of + # references (ie ndarray with dtype=object) if has_object or len(shapes) > 1: # This util works around some limitations of np.array(dtype=object). - udf_return_col = create_ragged_ndarray(udf_return_col) + return create_ragged_ndarray(column_values) else: - udf_return_col = np.array(udf_return_col) + return np.array(column_values) + except Exception as e: + logger.error( + f"Failed to convert column values to numpy array: " + f"{_truncated_repr(column_values)}", + exc_info=e, + ) + raise ValueError( "Failed to convert column values to numpy array: " - f"({_truncated_repr(udf_return_col)}): {e}." - ) - elif hasattr(udf_return_col, "__array__"): + f"({_truncated_repr(column_values)}): {e}." + ) from e + + elif is_array_like(column_values): # Converts other array-like objects such as torch.Tensor. try: - udf_return_col = np.array(udf_return_col) + # Use np.asarray() instead of np.array() to avoid copying if possible. 
+ return np.asarray(column_values) except Exception as e: + logger.error( + f"Failed to convert column values to numpy array: " + f"{_truncated_repr(column_values)}", + exc_info=e, + ) + raise ValueError( "Failed to convert column values to numpy array: " - f"({_truncated_repr(udf_return_col)}): {e}." - ) + f"({_truncated_repr(column_values)}): {e}." + ) from e - return udf_return_col + else: + return column_values diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index 04ff4a35a7e0..119469b46c1b 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -17,10 +17,8 @@ import numpy as np from ray.air.constants import TENSOR_COLUMN_NAME -from ray.data._internal.numpy_support import ( - convert_udf_returns_to_numpy, - validate_numpy_batch, -) +from ray.air.util.tensor_extensions.utils import _is_ndarray_tensor +from ray.data._internal.numpy_support import convert_to_numpy, validate_numpy_batch from ray.data._internal.row import TableRow from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder from ray.data._internal.util import find_partitions @@ -114,14 +112,20 @@ def __init__(self): @staticmethod def _table_from_pydict(columns: Dict[str, List[Any]]) -> "pandas.DataFrame": pandas = lazy_import_pandas() - for key, value in columns.items(): - if key == TENSOR_COLUMN_NAME or isinstance( - next(iter(value), None), np.ndarray - ): + + pd_columns: Dict[str, Any] = {} + + for col_name, col_vals in columns.items(): + np_col_vals = convert_to_numpy(col_vals) + + if col_name == TENSOR_COLUMN_NAME or _is_ndarray_tensor(np_col_vals): from ray.data.extensions.tensor_extension import TensorArray - columns[key] = TensorArray(value) - return pandas.DataFrame(columns) + pd_columns[col_name] = TensorArray(np_col_vals) + else: + pd_columns[col_name] = np_col_vals + + return pandas.DataFrame(pd_columns) @staticmethod def _concat_tables(tables: List["pandas.DataFrame"]) -> "pandas.DataFrame": @@ -283,10 +287,6 @@ def numpy_to_block( ) -> "pandas.DataFrame": validate_numpy_batch(batch) - batch = { - column_name: convert_udf_returns_to_numpy(column) - for column_name, column in batch.items() - } block = PandasBlockBuilder._table_from_pydict(batch) return block diff --git a/python/ray/data/_internal/planner/exchange/sort_task_spec.py b/python/ray/data/_internal/planner/exchange/sort_task_spec.py index 827c4a2c7a51..7c67b3dbdefe 100644 --- a/python/ray/data/_internal/planner/exchange/sort_task_spec.py +++ b/python/ray/data/_internal/planner/exchange/sort_task_spec.py @@ -81,8 +81,9 @@ def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]): for column in self._columns: if column not in schema_names_set: raise ValueError( - "The column '{}' does not exist in the " - "schema '{}'.".format(column, schema) + f"You specified the column '{column}', but there's no such " + "column in the dataset. The dataset has columns: " + f"{schema_names_set}" ) @property diff --git a/python/ray/data/_internal/planner/plan_udf_map_op.py b/python/ray/data/_internal/planner/plan_udf_map_op.py index 9cd81c0d5f3b..605efe7c95cc 100644 --- a/python/ray/data/_internal/planner/plan_udf_map_op.py +++ b/python/ray/data/_internal/planner/plan_udf_map_op.py @@ -352,6 +352,8 @@ def transform_fn( # generators, and in the main event loop, yield them from # the queue as they become available. output_batch_queue = queue.Queue() + # Sentinel object to signal the end of the async generator. 
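+        # The sentinel is enqueued in a `finally` block once all batch tasks
+        # have been awaited, so the consumer loop below always terminates,
+        # even if a batch task raises.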
+        sentinel = object()

         async def process_batch(batch: DataBatch):
             try:
@@ -366,29 +368,33 @@ async def process_batch(batch: DataBatch):
                 )  # Put the exception into the queue to signal an error

         async def process_all_batches():
-            loop = ray.data._map_actor_context.udf_map_asyncio_loop
-            tasks = [loop.create_task(process_batch(x)) for x in input_iterable]
+            try:
+                loop = ray.data._map_actor_context.udf_map_asyncio_loop
+                tasks = [loop.create_task(process_batch(x)) for x in input_iterable]

-            ctx = ray.data.DataContext.get_current()
-            if ctx.execution_options.preserve_order:
-                for task in tasks:
-                    await task()
-            else:
-                for task in asyncio.as_completed(tasks):
-                    await task
+                ctx = ray.data.DataContext.get_current()
+                if ctx.execution_options.preserve_order:
+                    for task in tasks:
+                        await task
+                else:
+                    for task in asyncio.as_completed(tasks):
+                        await task
+            finally:
+                output_batch_queue.put(sentinel)

         # Use the existing event loop to create and run Tasks to process each batch
         loop = ray.data._map_actor_context.udf_map_asyncio_loop
-        future = asyncio.run_coroutine_threadsafe(process_all_batches(), loop)
+        asyncio.run_coroutine_threadsafe(process_all_batches(), loop)

         # Yield results as they become available.
-        # After all futures are completed, drain the queue to
-        # yield any remaining results.
-        while not future.done() or not output_batch_queue.empty():
+        while True:
             # Here, `out_batch` is a one-row output batch
             # from the async generator, corresponding to a
             # single row from the input batch.
             out_batch = output_batch_queue.get()
+            if out_batch is sentinel:
+                # Break out of the loop when the sentinel is received.
+                break
             if isinstance(out_batch, Exception):
                 raise out_batch
             _validate_batch_output(out_batch)
diff --git a/python/ray/data/_internal/stats.py b/python/ray/data/_internal/stats.py
index 46435ec9ceb4..fc6903cd92e2 100644
--- a/python/ray/data/_internal/stats.py
+++ b/python/ray/data/_internal/stats.py
@@ -378,33 +378,6 @@ def update_iteration_metrics(
         self.iter_user_s.set(stats.iter_user_s.get(), tags)
         self.iter_initialize_s.set(stats.iter_initialize_s.get(), tags)

-    def clear_execution_metrics(self, dataset_tag: str, operator_tags: List[str]):
-        for operator_tag in operator_tags:
-            tags = self._create_tags(dataset_tag, operator_tag)
-            self.spilled_bytes.set(0, tags)
-            self.allocated_bytes.set(0, tags)
-            self.freed_bytes.set(0, tags)
-            self.current_bytes.set(0, tags)
-            self.output_bytes.set(0, tags)
-            self.output_rows.set(0, tags)
-            self.cpu_usage_cores.set(0, tags)
-            self.gpu_usage_cores.set(0, tags)
-
-        for prom_metric in self.execution_metrics_inputs.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_outputs.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_tasks.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_obj_store_memory.values():
-            prom_metric.set(0, tags)
-
-        for prom_metric in self.execution_metrics_misc.values():
-            prom_metric.set(0, tags)
-
     def register_dataset(self, job_id: str, dataset_tag: str, operator_tags: List[str]):
         self.datasets[dataset_tag] = {
             "job_id": job_id,
@@ -593,19 +566,13 @@ def update_execution_metrics(
             self._last_execution_stats[dataset_tag] = args
         self._start_thread_if_not_running()

-    def clear_execution_metrics(self, dataset_tag: str, operator_tags: List[str]):
+    def clear_last_execution_stats(self, dataset_tag: str):
+        # After the dataset completes execution, remove its cached execution stats.
+        # Marks the dataset as finished on the job page's Ray Data Overview.
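+        # NOTE: unlike the removed clear_execution_metrics(), this doesn't
+        # zero out the per-operator Prometheus gauges; it only drops the
+        # locally cached stats for this dataset.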
with self._stats_lock: if dataset_tag in self._last_execution_stats: del self._last_execution_stats[dataset_tag] - try: - self._stats_actor( - create_if_not_exists=False - ).clear_execution_metrics.remote(dataset_tag, operator_tags) - except Exception: - # Cluster may be shut down. - pass - # Iteration methods def update_iteration_metrics(self, stats: "DatasetStats", dataset_tag: str): diff --git a/python/ray/data/_internal/table_block.py b/python/ray/data/_internal/table_block.py index 55ea4fcc553a..a8995fc6703b 100644 --- a/python/ray/data/_internal/table_block.py +++ b/python/ray/data/_internal/table_block.py @@ -15,9 +15,10 @@ from ray.air.constants import TENSOR_COLUMN_NAME from ray.data._internal.block_builder import BlockBuilder -from ray.data._internal.numpy_support import convert_udf_returns_to_numpy, is_array_like +from ray.data._internal.numpy_support import is_array_like from ray.data._internal.row import TableRow from ray.data._internal.size_estimator import SizeEstimator +from ray.data._internal.util import MiB from ray.data.block import Block, BlockAccessor if TYPE_CHECKING: @@ -28,7 +29,7 @@ # The max size of Python tuples to buffer before compacting them into a # table in the BlockBuilder. -MAX_UNCOMPACTED_SIZE_BYTES = 50 * 1024 * 1024 +MAX_UNCOMPACTED_SIZE_BYTES = 50 * MiB class TableBlockBuilder(BlockBuilder): @@ -121,14 +122,13 @@ def will_build_yield_copy(self) -> bool: return self._concat_would_copy() and len(self._tables) > 1 def build(self) -> Block: - columns = { - key: convert_udf_returns_to_numpy(col) for key, col in self._columns.items() - } - if columns: - tables = [self._table_from_pydict(columns)] + if self._columns: + tables = [self._table_from_pydict(self._columns)] else: tables = [] + tables.extend(self._tables) + if len(tables) > 0: return self._concat_tables(tables) else: @@ -149,10 +149,7 @@ def _compact_if_needed(self) -> None: assert self._columns if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES: return - columns = { - key: convert_udf_returns_to_numpy(col) for key, col in self._columns.items() - } - block = self._table_from_pydict(columns) + block = self._table_from_pydict(self._columns) self.add_block(block) self._uncompacted_size = SizeEstimator() self._columns.clear() diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index 5e8c921c3733..1d0b70cf6a6c 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -26,7 +26,6 @@ import ray from ray._private.utils import _get_pyarrow_version -from ray.data._internal.arrow_ops.transform_pyarrow import unify_schemas from ray.data.context import DEFAULT_READ_OP_MIN_NUM_BLOCKS, WARN_PREFIX, DataContext if TYPE_CHECKING: @@ -41,6 +40,12 @@ logger = logging.getLogger(__name__) + +KiB = 1024 # bytes +MiB = 1024 * KiB +GiB = 1024 * MiB + + # NOTE: Make sure that these lower and upper bounds stay in sync with version # constraints given in python/setup.py. # Inclusive minimum pyarrow version. @@ -707,6 +712,7 @@ def unify_block_metadata_schema( """ # Some blocks could be empty, in which case we cannot get their schema. # TODO(ekl) validate schema is the same across different blocks. + from ray.data._internal.arrow_ops.transform_pyarrow import unify_schemas # First check if there are blocks with computed schemas, then unify # valid schemas from all such blocks. 
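# A minimal sketch (illustrative only, not part of the patch) of the byte-size
# constants introduced in python/ray/data/_internal/util.py above:
from ray.data._internal.util import KiB, MiB, GiB

assert KiB == 1024
assert 50 * MiB == 50 * 1024 * 1024  # e.g., MAX_UNCOMPACTED_SIZE_BYTES above
assert GiB == 1024 * MiB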
diff --git a/python/ray/data/block.py b/python/ray/data/block.py
index 15cf6b68b20c..fcab3feb67eb 100644
--- a/python/ray/data/block.py
+++ b/python/ray/data/block.py
@@ -1,4 +1,5 @@
 import collections
+import logging
 import os
 import time
 from dataclasses import dataclass
@@ -25,6 +26,7 @@
 from ray.air.util.tensor_extensions.arrow import ArrowConversionError
 from ray.data._internal.util import _check_pyarrow_version, _truncated_repr
 from ray.types import ObjectRef
+from ray.util import log_once
 from ray.util.annotations import DeveloperAPI

 import psutil
@@ -57,6 +59,9 @@
 Block = Union["pyarrow.Table", "pandas.DataFrame"]

+logger = logging.getLogger(__name__)
+
+
 @DeveloperAPI
 class BlockType(Enum):
     ARROW = "arrow"
@@ -67,6 +72,12 @@ class BlockType(Enum):
 # returned from batch UDFs.
 DataBatch = Union["pyarrow.Table", "pandas.DataFrame", Dict[str, np.ndarray]]

+# User-facing data column type. This is the data type for data that is supplied to and
+# returned from column UDFs.
+DataBatchColumn = Union[
+    "pyarrow.ChunkedArray", "pyarrow.Array", "pandas.Series", np.ndarray
+]
+
 # A class type that implements __call__.
 CallableClass = type

@@ -374,6 +385,12 @@ def batch_to_block(
         try:
             return cls.batch_to_arrow_block(batch)
         except ArrowConversionError as e:
+            if log_once("_fallback_to_pandas_block_warning"):
+                logger.warning(
+                    f"Failed to convert batch to Arrow due to: {e}; "
+                    f"falling back to Pandas block"
+                )
+
             if block_type is None:
                 return cls.batch_to_pandas_block(batch)
             else:
@@ -386,9 +403,9 @@ def batch_to_block(
     @classmethod
     def batch_to_arrow_block(cls, batch: Dict[str, Any]) -> Block:
         """Create an Arrow block from user-facing data formats."""
-        from ray.data._internal.arrow_block import ArrowBlockAccessor
+        from ray.data._internal.arrow_block import ArrowBlockBuilder

-        return ArrowBlockAccessor.numpy_to_block(batch)
+        return ArrowBlockBuilder._table_from_pydict(batch)

     @classmethod
     def batch_to_pandas_block(cls, batch: Dict[str, Any]) -> Block:
diff --git a/python/ray/data/context.py b/python/ray/data/context.py
index 5ed9b4fe68ef..347d3da68372 100644
--- a/python/ray/data/context.py
+++ b/python/ray/data/context.py
@@ -80,6 +80,8 @@
 # V2 in turn relies on int64 offsets, therefore having a limit of ~9 EB (exabytes)
 DEFAULT_USE_ARROW_TENSOR_V2 = env_bool("RAY_DATA_USE_ARROW_TENSOR_V2", True)

+DEFAULT_ENABLE_FALLBACK_TO_ARROW_OBJECT_EXT_TYPE = True
+
 DEFAULT_AUTO_LOG_STATS = False

 DEFAULT_VERBOSE_STATS_LOG = False
@@ -222,6 +224,12 @@ class DataContext:
         read_op_min_num_blocks: Minimum number of read output blocks for a dataset.
         enable_tensor_extension_casting: Whether to automatically cast NumPy ndarray
             columns in Pandas DataFrames to tensor extension columns.
+        use_arrow_tensor_v2: Config enabling the V2 version of ArrowTensorArray, which
+            supports tensors > 2 GiB in size (on by default).
+        enable_fallback_to_arrow_object_ext_type: Enables fallback to serialize column
+            values not supported by Arrow natively (for example, user-defined custom
+            Python classes) using `ArrowPythonObjectType` (simply serializing
+            these as bytes).
         enable_auto_log_stats: Whether to automatically log stats after execution. If
             disabled, you can still manually print stats with ``Dataset.stats()``.
         verbose_stats_logs: Whether stats logs should be verbose.
This includes fields
@@ -293,6 +301,9 @@ class DataContext:
     read_op_min_num_blocks: int = DEFAULT_READ_OP_MIN_NUM_BLOCKS
     enable_tensor_extension_casting: bool = DEFAULT_ENABLE_TENSOR_EXTENSION_CASTING
     use_arrow_tensor_v2: bool = DEFAULT_USE_ARROW_TENSOR_V2
+    enable_fallback_to_arrow_object_ext_type: bool = (
+        DEFAULT_ENABLE_FALLBACK_TO_ARROW_OBJECT_EXT_TYPE
+    )
     enable_auto_log_stats: bool = DEFAULT_AUTO_LOG_STATS
     verbose_stats_logs: bool = DEFAULT_VERBOSE_STATS_LOG
     trace_allocations: bool = DEFAULT_TRACE_ALLOCATIONS
diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index d576b8eb2ea7..496e29a8dea4 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -5,6 +5,7 @@
 import logging
 import time
 import warnings
+from collections.abc import Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -87,6 +88,7 @@
     Block,
     BlockAccessor,
     DataBatch,
+    DataBatchColumn,
     T,
     U,
     UserDefinedFunction,
@@ -529,7 +531,8 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
             compute: This argument is deprecated. Use ``concurrency`` argument.
             batch_format: If ``"default"`` or ``"numpy"``, batches are
                 ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are
-                ``pandas.DataFrame``.
+                ``pandas.DataFrame``. If ``"pyarrow"``, batches are
+                ``pyarrow.Table``.
             zero_copy_batch: Whether ``fn`` should be provided zero-copy, read-only
                 batches. If this is ``True`` and no copy is required for the
                 ``batch_format`` conversion, the batch is a zero-copy, read-only
@@ -700,16 +703,21 @@ def _map_batches_without_batch_size_validation(
     def add_column(
         self,
         col: str,
-        fn: Callable[["pandas.DataFrame"], "pandas.Series"],
+        fn: Callable[
+            [DataBatch],
+            DataBatchColumn,
+        ],
         *,
+        batch_format: Optional[str] = "pandas",
         compute: Optional[str] = None,
         concurrency: Optional[Union[int, Tuple[int, int]]] = None,
         **ray_remote_args,
     ) -> "Dataset":
         """Add the given column to the dataset.

-        A function generating the new column values given the batch in pandas
-        format must be specified.
+        A function generating the new column values must be specified. It
+        receives a batch in the given ``batch_format`` (pandas, pyarrow, or
+        numpy) and returns the values for the new column.

         Examples:

@@ -729,11 +737,6 @@ def add_column(
             id     int64
             new_id int64

-        Overwrite the existing values with zeros.
-
-        >>> ds.add_column("id", lambda df: 0).take(3)
-        [{'id': 0}, {'id': 0}, {'id': 0}]
-
         Time complexity: O(dataset size / parallelism)

         Args:
@@ -741,6 +744,11 @@ def add_column(
                 column is not overwritten; instead, a ``ValueError`` is raised.
             fn: Map function generating the column values given a batch of
                 records in the given ``batch_format``.
+            batch_format: If ``"default"`` or ``"numpy"``, batches are
+                ``Dict[str, numpy.ndarray]``. If ``"pandas"``, batches are
+                ``pandas.DataFrame``. If ``"pyarrow"``, batches are
+                ``pyarrow.Table``.
             compute: This argument is deprecated. Use ``concurrency`` argument.
             concurrency: The number of Ray workers to use concurrently. For a
                 fixed-sized worker pool of size ``n``, specify ``concurrency=n``. For
@@ -749,17 +757,69 @@ def add_column(
             ray_remote_args: Additional resource requirements to request from
                 ray (e.g., num_gpus=1 to request GPUs for the map tasks).
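+        Example (illustrative sketch; assumes a dataset ``ds`` with an ``id``
+        column):
+
+            >>> import pyarrow.compute as pc  # doctest: +SKIP
+            >>> ds.add_column(  # doctest: +SKIP
+            ...     "id_plus_one",
+            ...     lambda batch: pc.add(batch["id"], 1),
+            ...     batch_format="pyarrow",
+            ... )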
""" + # Check that batch_format + accepted_batch_formats = ["pandas", "pyarrow", "numpy"] + if batch_format not in accepted_batch_formats: + raise ValueError( + f"batch_format argument must be on of {accepted_batch_formats}, " + f"got: {batch_format}" + ) + + def _raise_duplicate_column_error(col: str): + raise ValueError(f"Trying to add an existing column with name {col!r}") - def add_column(batch: "pandas.DataFrame") -> "pandas.DataFrame": - batch.loc[:, col] = fn(batch) - return batch + def add_column(batch: DataBatch) -> DataBatch: + column = fn(batch) + if batch_format == "pandas": + import pandas as pd + + assert isinstance(column, (pd.Series, Sequence)), ( + f"For pandas batch format, the function must return a pandas " + f"Series or sequence, got: {type(column)}" + ) + if col in batch: + _raise_duplicate_column_error(col) + batch.loc[:, col] = column + return batch + elif batch_format == "pyarrow": + import pyarrow as pa + + assert isinstance(column, (pa.Array, pa.ChunkedArray)), ( + f"For pyarrow batch format, the function must return a pyarrow " + f"Array, got: {type(column)}" + ) + # Historically, this method was written for pandas batch format. + # To resolve https://github.com/ray-project/ray/issues/48090, + # we also allow pyarrow batch format which is preferred but would be + # a breaking change to enforce. + + # For pyarrow, the index of the column will be -1 if it is missing in + # which case we'll want to append it + column_idx = batch.schema.get_field_index(col) + if column_idx == -1: + # Append the column to the table + return batch.append_column(col, column) + else: + _raise_duplicate_column_error(col) + + else: + # batch format is assumed to be numpy since we checked at the + # beginning of the add_column function + assert isinstance(column, np.ndarray), ( + f"For numpy batch format, the function must return a " + f"numpy.ndarray, got: {type(column)}" + ) + if col in batch: + _raise_duplicate_column_error(col) + batch[col] = column + return batch if not callable(fn): raise ValueError("`fn` must be callable, got {}".format(fn)) return self.map_batches( add_column, - batch_format="pandas", # TODO(ekl) we should make this configurable. + batch_format=batch_format, compute=compute, concurrency=concurrency, zero_copy_batch=False, @@ -801,7 +861,7 @@ def drop_columns( Args: cols: Names of the columns to drop. If any name does not exist, - an exception is raised. + an exception is raised. Column names must be unique. compute: This argument is deprecated. Use ``concurrency`` argument. concurrency: The number of Ray workers to use concurrently. For a fixed-sized worker pool of size ``n``, specify ``concurrency=n``. For an autoscaling @@ -810,12 +870,15 @@ def drop_columns( ray (e.g., num_gpus=1 to request GPUs for the map tasks). """ # noqa: E501 + if len(cols) != len(set(cols)): + raise ValueError(f"drop_columns expects unique column names, got: {cols}") + def drop_columns(batch): - return batch.drop(columns=cols) + return batch.drop(cols) return self.map_batches( drop_columns, - batch_format="pandas", + batch_format="pyarrow", zero_copy_batch=True, compute=compute, concurrency=concurrency, @@ -4316,7 +4379,8 @@ def to_tf( If your model accepts additional metadata aside from features and label, specify a single additional column or a list of additional columns. A common use case is to include sample weights in the data samples and train a ``tf.keras.Model`` with ``tf.keras.Model.fit``. 
- >>> ds = ds.add_column("sample weights", lambda df: 1) + >>> import pandas as pd + >>> ds = ds.add_column("sample weights", lambda df: pd.Series([1] * len(df))) >>> ds.to_tf(feature_columns="features", label_columns="target", additional_columns="sample weights") <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.int64, name='sample weights'))> diff --git a/python/ray/data/examples/data/hudi-tables/0.x_cow_partitioned.zip b/python/ray/data/examples/data/hudi-tables/0.x_cow_partitioned.zip new file mode 100644 index 000000000000..9f78c06de945 Binary files /dev/null and b/python/ray/data/examples/data/hudi-tables/0.x_cow_partitioned.zip differ diff --git a/python/ray/data/extensions/__init__.py b/python/ray/data/extensions/__init__.py index bebf3c2b2a5c..517b4fe7a3a2 100644 --- a/python/ray/data/extensions/__init__.py +++ b/python/ray/data/extensions/__init__.py @@ -8,7 +8,7 @@ ArrowPythonObjectType, PythonObjectArray, PythonObjectDtype, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.data.extensions.tensor_extension import ( ArrowConversionError, @@ -40,6 +40,6 @@ "ArrowPythonObjectScalar", "PythonObjectArray", "PythonObjectDtype", - "object_extension_type_allowed", + "_object_extension_type_allowed", "get_arrow_extension_tensor_types", ] diff --git a/python/ray/data/extensions/object_extension.py b/python/ray/data/extensions/object_extension.py index a5daf4811a05..42ab20a231c6 100644 --- a/python/ray/data/extensions/object_extension.py +++ b/python/ray/data/extensions/object_extension.py @@ -2,7 +2,7 @@ ArrowPythonObjectArray, ArrowPythonObjectScalar, ArrowPythonObjectType, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.air.util.object_extensions.pandas import ( # noqa: F401 PythonObjectArray, diff --git a/python/ray/data/grouped_data.py b/python/ray/data/grouped_data.py index 8f7b7dde118d..427ea18b7bbf 100644 --- a/python/ray/data/grouped_data.py +++ b/python/ray/data/grouped_data.py @@ -1,3 +1,4 @@ +from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from ray.data._internal.aggregate import Count, Max, Mean, Min, Std, Sum @@ -261,7 +262,10 @@ def wrapped_fn(batch, *args, **kwargs): # Change the name of the wrapped function so that users see the name of their # function rather than `wrapped_fn` in the progress bar. - wrapped_fn.__name__ = fn.__name__ + if isinstance(fn, partial): + wrapped_fn.__name__ = fn.func.__name__ + else: + wrapped_fn.__name__ = fn.__name__ # Note we set batch_size=None here, so it will use the entire block as a batch, # which ensures that each group will be contained within a batch in entirety. diff --git a/python/ray/data/iterator.py b/python/ray/data/iterator.py index 58e9a1b7355e..2f19111af80f 100644 --- a/python/ray/data/iterator.py +++ b/python/ray/data/iterator.py @@ -734,7 +734,8 @@ def to_tf( If your model accepts additional metadata aside from features and label, specify a single additional column or a list of additional columns. A common use case is to include sample weights in the data samples and train a ``tf.keras.Model`` with ``tf.keras.Model.fit``. 
- >>> ds = ds.add_column("sample weights", lambda df: 1) + >>> import pandas as pd + >>> ds = ds.add_column("sample weights", lambda df: pd.Series([1] * len(df))) >>> it = ds.iterator() >>> it.to_tf(feature_columns="sepal length (cm)", label_columns="target", additional_columns="sample weights") <_OptionsDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.float64, name='sepal length (cm)'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'), TensorSpec(shape=(None,), dtype=tf.int64, name='sample weights'))> diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 60eee8571c1d..d60a89858512 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -27,6 +27,7 @@ from ray.data._internal.datasource.delta_sharing_datasource import ( DeltaSharingDatasource, ) +from ray.data._internal.datasource.hudi_datasource import HudiDatasource from ray.data._internal.datasource.iceberg_datasource import IcebergDatasource from ray.data._internal.datasource.image_datasource import ( ImageDatasource, @@ -2312,6 +2313,58 @@ def get_dbutils(): ) +@PublicAPI(stability="alpha") +def read_hudi( + table_uri: str, + *, + storage_options: Optional[Dict[str, str]] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + concurrency: Optional[int] = None, + override_num_blocks: Optional[int] = None, +) -> Dataset: + """ + Create a :class:`~ray.data.Dataset` from an + `Apache Hudi table `_. + + Examples: + >>> import ray + >>> ds = ray.data.read_hudi( # doctest: +SKIP + ... table_uri="/hudi/trips", + ... ) + + Args: + table_uri: The URI of the Hudi table to read from. Local file paths, S3, and GCS + are supported. + storage_options: Extra options that make sense for a particular storage + connection. This is used to store connection parameters like credentials, + endpoint, etc. See more explanation + `here `_. + ray_remote_args: kwargs passed to :meth:`~ray.remote` in the read tasks. + concurrency: The maximum number of Ray tasks to run concurrently. Set this + to control number of tasks to run concurrently. This doesn't change the + total number of tasks run or the total number of output blocks. By default, + concurrency is dynamically decided based on the available resources. + override_num_blocks: Override the number of output blocks from all read tasks. + By default, the number of output blocks is dynamically decided based on + input data size and available resources. You shouldn't manually set this + value in most cases. + + Returns: + A :class:`~ray.data.Dataset` producing records read from the Hudi table. 
+ """ # noqa: E501 + datasource = HudiDatasource( + table_uri=table_uri, + storage_options=storage_options, + ) + + return read_datasource( + datasource=datasource, + ray_remote_args=ray_remote_args, + concurrency=concurrency, + override_num_blocks=override_num_blocks, + ) + + @PublicAPI def from_dask(df: "dask.dataframe.DataFrame") -> MaterializedDataset: """Create a :class:`~ray.data.Dataset` from a diff --git a/python/ray/data/tests/preprocessors/test_encoder.py b/python/ray/data/tests/preprocessors/test_encoder.py index 46b719ba6e83..bfae00596439 100644 --- a/python/ray/data/tests/preprocessors/test_encoder.py +++ b/python/ray/data/tests/preprocessors/test_encoder.py @@ -298,7 +298,7 @@ def test_one_hot_encoder_with_max_categories(): expected_df = pd.DataFrame( { "A": col_a, - "B": [[0, 1], [1, 0], [0, 0], [1, 0]], + "B": [[0, 0], [1, 0], [0, 1], [1, 0]], "C": [[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]], } ) diff --git a/python/ray/data/tests/test_all_to_all.py b/python/ray/data/tests/test_all_to_all.py index cf0cb8b2b2e7..a6b173383145 100644 --- a/python/ray/data/tests/test_all_to_all.py +++ b/python/ray/data/tests/test_all_to_all.py @@ -1167,7 +1167,6 @@ def test_groupby_map_groups_multicolumn( ray_start_regular_shared, ds_format, num_parts, use_push_based_shuffle ): # Test built-in count aggregation - print(f"Seeding RNG for test_groupby_arrow_count with: {RANDOM_SEED}") random.seed(RANDOM_SEED) xs = list(range(100)) random.shuffle(xs) @@ -1190,6 +1189,33 @@ def test_groupby_map_groups_multicolumn( ] +def test_groupby_map_groups_with_partial(): + """ + The partial function name should show up as + +- Sort + +- MapBatches(func) + """ + from functools import partial + + def func(x, y): + return {f"x_add_{y}": [len(x["id"]) + y]} + + df = pd.DataFrame({"id": list(range(100))}) + df["key"] = df["id"] % 5 + + ds = ray.data.from_pandas(df).groupby("key").map_groups(partial(func, y=5)) + result = ds.take_all() + + assert result == [ + {"x_add_5": 25}, + {"x_add_5": 25}, + {"x_add_5": 25}, + {"x_add_5": 25}, + {"x_add_5": 25}, + ] + assert "MapBatches(func)" in ds.__repr__() + + def test_random_block_order_schema(ray_start_regular_shared): df = pd.DataFrame({"a": np.random.rand(10), "b": np.random.rand(10)}) ds = ray.data.from_pandas(df).randomize_block_order() diff --git a/python/ray/data/tests/test_arrow_block.py b/python/ray/data/tests/test_arrow_block.py index 43888ace8c57..00ed13363f20 100644 --- a/python/ray/data/tests/test_arrow_block.py +++ b/python/ray/data/tests/test_arrow_block.py @@ -1,13 +1,264 @@ +import gc +import os +import sys import types +from tempfile import TemporaryDirectory +from typing import Union import numpy as np import pyarrow as pa import pytest +from pyarrow import parquet as pq import ray from ray._private.test_utils import run_string_as_driver +from ray.air.util.tensor_extensions.arrow import ArrowTensorArray +from ray.data import DataContext from ray.data._internal.arrow_block import ArrowBlockAccessor -from ray.data.extensions.object_extension import object_extension_type_allowed +from ray.data._internal.arrow_ops.transform_pyarrow import combine_chunked_array +from ray.data._internal.util import GiB, MiB +from ray.data.extensions.object_extension import _object_extension_type_allowed + + +@pytest.fixture(scope="module") +def parquet_dataset_single_column_gt_2gb(): + chunk_size = 256 * MiB + num_chunks = 10 + + total_column_size = chunk_size * 10 # ~2.5 GiB + + with TemporaryDirectory() as tmp_dir: + dataset_path = 
f"{tmp_dir}/large_parquet_chunk_{chunk_size}" + + # Create directory + os.mkdir(dataset_path) + + for i in range(num_chunks): + chunk = b"a" * chunk_size + + d = {"id": [i], "bin": [chunk]} + t = pa.Table.from_pydict(d) + + print(f">>> Table schema: {t.schema} (size={sys.getsizeof(t)})") + + filepath = f"{dataset_path}/chunk_{i}.parquet" + pq.write_table(t, filepath) + + print(f">>> Created a chunk #{i}") + + print(f">>> Created dataset at {dataset_path}") + + yield dataset_path, num_chunks, total_column_size + + print(f">>> Cleaning up dataset at {dataset_path}") + + +@pytest.fixture(scope="module") +def binary_dataset_single_file_gt_2gb(): + total_size = int(2.1 * GiB) + chunk_size = 256 * MiB + num_chunks = total_size // chunk_size + remainder = total_size % chunk_size + + with TemporaryDirectory() as tmp_dir: + dataset_path = f"{tmp_dir}/binary_dataset_gt_2gb_single_file" + + # Create directory + os.mkdir(dataset_path) + + with open(f"{dataset_path}/chunk.bin", "wb") as f: + for i in range(num_chunks): + f.write(b"a" * chunk_size) + + print(f">>> Written chunk #{i}") + + if remainder: + f.write(b"a" * remainder) + + print(f">>> Wrote chunked dataset at: {dataset_path}") + + yield dataset_path, total_size + + print(f">>> Cleaning up dataset: {dataset_path}") + + +@pytest.mark.parametrize( + "col_name", + [ + "bytes", + # TODO fix numpy conversion + # "text", + ], +) +def test_single_row_gt_2gb( + ray_start_regular, + restore_data_context, + binary_dataset_single_file_gt_2gb, + col_name, +): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + + dataset_path, target_binary_size = binary_dataset_single_file_gt_2gb + + def _id(row): + bs = row[col_name] + assert round(len(bs) / GiB, 1) == round(target_binary_size / GiB, 1) + return row + + if col_name == "text": + ds = ray.data.read_text(dataset_path) + elif col_name == "bytes": + ds = ray.data.read_binary_files(dataset_path) + + total = ds.map(_id).count() + + assert total == 1 + + +@pytest.mark.parametrize( + "op", + [ + "map", + "map_batches", + ], +) +def test_arrow_batch_gt_2gb( + ray_start_regular, + parquet_dataset_single_column_gt_2gb, + restore_data_context, + op, +): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + + dataset_path, num_rows, total_column_size = parquet_dataset_single_column_gt_2gb + + def _id(x): + return x + + ds = ray.data.read_parquet(dataset_path) + + if op == "map": + ds = ds.map(_id) + elif op == "map_batches": + # Combine all rows into a single batch using `map_batches` coercing to + # numpy format + ds = ds.map_batches( + _id, + batch_format="numpy", + batch_size=num_rows, + zero_copy_batch=False, + ) + + batch = ds.take_batch() + + total_binary_column_size = sum([len(b) for b in batch["bin"]]) + + print( + f">>> Batch:\n" + f"------\n" + "Column: 'id'\n" + f"Values: {batch['id']}\n" + f"------\n" + "Column: 'bin'\n" + f"Total: {total_binary_column_size / GiB} GiB\n" + f"Values: {[str(v)[:3] + ' x ' + str(len(v)) for v in batch['bin']]}\n" + ) + + assert total_binary_column_size == total_column_size + + # Clean up refs + del batch + del ds + # Force GC to free up object store memory + gc.collect() + + +@pytest.mark.parametrize( + "input_,expected_output", + [ + # Empty chunked array + (pa.chunked_array([], type=pa.int8()), pa.array([], type=pa.int8())), + # Fixed-shape tensors + ( + pa.chunked_array( + [ 
+                    ArrowTensorArray.from_numpy(np.arange(3).reshape(3, 1)),
+                    ArrowTensorArray.from_numpy(np.arange(3).reshape(3, 1)),
+                ]
+            ),
+            ArrowTensorArray.from_numpy(
+                np.concatenate(
+                    [
+                        np.arange(3).reshape(3, 1),
+                        np.arange(3).reshape(3, 1),
+                    ]
+                )
+            ),
+        ),
+        # Ragged (variable-shaped) tensors
+        (
+            pa.chunked_array(
+                [
+                    ArrowTensorArray.from_numpy(np.arange(3).reshape(3, 1)),
+                    ArrowTensorArray.from_numpy(np.arange(5).reshape(5, 1)),
+                ]
+            ),
+            ArrowTensorArray.from_numpy(
+                np.concatenate(
+                    [
+                        np.arange(3).reshape(3, 1),
+                        np.arange(5).reshape(5, 1),
+                    ]
+                )
+            ),
+        ),
+        # Small (< 2 GiB) arrays
+        (
+            pa.chunked_array(
+                [
+                    pa.array([1, 2, 3], type=pa.int16()),
+                    pa.array([4, 5, 6], type=pa.int16()),
+                ]
+            ),
+            pa.array([1, 2, 3, 4, 5, 6], type=pa.int16()),
+        ),
+    ],
+)
+def test_combine_chunked_array_small(
+    input_, expected_output: Union[pa.Array, pa.ChunkedArray]
+):
+    result = combine_chunked_array(input_)
+
+    assert expected_output.equals(result)
+
+
+def test_combine_chunked_array_large():
+    """Verifies `combine_chunked_array` on arrays > 2 GiB"""
+
+    # ~137.5 MiB (144.2 MB) per chunk
+    ones_1gb = np.ones(shape=(550, 128, 128, 4), dtype=np.int32()).ravel()
+
+    # Total ~2.15 GiB
+    input_ = pa.chunked_array(
+        [
+            pa.array(ones_1gb),
+        ]
+        * 16
+    )
+
+    assert round(input_.nbytes / GiB, 2) == 2.15
+
+    result = combine_chunked_array(input_)
+
+    assert isinstance(result, pa.ChunkedArray)
+    assert len(result.chunks) == 2
+
+    # Should re-combine the first 14 provided chunks into one chunk
+    assert result.chunks[0].nbytes == sum([c.nbytes for c in input_.chunks[:14]])
+    # The remaining 2 chunks go into the second one
+    assert result.chunks[1].nbytes == sum([c.nbytes for c in input_.chunks[14:]])


 def test_append_column(ray_start_regular_shared):
@@ -46,7 +297,7 @@ def test_register_arrow_types(tmp_path):


 @pytest.mark.skipif(
-    not object_extension_type_allowed(), reason="Object extension type not supported."
+    not _object_extension_type_allowed(), reason="Object extension type not supported."
 )
 def test_dict_doesnt_fallback_to_pandas_block(ray_start_regular_shared):
     # If the UDF returns a column with dict, previously, we would
@@ -81,6 +332,4 @@ def fn2(batch):

 if __name__ == "__main__":
-    import sys
-
     sys.exit(pytest.main(["-v", __file__]))
diff --git a/python/ray/data/tests/test_arrow_serialization.py b/python/ray/data/tests/test_arrow_serialization.py
index b3e8b10c19b1..232ed32cc749 100644
--- a/python/ray/data/tests/test_arrow_serialization.py
+++ b/python/ray/data/tests/test_arrow_serialization.py
@@ -26,7 +26,7 @@
 from ray._private.utils import _get_pyarrow_version
 from ray.data.extensions.object_extension import (
     ArrowPythonObjectArray,
-    object_extension_type_allowed,
+    _object_extension_type_allowed,
 )
 from ray.data.extensions.tensor_extension import (
     ArrowTensorArray,
@@ -423,7 +423,7 @@ def pickled_objects_array():
     (lazy_fixture("complex_nested_array"), 0.1),
 ]

-if object_extension_type_allowed():
+if _object_extension_type_allowed():
     pytest_custom_serialization_arrays.append(
         # Array of pickled objects
         (lazy_fixture("pickled_objects_array"), 0.1),
@@ -550,7 +550,7 @@ def fn(batch: list):


 @pytest.mark.skipif(
-    not object_extension_type_allowed(), reason="Object extension not supported."
+    not _object_extension_type_allowed(), reason="Object extension not supported."
) def test_arrow_object_and_array_support(ray_start_regular_shared): obj = types.SimpleNamespace(some_attribute="test") diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index cda00239331f..398831aaea9c 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -10,7 +10,7 @@ import ray from ray.data import Dataset -from ray.data._internal.arrow_block import ArrowBlockAccessor +from ray.data._internal.arrow_block import ArrowBlockBuilder from ray.data._internal.datasource.csv_datasource import CSVDatasource from ray.data.block import BlockMetadata from ray.data.datasource import Datasource @@ -68,7 +68,7 @@ def _blocks_generator(): (self.num_rows_per_batch, self.row_size), dtype=np.uint8 ) } - block = ArrowBlockAccessor.numpy_to_block(batch) + block = ArrowBlockBuilder._table_from_pydict(batch) yield block else: yield pd.DataFrame( diff --git a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py index d657ce1c9d98..af7af855b187 100644 --- a/python/ray/data/tests/test_execution_optimizer.py +++ b/python/ray/data/tests/test_execution_optimizer.py @@ -1145,9 +1145,7 @@ def test_sort_validate_keys(ray_start_regular_shared): assert extract_values("id", ds.sort("id").take_all()) == list(range(10)) invalid_col_name = "invalid_column" - with pytest.raises( - ValueError, match=f"The column '{invalid_col_name}' does not exist" - ): + with pytest.raises(ValueError, match="there's no such column in the dataset"): ds.sort(invalid_col_name).take_all() ds_named = ray.data.from_items( @@ -1165,10 +1163,7 @@ def test_sort_validate_keys(ray_start_regular_shared): assert [d["col1"] for d in r1] == [7, 5, 3, 1] assert [d["col2"] for d in r2] == [8, 6, 4, 2] - with pytest.raises( - ValueError, - match=f"The column '{invalid_col_name}' does not exist in the schema", - ): + with pytest.raises(ValueError, match="there's no such column in the dataset"): ds_named.sort(invalid_col_name).take_all() @@ -1279,9 +1274,7 @@ def test_aggregate_e2e(ray_start_regular_shared, use_push_based_shuffle): def test_aggregate_validate_keys(ray_start_regular_shared): ds = ray.data.range(10) invalid_col_name = "invalid_column" - with pytest.raises( - ValueError, match=f"The column '{invalid_col_name}' does not exist" - ): + with pytest.raises(ValueError): ds.groupby(invalid_col_name).count() ds_named = ray.data.from_items( @@ -1308,7 +1301,7 @@ def test_aggregate_validate_keys(ray_start_regular_shared): with pytest.raises( ValueError, - match=f"The column '{invalid_col_name}' does not exist in the schema", + match="there's no such column in the dataset", ): ds_named.groupby(invalid_col_name).count() diff --git a/python/ray/data/tests/test_hudi.py b/python/ray/data/tests/test_hudi.py new file mode 100644 index 000000000000..af8035cc315f --- /dev/null +++ b/python/ray/data/tests/test_hudi.py @@ -0,0 +1,114 @@ +import os +import zipfile + +import pytest +from packaging.version import parse as parse_version +from pytest_lazyfixture import lazy_fixture + +import ray +from ray._private.utils import _get_pyarrow_version +from ray.data.datasource.path_util import ( + _resolve_paths_and_filesystem, + _unwrap_protocol, +) +from ray.data.tests.conftest import * # noqa +from ray.data.tests.mock_http_server import * # noqa +from ray.tests.conftest import * # noqa + +MIN_PYARROW_VERSION_FOR_HUDI = parse_version("11.0.0") +_VER = _get_pyarrow_version() +PYARROW_VERSION = 
parse_version(_VER) if _VER else None +PYARROW_VERSION_MEETS_REQUIREMENT = ( + PYARROW_VERSION and PYARROW_VERSION >= MIN_PYARROW_VERSION_FOR_HUDI +) +PYARROW_HUDI_TEST_SKIP_REASON = ( + f"Hudi only supported if pyarrow >= {MIN_PYARROW_VERSION_FOR_HUDI}" +) + + +def _extract_testing_table(fixture_path: str, table_dir: str, target_dir: str) -> str: + with zipfile.ZipFile(fixture_path, "r") as zip_ref: + zip_ref.extractall(target_dir) + return os.path.join(target_dir, table_dir) + + +@pytest.mark.skipif( + not PYARROW_VERSION_MEETS_REQUIREMENT, + reason=PYARROW_HUDI_TEST_SKIP_REASON, +) +@pytest.mark.parametrize( + "fs,data_path", + [ + (None, lazy_fixture("local_path")), + (lazy_fixture("local_fs"), lazy_fixture("local_path")), + ], +) +def test_read_hudi_simple_cow_table(ray_start_regular_shared, fs, data_path): + setup_data_path = _unwrap_protocol(data_path) + target_testing_dir = os.path.join(setup_data_path, "test_hudi") + fixture_path, _ = _resolve_paths_and_filesystem( + "example://hudi-tables/0.x_cow_partitioned.zip", fs + ) + target_table_path = _extract_testing_table( + fixture_path[0], "trips_table", target_testing_dir + ) + + ds = ray.data.read_hudi(target_table_path) + + assert ds.schema().names == [ + "_hoodie_commit_time", + "_hoodie_commit_seqno", + "_hoodie_record_key", + "_hoodie_partition_path", + "_hoodie_file_name", + "ts", + "uuid", + "rider", + "driver", + "fare", + "city", + ] + assert ds.count() == 5 + rows = ( + ds.select_columns(["_hoodie_commit_time", "ts", "uuid", "fare"]) + .sort("fare") + .take_all() + ) + assert rows == [ + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695115999911, + "uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa", + "fare": 17.85, + }, + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695159649087, + "uuid": "334e26e9-8355-45cc-97c6-c31daf0df330", + "fare": 19.1, + }, + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695091554788, + "uuid": "e96c4396-3fad-413a-a942-4cb36106d721", + "fare": 27.7, + }, + { + "_hoodie_commit_time": "20240402123035233", + "ts": 1695516137016, + "uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c", + "fare": 34.15, + }, + { + "_hoodie_commit_time": "20240402144910683", + "ts": 1695046462179, + "uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00", + "fare": 339.0, + }, + ] + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_map.py b/python/ray/data/tests/test_map.py index 9b1a4f8d4575..41100f4b8a2c 100644 --- a/python/ray/data/tests/test_map.py +++ b/python/ray/data/tests/test_map.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import pyarrow as pa +import pyarrow.compute as pc import pyarrow.parquet as pq import pytest @@ -330,18 +331,101 @@ def map_generator(item: dict) -> Iterator[int]: def test_add_column(ray_start_regular_shared): - ds = ray.data.range(5).add_column("foo", lambda x: 1) + """Tests the add column API.""" + + # Test with pyarrow batch format + ds = ray.data.range(5).add_column( + "foo", lambda x: pa.array([1] * x.num_rows), batch_format="pyarrow" + ) + assert ds.take(1) == [{"id": 0, "foo": 1}] + + # Test with chunked array batch format + ds = ray.data.range(5).add_column( + "foo", lambda x: pa.chunked_array([[1] * x.num_rows]), batch_format="pyarrow" + ) + assert ds.take(1) == [{"id": 0, "foo": 1}] + + ds = ray.data.range(5).add_column( + "foo", lambda x: pc.add(x["id"], 1), batch_format="pyarrow" + ) + assert ds.take(1) == [{"id": 0, "foo": 1}] + + # Adding a column that is already there 
should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException,
+        match="Trying to add an existing column with name 'id'",
+    ):
+        ds = ray.data.range(5).add_column(
+            "id", lambda x: pc.add(x["id"], 1), batch_format="pyarrow"
+        )
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Adding a column in the wrong format should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException, match="For pyarrow batch format"
+    ):
+        ds = ray.data.range(5).add_column("id", lambda x: [1], batch_format="pyarrow")
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Test with numpy batch format
+    ds = ray.data.range(5).add_column(
+        "foo", lambda x: np.array([1] * len(list(x.values())[0])), batch_format="numpy"
+    )
+    assert ds.take(1) == [{"id": 0, "foo": 1}]
+
+    ds = ray.data.range(5).add_column(
+        "foo", lambda x: np.add(x["id"], 1), batch_format="numpy"
+    )
+    assert ds.take(1) == [{"id": 0, "foo": 1}]
+
+    # Adding a column that is already there should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException,
+        match="Trying to add an existing column with name 'id'",
+    ):
+        ds = ray.data.range(5).add_column(
+            "id", lambda x: np.add(x["id"], 1), batch_format="numpy"
+        )
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Adding a column in the wrong format should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException, match="For numpy batch format"
+    ):
+        ds = ray.data.range(5).add_column("id", lambda x: [1], batch_format="numpy")
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Test with pandas batch format
+    ds = ray.data.range(5).add_column("foo", lambda x: pd.Series([1] * x.shape[0]))
     assert ds.take(1) == [{"id": 0, "foo": 1}]

     ds = ray.data.range(5).add_column("foo", lambda x: x["id"] + 1)
     assert ds.take(1) == [{"id": 0, "foo": 1}]

-    ds = ray.data.range(5).add_column("id", lambda x: x["id"] + 1)
-    assert ds.take(2) == [{"id": 1}, {"id": 2}]
+    # Adding a column that is already there should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException,
+        match="Trying to add an existing column with name 'id'",
+    ):
+        ds = ray.data.range(5).add_column("id", lambda x: x["id"] + 1)
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]
+
+    # Adding a column in the wrong format should result in an error
+    with pytest.raises(
+        ray.exceptions.UserCodeException, match="For pandas batch format"
+    ):
+        ds = ray.data.range(5).add_column(
+            "id", lambda x: np.array([1]), batch_format="pandas"
+        )
+        assert ds.take(2) == [{"id": 1}, {"id": 2}]

     with pytest.raises(ValueError):
         ds = ray.data.range(5).add_column("id", 0)

+    # Test that an invalid batch_format raises an error
+    with pytest.raises(ValueError):
+        ray.data.range(5).add_column("foo", lambda x: x["id"] + 1, batch_format="foo")
+

 @pytest.mark.parametrize("names", (["foo", "bar"], {"spam": "foo", "ham": "bar"}))
 def test_rename_columns(ray_start_regular_shared, names):
@@ -362,14 +446,15 @@ def test_drop_columns(ray_start_regular_shared, tmp_path):
     assert ds.drop_columns(["col2"]).take(1) == [{"col1": 1, "col3": 3}]
     assert ds.drop_columns(["col1", "col3"]).take(1) == [{"col2": 2}]
     assert ds.drop_columns([]).take(1) == [{"col1": 1, "col2": 2, "col3": 3}]
-    assert ds.drop_columns(["col1", "col2", "col3"]).take(1) == [{}]
-    assert ds.drop_columns(["col1", "col1", "col2", "col1"]).take(1) == [
-        {"col3": 3}
-    ]
+    assert ds.drop_columns(["col1", "col2", "col3"]).take(1) == []
+    assert ds.drop_columns(["col1", "col2"]).take(1) == [{"col3": 3}]

     # Test dropping non-existent column
     with
pytest.raises((UserCodeException, KeyError)): ds.drop_columns(["dummy_col", "col1", "col2"]).materialize() + with pytest.raises(ValueError, match="drop_columns expects unique column names"): + ds1.drop_columns(["col1", "col2", "col2"]) + def test_select_columns(ray_start_regular_shared): # Test pandas and arrow diff --git a/python/ray/data/tests/test_mongo.py b/python/ray/data/tests/test_mongo.py index 97828aae6bea..eb03aab39f80 100644 --- a/python/ray/data/tests/test_mongo.py +++ b/python/ray/data/tests/test_mongo.py @@ -93,13 +93,13 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): override_num_blocks=2, ) assert ds._block_num_rows() == [3, 2] - assert str(ds) == ( - "Dataset(\n" - " num_rows=5,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + assert ds.count() == 5 + assert ds.schema().names == ["_id", "float_field", "int_field"] + # We are not testing the datatype of _id here, because it varies per platform + assert ds.schema().types[1:] == [ + pa.float64(), + pa.int32(), + ] assert df.equals(ds.drop_columns(["_id"]).to_pandas()) # Read a subset of the collection. @@ -111,13 +111,8 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): override_num_blocks=2, ) assert ds._block_num_rows() == [2, 1] - assert str(ds) == ( - "Dataset(\n" - " num_rows=3,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + assert ds.count() == 3 + assert ds.schema().names == ["_id", "float_field", "int_field"] df[df["int_field"] < 3].equals(ds.drop_columns(["_id"]).to_pandas()) # Read with auto-tuned parallelism. @@ -126,13 +121,14 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): database=foo_db, collection=foo_collection, ) - assert str(ds) == ( - "Dataset(\n" - " num_rows=5,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + + assert ds.count() == 5 + assert ds.schema().names == ["_id", "float_field", "int_field"] + # We are not testing the datatype of _id here, because it varies per platform + assert ds.schema().types[1:] == [ + pa.float64(), + pa.int32(), + ] assert df.equals(ds.drop_columns(["_id"]).to_pandas()) # Read with a parallelism larger than number of rows. @@ -142,13 +138,14 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): collection=foo_collection, override_num_blocks=1000, ) - assert str(ds) == ( - "Dataset(\n" - " num_rows=5,\n" - " schema={_id: fixed_size_binary[12], float_field: double, " - "int_field: int32}\n" - ")" - ) + + assert ds.count() == 5 + assert ds.schema().names == ["_id", "float_field", "int_field"] + # We are not testing the datatype of _id here, because it varies per platform + assert ds.schema().types[1:] == [ + pa.float64(), + pa.int32(), + ] assert df.equals(ds.drop_columns(["_id"]).to_pandas()) # Add a column and then write back to MongoDB. 
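# Illustrative sketch (not part of the patch): the tests in the next file toggle
# the new DataContext flag from python/ray/data/context.py to exercise both
# behaviors.
import ray

ctx = ray.data.DataContext.get_current()
# With the fallback disabled, column values that Arrow can't represent natively
# raise instead of being wrapped in ArrowPythonObjectType.
ctx.enable_fallback_to_arrow_object_ext_type = False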
diff --git a/python/ray/data/tests/test_numpy_support.py b/python/ray/data/tests/test_numpy_support.py index c14038918c0a..ec67bcf689bb 100644 --- a/python/ray/data/tests/test_numpy_support.py +++ b/python/ray/data/tests/test_numpy_support.py @@ -6,6 +6,7 @@ import ray from ray.air.util.tensor_extensions.utils import create_ragged_ndarray +from ray.data import DataContext from ray.data.tests.conftest import * # noqa from ray.tests.conftest import * # noqa @@ -27,22 +28,31 @@ def assert_structure_equals(a, b): assert a.dtype == b.dtype assert a.shape == b.shape for i in range(len(a)): - assert np.array_equiv(a[i], b[i]), (i, a, b) + assert np.array_equal(a[i], b[i]), (i, a[i], b[i]) -def test_list_of_scalars(ray_start_regular_shared): +def test_list_of_scalars(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [1, 2, 3] output = do_map_batches(data) assert_structure_equals(output, np.array([1, 2, 3], dtype=np.int64)) -def test_list_of_numpy_scalars(ray_start_regular_shared): +def test_list_of_numpy_scalars(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [np.int64(1), np.int64(2), np.int64(3)] output = do_map_batches(data) assert_structure_equals(output, np.array([1, 2, 3], dtype=np.int64)) -def test_list_of_objects(ray_start_regular_shared): +def test_list_of_objects(ray_start_regular_shared, restore_data_context): + # NOTE: Fallback is enabled by default, this is purely for notational purposes + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = True + data = [1, 2, 3, UserObj()] output = do_map_batches(data) assert_structure_equals(output, np.array([1, 2, 3, UserObj()])) @@ -88,34 +98,51 @@ def test_list_of_objects(ray_start_regular_shared): ), ], ) -def test_list_of_datetimes(data, expected_output, ray_start_regular_shared): +def test_list_of_datetimes( + data, expected_output, ray_start_regular_shared, restore_data_context +): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + output = do_map_batches(data) assert_structure_equals(output, expected_output) -def test_array_like(ray_start_regular_shared): +def test_array_like(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = torch.Tensor([1, 2, 3]) output = do_map_batches(data) assert_structure_equals(output, np.array([1.0, 2.0, 3.0], dtype=np.float32)) -def test_list_of_arrays(ray_start_regular_shared): +def test_list_of_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [np.array([1, 2, 3]), np.array([4, 5, 6])] output = do_map_batches(data) assert_structure_equals(output, np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)) -def test_list_of_array_like(ray_start_regular_shared): +def test_list_of_array_like(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + 
DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [torch.Tensor([1, 2, 3]), torch.Tensor([4, 5, 6])] output = do_map_batches(data) assert_structure_equals(output, np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)) -def test_ragged_array_like(ray_start_regular_shared): +def test_ragged_tensors_map_batches(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2])] output = do_map_batches(data) assert_structure_equals( - output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) + output, create_ragged_ndarray([np.array([1, 2, 3]), np.array([1, 2])]) ) data = [torch.zeros((3, 5, 10)), torch.zeros((3, 8, 8))] @@ -125,23 +152,42 @@ def test_ragged_array_like(ray_start_regular_shared): ) -def test_scalar_nested_arrays(ray_start_regular_shared): +def test_scalar_nested_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [[[1]], [[2]]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray(data)) + + assert_structure_equals( + output, + create_ragged_ndarray( + [np.array([1], dtype=np.object_), np.array([2], dtype=np.object_)] + ), + ) -def test_scalar_lists_not_converted(ray_start_regular_shared): +def test_scalar_lists_not_converted(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [[1, 2], [1, 2]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray([[1, 2], [1, 2]])) + assert_structure_equals( + output, create_ragged_ndarray([np.array([1, 2]), np.array([1, 2])]) + ) data = [[1, 2, 3], [1, 2]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray([[1, 2, 3], [1, 2]])) + assert_structure_equals( + output, create_ragged_ndarray([np.array([1, 2, 3]), np.array([1, 2])]) + ) -def test_scalar_numpy(ray_start_regular_shared): +def test_scalar_numpy(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = np.int64(1) ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data}) @@ -149,7 +195,10 @@ def test_scalar_numpy(ray_start_regular_shared): assert_structure_equals(output, np.array([1, 1], dtype=np.int64)) -def test_scalar_arrays(ray_start_regular_shared): +def test_scalar_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = np.array([1, 2, 3]) ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data}) @@ -157,7 +206,10 @@ def test_scalar_arrays(ray_start_regular_shared): assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.int64)) -def test_bytes(ray_start_regular_shared): +def test_bytes(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + 
DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + """Tests that bytes are converted to object dtype instead of zero-terminated.""" data = b"\x1a\n\x00\n\x1a" ds = ray.data.range(1, override_num_blocks=1) @@ -166,7 +218,10 @@ def test_bytes(ray_start_regular_shared): assert_structure_equals(output, np.array([b"\x1a\n\x00\n\x1a"], dtype=object)) -def test_scalar_array_like(ray_start_regular_shared): +def test_uniform_tensors(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = torch.Tensor([1, 2, 3]) ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data}) @@ -174,17 +229,24 @@ def test_scalar_array_like(ray_start_regular_shared): assert_structure_equals(output, np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)) -def test_scalar_ragged_arrays(ray_start_regular_shared): +def test_scalar_ragged_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [np.array([1, 2, 3]), np.array([1, 2])] ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data[x["id"]]}) output = ds.take_batch()["output"] + assert_structure_equals( output, np.array([np.array([1, 2, 3]), np.array([1, 2])], dtype=object) ) -def test_scalar_ragged_array_like(ray_start_regular_shared): +def test_ragged_tensors(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2])] ds = ray.data.range(2, override_num_blocks=1) ds = ds.map(lambda x: {"output": data[x["id"]]}) @@ -202,7 +264,10 @@ def test_scalar_ragged_array_like(ray_start_regular_shared): ) -def test_nested_ragged_arrays(ray_start_regular_shared): +def test_nested_ragged_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [ {"a": [[1], [2, 3]]}, {"a": [[4, 5], [6]]}, @@ -216,10 +281,26 @@ def f(row): # https://github.com/ray-project/ray/issues/35340 -def test_complex_ragged_arrays(ray_start_regular_shared): +def test_complex_ragged_arrays(ray_start_regular_shared, restore_data_context): + # Disable (automatic) fallback to `ArrowPythonObjectType` extension type + DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False + data = [[{"a": 1}, {"a": 2}, {"a": 3}], [{"b": 1}]] output = do_map_batches(data) - assert_structure_equals(output, create_ragged_ndarray(data)) + + # Assert resulting objects are coerced to the appropriate shape, following + # the table's schema + assert_structure_equals( output, create_ragged_ndarray( [ np.array( [{"a": 1, "b": None}, {"a": 2, "b": None}, {"a": 3, "b": None}] ), np.array([{"a": None, "b": 1}]), ] ), ) data = ["hi", 1, None, [[[[]]]], {"a": [[{"b": 2, "c": UserObj()}]]}, UserObj()] output = do_map_batches(data) diff --git a/python/ray/data/tests/test_object_gc.py b/python/ray/data/tests/test_object_gc.py index b56c4542618d..2b1947e0498d 100644 --- a/python/ray/data/tests/test_object_gc.py +++ b/python/ray/data/tests/test_object_gc.py @@ -1,6 +1,7 @@ import sys
import threading +import pandas as pd import pytest import ray @@ -107,7 +108,7 @@ def test_tf_iteration(shutdown_only): # The size of dataset is 500*(80*80*4)*8B, about 100MB. ds = ray.data.range_tensor( 500, shape=(80, 80, 4), override_num_blocks=100 - ).add_column("label", lambda x: 1) + ).add_column("label", lambda df: pd.Series([1] * len(df))) # to_tf check_to_tf_no_spill(ctx, ds.map(lambda x: x)) diff --git a/python/ray/data/tests/test_pandas_block.py b/python/ray/data/tests/test_pandas_block.py index 725d2c8ce1bb..4585d0e2a133 100644 --- a/python/ray/data/tests/test_pandas_block.py +++ b/python/ray/data/tests/test_pandas_block.py @@ -4,7 +4,7 @@ import ray import ray.data from ray.data._internal.pandas_block import PandasBlockAccessor -from ray.data.extensions.object_extension import object_extension_type_allowed +from ray.data.extensions.object_extension import _object_extension_type_allowed def test_append_column(ray_start_regular_shared): @@ -20,7 +20,7 @@ def test_append_column(ray_start_regular_shared): @pytest.mark.skipif( - object_extension_type_allowed(), reason="Objects can be put into Arrow" + _object_extension_type_allowed(), reason="Objects can be put into Arrow" ) def test_dict_fallback_to_pandas_block(ray_start_regular_shared): # If the UDF returns a column with dict, this throws diff --git a/python/ray/data/tests/test_stats.py b/python/ray/data/tests/test_stats.py index d8d85515092c..a41e060fb3f5 100644 --- a/python/ray/data/tests/test_stats.py +++ b/python/ray/data/tests/test_stats.py @@ -1648,6 +1648,7 @@ def test_stats_actor_datasets(ray_start_cluster): assert "Input0" in operators assert "ReadRange->MapBatches()1" in operators for value in operators.values(): + assert value["name"] in ["Input", "ReadRange->MapBatches()"] assert value["progress"] == 20 assert value["total"] == 20 assert value["state"] == "FINISHED" @@ -1663,8 +1664,9 @@ def test_stats_manager(shutdown_only): datasets = [None] * num_threads # Mock clear methods so that _last_execution_stats and _last_iteration_stats # are not cleared. We will assert on them afterwards. 
- with patch.object(StatsManager, "clear_execution_metrics"), patch.object( - StatsManager, "clear_iteration_metrics" + with ( + patch.object(StatsManager, "clear_last_execution_stats"), + patch.object(StatsManager, "clear_iteration_metrics"), ): def update_stats_manager(i): @@ -1689,9 +1691,7 @@ def update_stats_manager(i): dataset_tag = create_dataset_tag(dataset._name, dataset._uuid) assert dataset_tag in StatsManager._last_execution_stats assert dataset_tag in StatsManager._last_iteration_stats - StatsManager.clear_execution_metrics( - dataset_tag, ["Input0", "ReadRange->MapBatches()1"] - ) + StatsManager.clear_last_execution_stats(dataset_tag) StatsManager.clear_iteration_metrics(dataset_tag) wait_for_condition(lambda: not StatsManager._update_thread.is_alive()) diff --git a/python/ray/data/tests/test_strict_mode.py b/python/ray/data/tests/test_strict_mode.py index 49b4b9cc4e37..e34c2f428329 100644 --- a/python/ray/data/tests/test_strict_mode.py +++ b/python/ray/data/tests/test_strict_mode.py @@ -181,7 +181,7 @@ def test_strict_schema(ray_start_regular_shared): from ray.data._internal.pandas_block import PandasBlockSchema from ray.data.extensions.object_extension import ( ArrowPythonObjectType, - object_extension_type_allowed, + _object_extension_type_allowed, ) from ray.data.extensions.tensor_extension import ArrowTensorType @@ -199,7 +199,7 @@ def test_strict_schema(ray_start_regular_shared): ds = ray.data.from_items([{"x": 2, "y": object(), "z": [1, 2]}]) schema = ds.schema() - if object_extension_type_allowed(): + if _object_extension_type_allowed(): assert isinstance(schema.base_schema, pa.lib.Schema) assert schema.names == ["x", "y", "z"] assert schema.types == [ diff --git a/python/ray/data/tests/test_transform_pyarrow.py b/python/ray/data/tests/test_transform_pyarrow.py index 570bd8f6592b..a221bd6c7683 100644 --- a/python/ray/data/tests/test_transform_pyarrow.py +++ b/python/ray/data/tests/test_transform_pyarrow.py @@ -18,7 +18,7 @@ ArrowTensorArray, ArrowTensorType, ArrowVariableShapedTensorType, - object_extension_type_allowed, + _object_extension_type_allowed, ) @@ -199,7 +199,7 @@ def test_arrow_concat_tensor_extension_uniform_but_different(): @pytest.mark.skipif( - not object_extension_type_allowed(), reason="Object extension type not supported." + not _object_extension_type_allowed(), reason="Object extension type not supported." 
) def test_arrow_concat_with_objects(): obj = types.SimpleNamespace(a=1, b="test") @@ -458,9 +458,10 @@ def map(x): assert op == "map_batches" def map_batches(x): + row_id = x["id"][0] return { "id": x["id"], - "my_data": data[x["id"][0]], + "my_data": [data[row_id]], } ds = ds.map_batches(map_batches, batch_size=None) @@ -472,14 +473,14 @@ def map_batches(x): @pytest.mark.skipif( - object_extension_type_allowed(), reason="Arrow table supports pickled objects" + _object_extension_type_allowed(), reason="Arrow table supports pickled objects" ) @pytest.mark.parametrize( "op, data", [ ("map", [UnsupportedType(), 1]), - ("map_batches", [[None], [1]]), - ("map_batches", [[{"a": 1}], [{"a": 2}]]), + ("map_batches", [None, 1]), + ("map_batches", [{"a": 1}, {"a": 2}]), ], ) def test_fallback_to_pandas_on_incompatible_data( @@ -497,34 +498,59 @@ def test_fallback_to_pandas_on_incompatible_data( @pytest.mark.parametrize( - "op, data", + "op, data, should_fail, expected_type", [ - ("map", [1, 2**100]), - ("map_batches", [[1.0], [2**4]]), + # Case A: Upon serializing to Arrow, falls back to `ArrowPythonObjectType` + ("map_batches", [1, 2**100], False, ArrowPythonObjectType()), + ("map_batches", [1.0, 2**100], False, ArrowPythonObjectType()), + ("map_batches", ["1.0", 2**100], False, ArrowPythonObjectType()), + # Case B: No fallback to `ArrowPythonObjectType`, hence Arrow enforces the + # deduced schema + ("map_batches", [1.0, 2**4], True, None), + ("map_batches", ["1.0", 2**4], True, None), ], ) -def test_pyarrow_conversion_error_detailed_info( +def test_pyarrow_conversion_error_handling( ray_start_regular_shared, op, data, + should_fail: bool, + expected_type: pa.DataType, ): # Ray Data infers the block type (arrow or pandas) and the block schema - # based on the first UDF output. - # In one of the following cases, an error will be raised: - # * The first UDF output is compatible with Arrow, but the second is not. - # * Both UDF outputs are compatible with Arrow, but the second has a different - # schema. - # Check that we'll raise an ArrowConversionError with detailed information - # about the incompatible data. + # based on the first *block* produced by the UDF. + # + # These tests simulate the following scenarios: + # 1. (Case A) The type of the first block's value is deduced as an Arrow + # scalar type, but the second block carries a value that overflows the + # pa.int64 representation, so the column is serialized as + # `ArrowPythonObjectExtensionType`, coercing the first block to it as well. + # + # 2.
(Case B) Both blocks carry proper Arrow scalars that nonetheless have + diverging types, so Arrow fails when merging these blocks + into one ds = _create_dataset(op, data) - with pytest.raises(Exception) as e: + if should_fail: + with pytest.raises(Exception) as e: + ds.materialize() + + error_msg = str(e.value) + expected_msg = "ArrowConversionError: Error converting data to Arrow:" + + assert expected_msg in error_msg + assert "my_data" in error_msg + + else: ds.materialize() - error_msg = str(e.value) - expected_msg = "ArrowConversionError: Error converting data to Arrow:" - assert expected_msg in error_msg, error_msg - assert "my_data" in error_msg, error_msg + assert ds.schema().base_schema == pa.schema( + [pa.field("id", pa.int64()), pa.field("my_data", expected_type)] + ) + + assert ds.take_all() == [ + {"id": i, "my_data": data[i]} for i in range(len(data)) + ] if __name__ == "__main__": diff --git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index 7d4b6ece9e7a..f5c6d4655ac9 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -329,7 +329,8 @@ cdef extern from "ray/core_worker/common.h" nogil: unordered_map[c_string, double] &resources, c_string concurrency_group_name, int64_t generator_backpressure_num_objects, - c_string serialized_runtime_env, c_bool enable_task_events) + c_string serialized_runtime_env, c_bool enable_task_events, + const unordered_map[c_string, c_string] &labels) cdef cppclass CActorCreationOptions "ray::core::ActorCreationOptions": CActorCreationOptions() @@ -347,7 +348,8 @@ cdef extern from "ray/core_worker/common.h" nogil: const c_vector[CConcurrencyGroup] &concurrency_groups, c_bool execute_out_of_order, int32_t max_pending_calls, - c_bool enable_task_events) + c_bool enable_task_events, + const unordered_map[c_string, c_string] &labels) cdef cppclass CPlacementGroupCreationOptions \ "ray::core::PlacementGroupCreationOptions": diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 242c5f10dd49..87f5d59a8583 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -191,6 +191,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CJobID GetCurrentJobId() CTaskID GetCurrentTaskId() + const c_string GetCurrentTaskName() + const c_string GetCurrentTaskFunctionName() void UpdateTaskIsDebuggerPaused( const CTaskID &task_id, const c_bool is_debugger_paused) diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index 72c07fd63b86..b44eae3d84ce 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -4,6 +4,7 @@ import uuid from functools import wraps from threading import Lock +from typing import Optional import ray._private.signature from ray import Language, cross_language @@ -120,6 +121,22 @@ def __init__( if "runtime_env" in self._default_options: self._default_options["runtime_env"] = self._runtime_env + # Pre-calculate runtime env info to avoid re-calculation at each `remote` + # invocation. When a `remote` call specifies a new `runtime_env` through + # `.options()`, the runtime env is overwritten and re-serialized. + # + # Caveat: To support dynamic runtime envs in + # `func.options(runtime_env={...}).remote()`, we recalculate the serialized + # runtime env info in the `options` call. This is acceptable since the + # pre-calculation here happens only once, at `RemoteFunction` initialization.
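The pre-calculation described above trades one serialization at definition time for a cached string that every `.remote()` call reuses; only `.options(runtime_env=...)` pays the re-serialization cost. A minimal sketch of that caching pattern, with hypothetical stand-ins (`serialize_env_info` for `get_runtime_env_info`, `CachedRemoteFunction` for `RemoteFunction`), not Ray's actual implementation:

import json
from typing import Any, Callable, Optional


def serialize_env_info(env: dict) -> str:
    # Hypothetical stand-in for get_runtime_env_info(..., serialize=True).
    return json.dumps(env, sort_keys=True)


class CachedRemoteFunction:
    def __init__(self, fn: Callable, runtime_env: Optional[dict] = None):
        self._fn = fn
        # Serialize once at definition time, not on every `.remote()` call.
        self._serialized_env = serialize_env_info(runtime_env) if runtime_env else ""

    def remote(self, *args: Any, **kwargs: Any) -> Any:
        # Hot path: reuse the cached serialized runtime env.
        return self._submit(self._serialized_env, *args, **kwargs)

    def options(self, runtime_env: Optional[dict] = None) -> "CachedRemoteFunction":
        # Cold path: re-serialize only when the runtime env is overridden.
        if runtime_env is None:
            return self
        return CachedRemoteFunction(self._fn, runtime_env)

    def _submit(self, serialized_env: str, *args: Any, **kwargs: Any) -> Any:
        print(f"submitting with runtime env info {serialized_env!r}")
        return self._fn(*args, **kwargs)


f = CachedRemoteFunction(lambda x: x + 1, {"pip": ["requests"]})
f.remote(1)  # reuses the cached string
f.options(runtime_env={"pip": ["numpy"]}).remote(2)  # re-serializes once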
+ self._serialized_base_runtime_env_info = "" + if self._runtime_env: + self._serialized_base_runtime_env_info = get_runtime_env_info( + self._runtime_env, + is_job_runtime_env=False, + serialize=True, + ) + self._language = language self._is_generator = inspect.isgeneratorfunction(function) self._function = function @@ -136,7 +153,12 @@ def __init__( # Override task.remote's signature and docstring @wraps(function) def _remote_proxy(*args, **kwargs): - return self._remote(args=args, kwargs=kwargs, **self._default_options) + return self._remote( + serialized_runtime_env_info=self._serialized_base_runtime_env_info, + args=args, + kwargs=kwargs, + **self._default_options, + ) self.remote = _remote_proxy @@ -217,6 +239,7 @@ def options(self, **task_options): _metadata: Extended options for Ray libraries. For example, _metadata={"workflows.io/options": } for Ray workflows. + _labels: The key-value labels of a task. Examples: @@ -239,15 +262,29 @@ def f(): updated_options = ray_option_utils.update_options(default_options, task_options) ray_option_utils.validate_task_options(updated_options, in_options=True) - # only update runtime_env when ".options()" specifies new runtime_env + # Only update runtime_env and re-calculate serialized runtime env info when + # ".options()" specifies new runtime_env. + serialized_runtime_env_info = self._serialized_base_runtime_env_info if "runtime_env" in task_options: updated_options["runtime_env"] = parse_runtime_env( updated_options["runtime_env"] ) + # Re-calculate runtime env info based on updated runtime env. + if updated_options["runtime_env"]: + serialized_runtime_env_info = get_runtime_env_info( + updated_options["runtime_env"], + is_job_runtime_env=False, + serialize=True, + ) class FuncWrapper: def remote(self, *args, **kwargs): - return func_cls._remote(args=args, kwargs=kwargs, **updated_options) + return func_cls._remote( + args=args, + kwargs=kwargs, + serialized_runtime_env_info=serialized_runtime_env_info, + **updated_options, + ) @DeveloperAPI def bind(self, *args, **kwargs): @@ -263,7 +300,13 @@ class or functions. @wrap_auto_init @_tracing_task_invocation - def _remote(self, args=None, kwargs=None, **task_options): + def _remote( + self, + args=None, + kwargs=None, + serialized_runtime_env_info: Optional[str] = None, + **task_options, + ): """Submit the remote function for execution.""" # We pop the "max_calls" coming from "@ray.remote" here. We no longer need # it in "_remote()". @@ -329,7 +372,6 @@ def _remote(self, args=None, kwargs=None, **task_options): # TODO(suquark): cleanup these fields name = task_options["name"] - runtime_env = parse_runtime_env(task_options["runtime_env"]) placement_group = task_options["placement_group"] placement_group_bundle_index = task_options["placement_group_bundle_index"] placement_group_capture_child_tasks = task_options[ @@ -404,19 +446,12 @@ def _remote(self, args=None, kwargs=None, **task_options): else: scheduling_strategy = "DEFAULT" - serialized_runtime_env_info = None - if runtime_env is not None: - serialized_runtime_env_info = get_runtime_env_info( - runtime_env, - is_job_runtime_env=False, - serialize=True, - ) - if _task_launch_hook: _task_launch_hook(self._function_descriptor, resources, scheduling_strategy) # Override enable_task_events to default for actor if not specified (i.e. 
None) enable_task_events = task_options.get("enable_task_events") + labels = task_options.get("_labels") def invocation(args, kwargs): if self._is_cross_language: @@ -447,6 +482,7 @@ def invocation(args, kwargs): serialized_runtime_env_info or "{}", generator_backpressure_num_objects, enable_task_events, + labels, ) # Reset worker's debug context from the last "remote" command # (which applies only to this .remote call). diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py index 78f7cee9502d..5cacae69371b 100644 --- a/python/ray/runtime_context.py +++ b/python/ray/runtime_context.py @@ -124,7 +124,7 @@ def get_worker_id(self) -> str: @property @Deprecated(message="Use get_task_id() instead", warning=True) def task_id(self): - """Get current task ID for this worker or driver. + """Get current task ID for this worker. Task ID is the id of a Ray task. This shouldn't be used in a driver process. @@ -155,7 +155,7 @@ def f(): Returns: The current worker's task id. None if there's no task id. """ - # only worker mode has actor_id + # only worker mode has task_id assert ( self.worker.mode == ray._private.worker.WORKER_MODE ), f"This method is only available when the process is a\ worker." task_id = self.worker.current_task_id return task_id if not task_id.is_nil() else None def get_task_id(self) -> Optional[str]: - """Get current task ID for this worker or driver. + """Get current task ID for this worker. Task ID is the id of a Ray task. The ID will be in hex format. This shouldn't be used in a driver process. @@ -201,7 +201,7 @@ def get_task_id(): Returns: The current worker's task id in hex. None if there's no task id. """ - # only worker mode has actor_id + # only worker mode has task_id if self.worker.mode != ray._private.worker.WORKER_MODE: logger.warning( "This method is only available when the process is a " @@ -212,12 +212,116 @@ def get_task_id(): return task_id.hex() if not task_id.is_nil() else None def _get_current_task_id(self) -> TaskID: - async_task_id = ray._raylet.async_task_id.get() - if async_task_id is None: - task_id = self.worker.current_task_id - else: - task_id = async_task_id - return task_id + return self.worker.current_task_id + + def get_task_name(self) -> Optional[str]: + """Get current task name for this worker. + + Task name by default is the task's function call string. It can also be + specified in options when submitting a task. + + Example: + + .. testcode:: + + import ray + + @ray.remote + class Actor: + def get_task_name(self): + return ray.get_runtime_context().get_task_name() + + @ray.remote + class AsyncActor: + async def get_task_name(self): + return ray.get_runtime_context().get_task_name() + + @ray.remote + def get_task_name(): + return ray.get_runtime_context().get_task_name() + + a = Actor.remote() + b = AsyncActor.remote() + # Task names are available for actor tasks. + print(ray.get(a.get_task_name.remote())) + # Task names are available for async actor tasks. + print(ray.get(b.get_task_name.remote())) + # Task names are available for normal tasks. + # Get default task name + print(ray.get(get_task_name.remote())) + # Get specified task name + print(ray.get(get_task_name.options(name="task_name").remote())) + + ..
testoutput:: :options: +MOCK + + Actor.get_task_name + AsyncActor.get_task_name + get_task_name + task_name + + Returns: + The current worker's task name + """ + # only worker mode has task_name + if self.worker.mode != ray._private.worker.WORKER_MODE: + logger.warning( + "This method is only available when the process is a " + f"worker. Current mode: {self.worker.mode}" + ) + return None + return self.worker.current_task_name + + def get_task_function_name(self) -> Optional[str]: + """Get current task function name string for this worker. + + Example: + + .. testcode:: + + import ray + + @ray.remote + class Actor: + def get_task_function_name(self): + return ray.get_runtime_context().get_task_function_name() + + @ray.remote + class AsyncActor: + async def get_task_function_name(self): + return ray.get_runtime_context().get_task_function_name() + + @ray.remote + def get_task_function_name(): + return ray.get_runtime_context().get_task_function_name() + + a = Actor.remote() + b = AsyncActor.remote() + # Task functions are available for actor tasks. + print(ray.get(a.get_task_function_name.remote())) + # Task functions are available for async actor tasks. + print(ray.get(b.get_task_function_name.remote())) + # Task functions are available for normal tasks. + print(ray.get(get_task_function_name.remote())) + + .. testoutput:: + :options: +MOCK + + [python module name].Actor.get_task_function_name + [python module name].AsyncActor.get_task_function_name + [python module name].get_task_function_name + + Returns: + The current worker's task function call string + """ + # only worker mode has task_function_name + if self.worker.mode != ray._private.worker.WORKER_MODE: + logger.warning( + "This method is only available when the process is a " + f"worker. Current mode: {self.worker.mode}" + ) + return None + return self.worker.current_task_function_name @property @Deprecated(message="Use get_actor_id() instead", warning=True) diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index a69c1369db72..eed702bb7438 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -15,6 +15,7 @@ from typing import Optional, Set, List, Tuple from ray.dashboard.modules.metrics import install_and_start_prometheus from ray.util.check_open_ports import check_open_ports +import requests import click import psutil @@ -621,6 +622,15 @@ def debug(address: str, verbose: bool): type=str, help="a JSON serialized dictionary mapping label name to label value.", ) +@click.option( + "--include-log-monitor", + default=None, + type=bool, + help="If set to True or left unset, a log monitor will start monitoring " + "the log files of all processes on this node and push their contents to GCS.
" + "Only one log monitor should be started per physical host to avoid log " + "duplication on the driver process.", +) @add_click_logging_options @PublicAPI def start( @@ -667,6 +677,7 @@ def start( ray_debugger_external, disable_usage_stats, labels, + include_log_monitor, ): """Start Ray processes manually on the local machine.""" @@ -756,6 +767,7 @@ def start( no_monitor=no_monitor, tracing_startup_hook=tracing_startup_hook, ray_debugger_external=ray_debugger_external, + include_log_monitor=include_log_monitor, ) if ray_constants.RAY_START_HOOK in os.environ: @@ -2583,6 +2595,15 @@ def launch_prometheus(): install_and_start_prometheus.main() +@metrics_group.command(name="shutdown-prometheus") +def shutdown_prometheus(): + try: + requests.post("http://localhost:9090/-/quit") + except requests.exceptions.RequestException as e: + print(f"An error occurred: {e}") + sys.exit(1) + + def add_command_alias(command, name, hidden): new_command = copy.deepcopy(command) new_command.hidden = hidden diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py index 8fae61c95329..6e45bae19ceb 100644 --- a/python/ray/serve/_private/constants.py +++ b/python/ray/serve/_private/constants.py @@ -194,9 +194,8 @@ # Logging format with record key to format string dict SERVE_LOG_RECORD_FORMAT = { SERVE_LOG_REQUEST_ID: "%(request_id)s", - SERVE_LOG_ROUTE: "%(route)s", SERVE_LOG_APPLICATION: "%(application)s", - SERVE_LOG_MESSAGE: "%(filename)s:%(lineno)d - %(message)s", + SERVE_LOG_MESSAGE: "-- %(message)s", SERVE_LOG_LEVEL_NAME: "%(levelname)s", SERVE_LOG_TIME: "%(asctime)s", } @@ -356,3 +355,17 @@ RAY_SERVE_FORCE_LOCAL_TESTING_MODE = ( os.environ.get("RAY_SERVE_FORCE_LOCAL_TESTING_MODE", "0") == "1" ) + +# Run sync methods defined in the replica in a thread pool by default. +RAY_SERVE_RUN_SYNC_IN_THREADPOOL = ( + os.environ.get("RAY_SERVE_RUN_SYNC_IN_THREADPOOL", "0") == "1" +) + +RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING = ( + "Calling sync method '{method_name}' directly on the " + "asyncio loop. In a future version, sync methods will be run in a " + "threadpool by default. Ensure your sync methods are thread safe " + "or keep the existing behavior by making them `async def`. Opt " + "into the new behavior by setting " + "RAY_SERVE_RUN_SYNC_IN_THREADPOOL=1." 
+) diff --git a/python/ray/serve/_private/controller.py b/python/ray/serve/_private/controller.py index 8eff4c80315a..4aa6906b241f 100644 --- a/python/ray/serve/_private/controller.py +++ b/python/ray/serve/_private/controller.py @@ -226,8 +226,7 @@ def reconfigure_global_logging_config(self, global_logging_config: LoggingConfig self.global_logging_config = global_logging_config self.long_poll_host.notify_changed( - LongPollNamespace.GLOBAL_LOGGING_CONFIG, - global_logging_config, + {LongPollNamespace.GLOBAL_LOGGING_CONFIG: global_logging_config} ) configure_component_logger( component_name="controller", diff --git a/python/ray/serve/_private/default_impl.py b/python/ray/serve/_private/default_impl.py index f47dfa85c178..489f0aaa25f9 100644 --- a/python/ray/serve/_private/default_impl.py +++ b/python/ray/serve/_private/default_impl.py @@ -56,6 +56,12 @@ def create_deployment_scheduler( ) +def create_replica_impl(**kwargs): + from ray.serve._private.replica import Replica + + return Replica(**kwargs) + + def create_dynamic_handle_options(**kwargs): return DynamicHandleOptions(**kwargs) diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index ca0fb2d446c6..09fa27876397 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -1384,6 +1384,13 @@ def deployment_name(self) -> str: def app_name(self) -> str: return self._id.app_name + @property + def _failed_to_start_threshold(self) -> int: + return min( + MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, + self._target_state.target_num_replicas * 3, + ) + def get_alive_replica_actor_ids(self) -> Set[str]: return {replica.actor_id for replica in self._replicas.get()} @@ -1448,16 +1455,17 @@ def broadcast_running_replicas_if_changed(self) -> None: return self._long_poll_host.notify_changed( - (LongPollNamespace.RUNNING_REPLICAS, self._id), - running_replica_infos, - ) - # NOTE(zcin): notify changed for Java routers. Since Java only - # supports 1.x API, there is no concept of applications in Java, - # so the key should remain a string describing the deployment - # name. If there are no Java routers, this is a no-op. - self._long_poll_host.notify_changed( - (LongPollNamespace.RUNNING_REPLICAS, self._id.name), - running_replica_infos, + { + (LongPollNamespace.RUNNING_REPLICAS, self._id): running_replica_infos, + # NOTE(zcin): notify changed for Java routers. Since Java only + # supports 1.x API, there is no concept of applications in Java, + # so the key should remain a string describing the deployment + # name. If there are no Java routers, this is a no-op. 
+ ( LongPollNamespace.RUNNING_REPLICAS, + self._id.name, + ): running_replica_infos, + } ) self._last_broadcasted_running_replica_infos = running_replica_infos self._multiplexed_model_ids_updated = False @@ -1473,8 +1481,7 @@ def broadcast_deployment_config_if_changed(self) -> None: return self._long_poll_host.notify_changed( - (LongPollNamespace.DEPLOYMENT_CONFIG, self._id), - current_deployment_config, + {(LongPollNamespace.DEPLOYMENT_CONFIG, self._id): current_deployment_config} ) self._last_broadcasted_deployment_config = current_deployment_config @@ -1845,11 +1852,10 @@ def scale_deployment_replicas( if to_add > 0: # Exponential backoff - failed_to_start_threshold = min( - MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, - self._target_state.target_num_replicas * 3, - ) - if self._replica_constructor_retry_counter >= failed_to_start_threshold: + if ( + self._replica_constructor_retry_counter + >= self._failed_to_start_threshold + ): # Wait 1, 2, 4, ... seconds before consecutive retries, with random # offset added to avoid synchronization if ( @@ -1909,17 +1915,13 @@ def check_curr_status(self) -> Tuple[bool, bool]: ) failed_to_start_count = self._replica_constructor_retry_counter - failed_to_start_threshold = min( - MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, - self._target_state.target_num_replicas * 3, - ) # Got to make a call to complete current deploy() goal after # start failure threshold reached, while we might still have # pending replicas in current goal. if ( - failed_to_start_count >= failed_to_start_threshold - and failed_to_start_threshold != 0 + failed_to_start_count >= self._failed_to_start_threshold + and self._failed_to_start_threshold != 0 ): if running_at_target_version_replica_cnt > 0: # At least one RUNNING replica at target state, partial @@ -2043,17 +2045,27 @@ def record_replica_startup_failure(self, error_msg: str): self._replica_constructor_retry_counter += 1 self._replica_constructor_error_msg = error_msg + retrying_msg = "Retrying" + if self._failed_to_start_threshold != 0: + remaining_retries = ( + self._failed_to_start_threshold + - self._replica_constructor_retry_counter + ) + retrying_msg += f" {remaining_retries} more time(s)" + + message = ( + f"A replica failed to start with an exception. {retrying_msg}. Error:\n" + f"{error_msg}" + ) + self._curr_status_info = self._curr_status_info.update_message(message) + def update_replica_startup_backoff_time(self): """Updates the replica startup backoff time.""" # If replicas have failed enough times, execute exponential backoff # Wait 1, 2, 4, ...
seconds before consecutive retries (or use a custom # backoff factor by setting EXPONENTIAL_BACKOFF_FACTOR) - failed_to_start_threshold = min( - MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT, - self._target_state.target_num_replicas * 3, - ) - if self._replica_constructor_retry_counter > failed_to_start_threshold: + if self._replica_constructor_retry_counter > self._failed_to_start_threshold: self._backoff_time_s = min( EXPONENTIAL_BACKOFF_FACTOR * self._backoff_time_s, MAX_BACKOFF_TIME_S ) diff --git a/python/ray/serve/_private/endpoint_state.py b/python/ray/serve/_private/endpoint_state.py index abc4c0615ad6..fd2074fd6669 100644 --- a/python/ray/serve/_private/endpoint_state.py +++ b/python/ray/serve/_private/endpoint_state.py @@ -46,7 +46,7 @@ def _checkpoint(self): def _notify_route_table_changed(self): self._long_poll_host.notify_changed( - LongPollNamespace.ROUTE_TABLE, self._endpoints + {LongPollNamespace.ROUTE_TABLE: self._endpoints} ) def _get_endpoint_for_route(self, route: str) -> Optional[DeploymentID]: diff --git a/python/ray/serve/_private/local_testing_mode.py b/python/ray/serve/_private/local_testing_mode.py index 6ccc16cd3628..af38c04c5e65 100644 --- a/python/ray/serve/_private/local_testing_mode.py +++ b/python/ray/serve/_private/local_testing_mode.py @@ -10,7 +10,10 @@ import ray from ray import cloudpickle from ray.serve._private.common import DeploymentID, RequestMetadata -from ray.serve._private.constants import SERVE_LOGGER_NAME +from ray.serve._private.constants import ( + RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + SERVE_LOGGER_NAME, +) from ray.serve._private.replica import UserCallableWrapper from ray.serve._private.replica_result import ReplicaResult from ray.serve._private.router import Router @@ -66,6 +69,7 @@ def make_local_deployment_handle( deployment.init_args, deployment.init_kwargs, deployment_id=deployment_id, + run_sync_methods_in_threadpool=RAY_SERVE_RUN_SYNC_IN_THREADPOOL, ) try: logger.info(f"Initializing local replica class for {deployment_id}.") @@ -310,4 +314,6 @@ def generator_result_callback(item: Any): return noop_future def shutdown(self): - pass + noop_future = concurrent.futures.Future() + noop_future.set_result(None) + return noop_future diff --git a/python/ray/serve/_private/logging_utils.py b/python/ray/serve/_private/logging_utils.py index 207f0574c317..5081829670bc 100644 --- a/python/ray/serve/_private/logging_utils.py +++ b/python/ray/serve/_private/logging_utils.py @@ -141,8 +141,6 @@ def format(self, record: logging.LogRecord) -> str: record_formats_attrs = [] if SERVE_LOG_REQUEST_ID in record.__dict__: record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_REQUEST_ID]) - if SERVE_LOG_ROUTE in record.__dict__: - record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_ROUTE]) record_formats_attrs.append(SERVE_LOG_RECORD_FORMAT[SERVE_LOG_MESSAGE]) record_format += " ".join(record_formats_attrs) @@ -153,9 +151,9 @@ def format(self, record: logging.LogRecord) -> str: return formatter.format(record) -def access_log_msg(*, method: str, status: str, latency_ms: float): +def access_log_msg(*, method: str, route: str, status: str, latency_ms: float): """Returns a formatted message for an HTTP or ServeHandle access log.""" - return f"{method.upper()} {status.upper()} {latency_ms:.1f}ms" + return f"{method} {route} {status} {latency_ms:.1f}ms" def log_to_stderr_filter(record: logging.LogRecord) -> bool: diff --git a/python/ray/serve/_private/long_poll.py b/python/ray/serve/_private/long_poll.py index f3538913b76b..d6fb52e72310 100644 --- 
a/python/ray/serve/_private/long_poll.py +++ b/python/ray/serve/_private/long_poll.py @@ -4,6 +4,7 @@ import random from asyncio.events import AbstractEventLoop from collections import defaultdict +from collections.abc import Mapping from dataclasses import dataclass from enum import Enum, auto from typing import Any, Callable, DefaultDict, Dict, Optional, Set, Tuple, Union @@ -179,12 +180,12 @@ class LongPollHost: The desired use case is to embed this in an Ray actor. Client will be expected to call actor.listen_for_change.remote(...). On the host side, - you can call host.notify_changed(key, object) to update the state and + you can call host.notify_changed({key: object}) to update the state and potentially notify whoever is polling for these values. Internally, we use snapshot_ids for each object to identify client with outdated object and immediately return the result. If the client has the - up-to-date verison, then the listen_for_change call will only return when + up-to-date version, then the listen_for_change call will only return when the object is updated. """ @@ -306,15 +307,15 @@ async def listen_for_change( self._count_send(LongPollState.TIME_OUT) return LongPollState.TIME_OUT else: - updated_object_key: str = async_task_to_watched_keys[done.pop()] - updated_object = { - updated_object_key: UpdatedObject( + updated_objects = {} + for task in done: + updated_object_key = async_task_to_watched_keys[task] + updated_objects[updated_object_key] = UpdatedObject( self.object_snapshots[updated_object_key], self.snapshot_ids[updated_object_key], ) - } - self._count_send(updated_object) - return updated_object + self._count_send(updated_objects) + return updated_objects async def listen_for_change_java( self, @@ -403,21 +404,22 @@ def _listen_result_to_proto_bytes( proto = LongPollResult(**data) return proto.SerializeToString() - def notify_changed( - self, - object_key: KeyType, - updated_object: Any, - ): - try: - self.snapshot_ids[object_key] += 1 - except KeyError: - # Initial snapshot id must be >= 0, so that the long poll client - # can send a negative initial snapshot id to get a fast update. - # They should also be randomized; - # see https://github.com/ray-project/ray/pull/45881#discussion_r1645243485 - self.snapshot_ids[object_key] = random.randint(0, 1_000_000) - self.object_snapshots[object_key] = updated_object - logger.debug(f"LongPollHost: Notify change for key {object_key}.") - - for event in self.notifier_events.pop(object_key, set()): - event.set() + def notify_changed(self, updates: Mapping[KeyType, Any]) -> None: + """ + Update the current snapshot of some objects + and notify any long poll clients. + """ + for object_key, updated_object in updates.items(): + try: + self.snapshot_ids[object_key] += 1 + except KeyError: + # Initial snapshot id must be >= 0, so that the long poll client + # can send a negative initial snapshot id to get a fast update. 
+ # They should also be randomized; see + # https://github.com/ray-project/ray/pull/45881#discussion_r1645243485 + self.snapshot_ids[object_key] = random.randint(0, 1_000_000) + self.object_snapshots[object_key] = updated_object + logger.debug(f"LongPollHost: Notify change for key {object_key}.") + + for event in self.notifier_events.pop(object_key, set()): + event.set() diff --git a/python/ray/serve/_private/proxy.py b/python/ray/serve/_private/proxy.py index 614a8dc39508..2b5967a7a75f 100644 --- a/python/ray/serve/_private/proxy.py +++ b/python/ray/serve/_private/proxy.py @@ -454,9 +454,11 @@ async def proxy_request(self, proxy_request: ProxyRequest) -> ResponseGenerator: latency_ms = (time.time() - start_time) * 1000.0 if response_handler_info.should_record_access_log: + request_context = ray.serve.context._serve_request_context.get() logger.info( access_log_msg( method=proxy_request.method, + route=request_context.route, status=str(status.code), latency_ms=latency_ms, ), @@ -988,8 +990,7 @@ async def send_request_to_replica( status_code = str(asgi_message["status"]) status = ResponseStatus( code=status_code, - # TODO(edoakes): we need a more nuanced check than this. - is_error=status_code != "200", + is_error=not status_code.startswith("2"), ) expecting_trailers = asgi_message.get("trailers", False) elif asgi_message["type"] == "websocket.accept": @@ -1010,11 +1011,16 @@ async def send_request_to_replica( # the trailers message has been sent. if not asgi_message.get("more_trailers", False): response_generator.stop_checking_for_disconnect() - elif asgi_message["type"] == "websocket.disconnect": + elif asgi_message["type"] in [ + "websocket.close", + "websocket.disconnect", + ]: + status_code = str(asgi_message["code"]) status = ResponseStatus( - code=str(asgi_message["code"]), - # TODO(edoakes): we need a more nuanced check than this. - is_error=False, + code=status_code, + # All status codes are considered errors aside from: + # 1000 (CLOSE_NORMAL), 1001 (CLOSE_GOING_AWAY). + is_error=status_code not in ["1000", "1001"], ) response_generator.stop_checking_for_disconnect() diff --git a/python/ray/serve/_private/proxy_request_response.py b/python/ray/serve/_private/proxy_request_response.py index 8050c4be215d..0ca2235fd3dd 100644 --- a/python/ray/serve/_private/proxy_request_response.py +++ b/python/ray/serve/_private/proxy_request_response.py @@ -58,7 +58,8 @@ def request_type(self) -> str: @property def method(self) -> str: - return self.scope.get("method", "websocket").upper() + # WebSocket messages don't have a 'method' field. 
+ return self.scope.get("method", "WS").upper() @property def route_path(self) -> str: diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py index b90c837b6cc0..23fc7d237f94 100644 --- a/python/ray/serve/_private/replica.py +++ b/python/ray/serve/_private/replica.py @@ -1,5 +1,6 @@ import asyncio import concurrent.futures +import functools import inspect import logging import os @@ -7,12 +8,23 @@ import threading import time import traceback +import warnings +from abc import ABC, abstractmethod from contextlib import contextmanager -from functools import wraps from importlib import import_module -from typing import Any, AsyncGenerator, Callable, Dict, Optional, Tuple, Union +from typing import ( + Any, + AsyncGenerator, + Callable, + Dict, + Generator, + Optional, + Tuple, + Union, +) import starlette.responses +from anyio import to_thread from starlette.types import ASGIApp, Message import ray @@ -37,11 +49,14 @@ HEALTH_CHECK_METHOD, RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE, RAY_SERVE_REPLICA_AUTOSCALING_METRIC_RECORD_PERIOD_S, + RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING, RECONFIGURE_METHOD, SERVE_CONTROLLER_NAME, SERVE_LOGGER_NAME, SERVE_NAMESPACE, ) +from ray.serve._private.default_impl import create_replica_impl from ray.serve._private.http_util import ( ASGIAppReplicaWrapper, ASGIArgs, @@ -231,31 +246,23 @@ def _add_autoscaling_metrics_point(self) -> None: ) -class ReplicaActor: - """Actor definition for replicas of Ray Serve deployments. +StatusCodeCallback = Callable[[str], None] - This class defines the interface that the controller and deployment handles - (i.e., from proxies and other replicas) use to interact with a replica. - All interaction with the user-provided callable is done via the - `UserCallableWrapper` class. - """ - - async def __init__( +class ReplicaBase(ABC): + def __init__( self, replica_id: ReplicaID, - serialized_deployment_def: bytes, - serialized_init_args: bytes, - serialized_init_kwargs: bytes, - deployment_config_proto_bytes: bytes, + deployment_def: Callable, + init_args: Tuple, + init_kwargs: Dict, + deployment_config: DeploymentConfig, version: DeploymentVersion, ): self._version = version self._replica_id = replica_id self._deployment_id = replica_id.deployment_id - self._deployment_config = DeploymentConfig.from_proto_bytes( - deployment_config_proto_bytes - ) + self._deployment_config = deployment_config self._component_name = f"{self._deployment_id.name}" if self._deployment_id.app_name: self._component_name = ( @@ -266,15 +273,12 @@ async def __init__( self._configure_logger_and_profilers(self._deployment_config.logging_config) self._event_loop = get_or_create_event_loop() - deployment_def = cloudpickle.loads(serialized_deployment_def) - if isinstance(deployment_def, str): - deployment_def = _load_deployment_def_from_import_path(deployment_def) - self._user_callable_wrapper = UserCallableWrapper( deployment_def, - cloudpickle.loads(serialized_init_args), - cloudpickle.loads(serialized_init_kwargs), + init_args, + init_kwargs, deployment_id=self._deployment_id, + run_sync_methods_in_threadpool=RAY_SERVE_RUN_SYNC_IN_THREADPOOL, ) # Guards against calling the user's callable constructor multiple times. 
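With `RAY_SERVE_RUN_SYNC_IN_THREADPOOL` enabled, the `UserCallableWrapper` above offloads sync user methods to a thread pool through `anyio` instead of calling them on the replica's asyncio loop. A minimal sketch of that mechanism, assuming a `CapacityLimiter` sized like `max_ongoing_requests` (the handler and limiter here are illustrative, not Serve's actual wrapper):

import time

import anyio
from anyio import to_thread

# Assumed bound on concurrent sync calls, analogous to max_ongoing_requests.
limiter = anyio.CapacityLimiter(4)


def blocking_handler(x: int) -> int:
    # Stands in for a user-defined sync method that would otherwise
    # block the replica's event loop.
    time.sleep(0.1)
    return x * 2


async def call_in_threadpool(x: int) -> int:
    # Offload the sync callable to a worker thread, bounded by the limiter.
    return await to_thread.run_sync(blocking_handler, x, limiter=limiter)


async def main() -> None:
    print([await call_in_threadpool(i) for i in range(3)])  # [0, 2, 4]


anyio.run(main)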
@@ -308,6 +312,7 @@ def _set_internal_replica_context(self, *, servable_object: Callable = None): def _configure_logger_and_profilers( self, logging_config: Union[None, Dict, LoggingConfig] ): + if logging_config is None: logging_config = {} if isinstance(logging_config, dict): @@ -330,18 +335,10 @@ def _configure_logger_and_profilers( component_id=self._component_id, ) - def push_proxy_handle(self, handle: ActorHandle): - pass - - def get_num_ongoing_requests(self) -> int: - """Fetch the number of ongoing requests at this replica (queue length). - - This runs on a separate thread (using a Ray concurrency group) so it will - not be blocked by user code. - """ + def get_num_ongoing_requests(self): return self._metrics_manager.get_num_ongoing_requests() - def _maybe_get_asgi_route( + def _maybe_get_http_route( self, request_metadata: RequestMetadata, request_args: Tuple[Any] ) -> Optional[str]: """Get the matched route string for ASGI apps to be used in logs & metrics. @@ -374,49 +371,43 @@ def _maybe_get_asgi_route( return route - @contextmanager - def _wrap_user_method_call( + def _maybe_get_http_method( self, request_metadata: RequestMetadata, request_args: Tuple[Any] - ): - """Context manager that wraps user method calls. + ) -> Optional[str]: + """Get the HTTP method to be used in logs & metrics. - 1) Sets the request context var with appropriate metadata. - 2) Records the access log message (if not disabled). - 3) Records per-request metrics via the metrics manager. + If this is not an HTTP request, returns None. """ - route = self._maybe_get_asgi_route(request_metadata, request_args) - ray.serve.context._serve_request_context.set( - ray.serve.context._RequestContext( - route=route, - request_id=request_metadata.request_id, - _internal_request_id=request_metadata.internal_request_id, - app_name=self._deployment_id.app_name, - multiplexed_model_id=request_metadata.multiplexed_model_id, - grpc_context=request_metadata.grpc_context, - ) - ) + if request_metadata.is_http_request: + req: StreamingHTTPRequest = request_args[0] + # WebSocket messages don't have a 'method' field. + return req.asgi_scope.get("method", "WS") + + return None + @contextmanager + def _handle_errors_and_metrics( + self, request_metadata: RequestMetadata, request_args: Tuple[Any] + ) -> Generator[StatusCodeCallback, None, None]: start_time = time.time() user_exception = None + + status_code = None + + def _status_code_callback(s: str): + nonlocal status_code + status_code = s + try: self._metrics_manager.inc_num_ongoing_requests() - yield + yield _status_code_callback except asyncio.CancelledError as e: user_exception = e - - # Recursively cancel child requests - requests_pending_assignment = ( - ray.serve.context._get_requests_pending_assignment( - request_metadata.internal_request_id - ) - ) - for task in requests_pending_assignment.values(): - task.cancel() + self._on_request_cancelled(request_metadata, e) except Exception as e: user_exception = e logger.exception("Request failed.") - if ray.util.pdb._is_ray_debugger_post_mortem_enabled(): - ray.util.pdb._post_mortem() + self._on_request_failed(request_metadata, e) finally: self._metrics_manager.dec_num_ongoing_requests() @@ -428,16 +419,21 @@ def _wrap_user_method_call( else: status_str = "ERROR" + http_method = self._maybe_get_http_method(request_metadata, request_args) + http_route = request_metadata.route + # Set in _wrap_user_method_call. 
logger.info( access_log_msg( - method=request_metadata.call_method, - status=status_str, + method=http_method or "CALL", + route=http_route or request_metadata.call_method, + # Prefer the HTTP status code if it was populated. + status=status_code or status_str, latency_ms=latency_ms, ), extra={"serve_access_log": True}, ) self._metrics_manager.record_request_metrics( - route=route, + route=http_route, status_str=status_str, latency_ms=latency_ms, was_error=user_exception is not None, @@ -451,6 +447,7 @@ async def _call_user_generator( request_metadata: RequestMetadata, request_args: Tuple[Any], request_kwargs: Dict[str, Any], + status_code_callback: StatusCodeCallback, ) -> AsyncGenerator[Any, None]: """Calls a user method for a streaming call and yields its results. @@ -476,6 +473,7 @@ def _enqueue_thread_safe(item: Any): ) ) + first_message_peeked = False while True: wait_for_message_task = self._event_loop.create_task( result_queue.wait_for_message() @@ -492,6 +490,16 @@ def _enqueue_thread_safe(item: Any): # and use vanilla pickle (we know it's safe because these messages # only contain primitive Python types). if request_metadata.is_http_request: + # Peek the first ASGI message to determine the status code. + if not first_message_peeked: + msg = messages[0] + first_message_peeked = True + if msg["type"] == "http.response.start": + # HTTP responses begin with exactly one + # "http.response.start" message containing the "status" + # field. Other response types like WebSockets may not. + status_code_callback(str(msg["status"])) + yield pickle.dumps(messages) else: for msg in messages: @@ -516,13 +524,8 @@ def _enqueue_thread_safe(item: Any): wait_for_message_task.cancel() async def handle_request( - self, - pickled_request_metadata: bytes, - *request_args, - **request_kwargs, + self, request_metadata: RequestMetadata, *request_args, **request_kwargs ) -> Tuple[bytes, Any]: - """Entrypoint for `stream=False` calls.""" - request_metadata = pickle.loads(pickled_request_metadata) with self._wrap_user_method_call(request_metadata, request_args): return await asyncio.wrap_future( self._user_callable_wrapper.call_user_method( @@ -531,40 +534,23 @@ async def handle_request( ) async def handle_request_streaming( - self, - pickled_request_metadata: bytes, - *request_args, - **request_kwargs, + self, request_metadata: RequestMetadata, *request_args, **request_kwargs ) -> AsyncGenerator[Any, None]: """Generator that is the entrypoint for all `stream=True` handle calls.""" - request_metadata = pickle.loads(pickled_request_metadata) - with self._wrap_user_method_call(request_metadata, request_args): + with self._wrap_user_method_call( + request_metadata, request_args + ) as status_code_callback: async for result in self._call_user_generator( request_metadata, request_args, request_kwargs, + status_code_callback=status_code_callback, ): yield result async def handle_request_with_rejection( - self, - pickled_request_metadata: bytes, - *request_args, - **request_kwargs, - ) -> AsyncGenerator[Any, None]: - """Entrypoint for all requests with strict max_ongoing_requests enforcement. - - The first response from this generator is always a system message indicating - if the request was accepted (the replica has capacity for the request) or - rejected (the replica is already at max_ongoing_requests). - - For non-streaming requests, there will only be one more message, the unary - result of the user request handler. 
- - For streaming requests, the subsequent messages will be the results of the - user request handler (which must be a generator). - """ - request_metadata = pickle.loads(pickled_request_metadata) + self, request_metadata: RequestMetadata, *request_args, **request_kwargs + ): limit = self._deployment_config.max_ongoing_requests num_ongoing_requests = self.get_num_ongoing_requests() if num_ongoing_requests >= limit: @@ -573,21 +559,19 @@ async def handle_request_with_rejection( f"rejecting request {request_metadata.request_id}.", extra={"log_to_stderr": False}, ) - yield pickle.dumps( - ReplicaQueueLengthInfo( - accepted=False, num_ongoing_requests=num_ongoing_requests - ) + yield ReplicaQueueLengthInfo( + accepted=False, num_ongoing_requests=num_ongoing_requests ) return - with self._wrap_user_method_call(request_metadata, request_args): - yield pickle.dumps( - ReplicaQueueLengthInfo( - accepted=True, - # NOTE(edoakes): `_wrap_user_method_call` will increment the number - # of ongoing requests to include this one, so re-fetch the value. - num_ongoing_requests=self.get_num_ongoing_requests(), - ) + with self._wrap_user_method_call( + request_metadata, request_args + ) as status_code_callback: + yield ReplicaQueueLengthInfo( + accepted=True, + # NOTE(edoakes): `_wrap_user_method_call` will increment the number + # of ongoing requests to include this one, so re-fetch the value. + num_ongoing_requests=self.get_num_ongoing_requests(), ) if request_metadata.is_streaming: @@ -595,6 +579,7 @@ async def handle_request_with_rejection( request_metadata, request_args, request_kwargs, + status_code_callback=status_code_callback, ): yield result else: @@ -604,81 +589,29 @@ async def handle_request_with_rejection( ) ) - async def handle_request_from_java( - self, - proto_request_metadata: bytes, - *request_args, - **request_kwargs, - ) -> Any: - from ray.serve.generated.serve_pb2 import ( - RequestMetadata as RequestMetadataProto, - ) - - proto = RequestMetadataProto.FromString(proto_request_metadata) - request_metadata: RequestMetadata = RequestMetadata( - request_id=proto.request_id, - internal_request_id=proto.internal_request_id, - call_method=proto.call_method, - multiplexed_model_id=proto.multiplexed_model_id, - route=proto.route, - ) - with self._wrap_user_method_call(request_metadata, request_args): - return await asyncio.wrap_future( - self._user_callable_wrapper.call_user_method( - request_metadata, request_args, request_kwargs - ) - ) - - async def is_allocated(self) -> str: - """poke the replica to check whether it's alive. - - When calling this method on an ActorHandle, it will complete as - soon as the actor has started running. We use this mechanism to - detect when a replica has been allocated a worker slot. - At this time, the replica can transition from PENDING_ALLOCATION - to PENDING_INITIALIZATION startup state. - - Returns: - The PID, actor ID, node ID, node IP, and log filepath id of the replica. - """ - - return ( - os.getpid(), - ray.get_runtime_context().get_actor_id(), - ray.get_runtime_context().get_worker_id(), - ray.get_runtime_context().get_node_id(), - ray.util.get_node_ip_address(), - get_component_logger_file_path(), - ) - - async def initialize_and_get_metadata( - self, - deployment_config: DeploymentConfig = None, - _after: Optional[Any] = None, - ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: - """Handles initializing the replica. 
+ @abstractmethod + async def _on_initialized(self): + raise NotImplementedError - Returns: 3-tuple containing - 1. DeploymentConfig of the replica - 2. DeploymentVersion of the replica - 3. Initialization duration in seconds - """ - # Unused `_after` argument is for scheduling: passing an ObjectRef - # allows delaying this call until after the `_after` call has returned. + async def initialize(self, deployment_config: DeploymentConfig): try: # Ensure that initialization is only performed once. # When controller restarts, it will call this method again. async with self._user_callable_initialized_lock: - initialization_start_time = time.time() + self._initialization_start_time = time.time() if not self._user_callable_initialized: self._user_callable_asgi_app = await asyncio.wrap_future( self._user_callable_wrapper.initialize_callable() ) + await self._on_initialized() self._user_callable_initialized = True - self._set_internal_replica_context( - servable_object=self._user_callable_wrapper.user_callable - ) + if deployment_config: + await asyncio.wrap_future( + self._user_callable_wrapper.set_sync_method_threadpool_limit( + deployment_config.max_ongoing_requests + ) + ) await asyncio.wrap_future( self._user_callable_wrapper.call_reconfigure( deployment_config.user_config @@ -689,20 +622,10 @@ async def initialize_and_get_metadata( # an initial health check. If an initial health check fails, # consider it an initialization failure. await self.check_health() - - # Save the initialization latency if the replica is initializing - # for the first time. - if self._initialization_latency is None: - self._initialization_latency = time.time() - initialization_start_time - - return self._get_metadata() except Exception: raise RuntimeError(traceback.format_exc()) from None - async def reconfigure( - self, - deployment_config: DeploymentConfig, - ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: + async def reconfigure(self, deployment_config: DeploymentConfig): try: user_config_changed = ( deployment_config.user_config != self._deployment_config.user_config @@ -722,6 +645,11 @@ async def reconfigure( if logging_config_changed: self._configure_logger_and_profilers(deployment_config.logging_config) + await asyncio.wrap_future( + self._user_callable_wrapper.set_sync_method_threadpool_limit( + deployment_config.max_ongoing_requests + ) + ) if user_config_changed: await asyncio.wrap_future( self._user_callable_wrapper.call_reconfigure( @@ -734,12 +662,10 @@ async def reconfigure( self._set_internal_replica_context( servable_object=self._user_callable_wrapper.user_callable ) - - return self._get_metadata() except Exception: raise RuntimeError(traceback.format_exc()) from None - def _get_metadata( + def get_metadata( self, ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: return ( @@ -749,26 +675,22 @@ def _get_metadata( self._port, ) - def _save_cpu_profile_data(self) -> str: - """Saves CPU profiling data, if CPU profiling is enabled. - - Logs a warning if CPU profiling is disabled. 
- """ + @abstractmethod + def _on_request_cancelled( + self, request_metadata: RequestMetadata, e: asyncio.CancelledError + ): + pass - if self.cpu_profiler is not None: - import marshal + @abstractmethod + def _on_request_failed(self, request_metadata: RequestMetadata, e: Exception): + pass - self.cpu_profiler.snapshot_stats() - with open(self.cpu_profiler_log, "wb") as f: - marshal.dump(self.cpu_profiler.stats, f) - logger.info(f'Saved CPU profile data to file "{self.cpu_profiler_log}"') - return self.cpu_profiler_log - else: - logger.error( - "Attempted to save CPU profile data, but failed because no " - "CPU profiler was running! Enable CPU profiling by enabling " - "the RAY_SERVE_ENABLE_CPU_PROFILING env var." - ) + @abstractmethod + @contextmanager + def _wrap_user_method_call( + self, request_metadata: RequestMetadata, request_args: Tuple[Any] + ) -> Generator[StatusCodeCallback, None, None]: + pass async def _drain_ongoing_requests(self): """Wait for any ongoing requests to finish. @@ -825,6 +747,254 @@ async def check_health(self): await asyncio.wrap_future(f) +class Replica(ReplicaBase): + async def _on_initialized(self): + self._set_internal_replica_context( + servable_object=self._user_callable_wrapper.user_callable + ) + + # Save the initialization latency if the replica is initializing + # for the first time. + if self._initialization_latency is None: + self._initialization_latency = time.time() - self._initialization_start_time + + def _on_request_cancelled( + self, request_metadata: RequestMetadata, e: asyncio.CancelledError + ): + """Recursively cancels child requests.""" + requests_pending_assignment = ( + ray.serve.context._get_requests_pending_assignment( + request_metadata.internal_request_id + ) + ) + for task in requests_pending_assignment.values(): + task.cancel() + + def _on_request_failed(self, request_metadata: RequestMetadata, e: Exception): + if ray.util.pdb._is_ray_debugger_post_mortem_enabled(): + ray.util.pdb._post_mortem() + + @contextmanager + def _wrap_user_method_call( + self, request_metadata: RequestMetadata, request_args: Tuple[Any] + ) -> Generator[StatusCodeCallback, None, None]: + """Context manager that wraps user method calls. + + 1) Sets the request context var with appropriate metadata. + 2) Records the access log message (if not disabled). + 3) Records per-request metrics via the metrics manager. + """ + request_metadata.route = self._maybe_get_http_route( + request_metadata, request_args + ) + ray.serve.context._serve_request_context.set( + ray.serve.context._RequestContext( + route=request_metadata.route, + request_id=request_metadata.request_id, + _internal_request_id=request_metadata.internal_request_id, + app_name=self._deployment_id.app_name, + multiplexed_model_id=request_metadata.multiplexed_model_id, + grpc_context=request_metadata.grpc_context, + ) + ) + + with self._handle_errors_and_metrics( + request_metadata, request_args + ) as status_code_callback: + yield status_code_callback + + +class ReplicaActor: + """Actor definition for replicas of Ray Serve deployments. + + This class defines the interface that the controller and deployment handles + (i.e., from proxies and other replicas) use to interact with a replica. + + All interaction with the user-provided callable is done via the + `UserCallableWrapper` class. 
+    """
+
+    async def __init__(
+        self,
+        replica_id: ReplicaID,
+        serialized_deployment_def: bytes,
+        serialized_init_args: bytes,
+        serialized_init_kwargs: bytes,
+        deployment_config_proto_bytes: bytes,
+        version: DeploymentVersion,
+    ):
+        deployment_config = DeploymentConfig.from_proto_bytes(
+            deployment_config_proto_bytes
+        )
+        deployment_def = cloudpickle.loads(serialized_deployment_def)
+        if isinstance(deployment_def, str):
+            deployment_def = _load_deployment_def_from_import_path(deployment_def)
+
+        self._replica_impl: ReplicaBase = create_replica_impl(
+            replica_id=replica_id,
+            deployment_def=deployment_def,
+            init_args=cloudpickle.loads(serialized_init_args),
+            init_kwargs=cloudpickle.loads(serialized_init_kwargs),
+            deployment_config=deployment_config,
+            version=version,
+        )
+
+    def push_proxy_handle(self, handle: ActorHandle):
+        pass
+
+    def get_num_ongoing_requests(self) -> int:
+        """Fetch the number of ongoing requests at this replica (queue length).
+
+        This runs on a separate thread (using a Ray concurrency group) so it will
+        not be blocked by user code.
+        """
+        return self._replica_impl.get_num_ongoing_requests()
+
+    async def is_allocated(self) -> Tuple[int, str, str, str, str, str]:
+        """Poke the replica to check whether it's alive.
+
+        When calling this method on an ActorHandle, it will complete as
+        soon as the actor has started running. We use this mechanism to
+        detect when a replica has been allocated a worker slot.
+        At this time, the replica can transition from PENDING_ALLOCATION
+        to PENDING_INITIALIZATION startup state.
+
+        Returns:
+            The PID, actor ID, worker ID, node ID, node IP, and log filepath
+            of the replica.
+        """
+
+        return (
+            os.getpid(),
+            ray.get_runtime_context().get_actor_id(),
+            ray.get_runtime_context().get_worker_id(),
+            ray.get_runtime_context().get_node_id(),
+            ray.util.get_node_ip_address(),
+            get_component_logger_file_path(),
+        )
+
+    async def initialize_and_get_metadata(
+        self, deployment_config: DeploymentConfig = None, _after: Optional[Any] = None
+    ):
+        """Handles initializing the replica.
+
+        Returns: 4-tuple containing
+            1. DeploymentConfig of the replica
+            2. DeploymentVersion of the replica
+            3. Initialization duration in seconds
+            4. Port the replica is listening on (if any)
+        """
+        # Unused `_after` argument is for scheduling: passing an ObjectRef
+        # allows delaying this call until after the `_after` call has returned.
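The `_after` comment above describes a common Ray idiom: because Ray resolves `ObjectRef` arguments before running a task, passing a ref from a previous call sequences the two calls without transferring any meaningful data. A hedged, self-contained sketch using a toy actor (not the real replica class):

```python
import ray


@ray.remote
class ToyReplica:
    def is_allocated(self) -> str:
        # Completes as soon as the actor is scheduled and running, so the
        # caller can use it as an allocation/liveness probe.
        return "allocated"

    def initialize_and_get_metadata(self, config, _after=None) -> dict:
        # `_after` is unused here; Ray already resolved it before this ran.
        return {"config": config, "init_latency_s": 0.0}


ray.init()
replica = ToyReplica.remote()
allocated_ref = replica.is_allocated.remote()
# This task is not run until `allocated_ref` is available.
metadata = ray.get(
    replica.initialize_and_get_metadata.remote(
        {"num_replicas": 1}, _after=allocated_ref
    )
)
print(metadata)
```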
+ await self._replica_impl.initialize(deployment_config) + return self._replica_impl.get_metadata() + + async def check_health(self): + await self._replica_impl.check_health() + + async def reconfigure( + self, deployment_config + ) -> Tuple[DeploymentConfig, DeploymentVersion, Optional[float], Optional[int]]: + await self._replica_impl.reconfigure(deployment_config) + return self._replica_impl.get_metadata() + + async def handle_request( + self, + pickled_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> Tuple[bytes, Any]: + """Entrypoint for `stream=False` calls.""" + request_metadata = pickle.loads(pickled_request_metadata) + return await self._replica_impl.handle_request( + request_metadata, *request_args, **request_kwargs + ) + + async def handle_request_streaming( + self, + pickled_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> AsyncGenerator[Any, None]: + """Generator that is the entrypoint for all `stream=True` handle calls.""" + request_metadata = pickle.loads(pickled_request_metadata) + async for result in self._replica_impl.handle_request_streaming( + request_metadata, *request_args, **request_kwargs + ): + yield result + + async def handle_request_with_rejection( + self, + pickled_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> AsyncGenerator[Any, None]: + """Entrypoint for all requests with strict max_ongoing_requests enforcement. + + The first response from this generator is always a system message indicating + if the request was accepted (the replica has capacity for the request) or + rejected (the replica is already at max_ongoing_requests). + + For non-streaming requests, there will only be one more message, the unary + result of the user request handler. + + For streaming requests, the subsequent messages will be the results of the + user request handler (which must be a generator). + """ + request_metadata = pickle.loads(pickled_request_metadata) + async for result in self._replica_impl.handle_request_with_rejection( + request_metadata, *request_args, **request_kwargs + ): + if isinstance(result, ReplicaQueueLengthInfo): + yield pickle.dumps(result) + else: + yield result + + async def handle_request_from_java( + self, + proto_request_metadata: bytes, + *request_args, + **request_kwargs, + ) -> Any: + from ray.serve.generated.serve_pb2 import ( + RequestMetadata as RequestMetadataProto, + ) + + proto = RequestMetadataProto.FromString(proto_request_metadata) + request_metadata: RequestMetadata = RequestMetadata( + request_id=proto.request_id, + internal_request_id=proto.internal_request_id, + call_method=proto.call_method, + multiplexed_model_id=proto.multiplexed_model_id, + route=proto.route, + ) + return await self._replica_impl.handle_request( + request_metadata, *request_args, **request_kwargs + ) + + async def perform_graceful_shutdown(self): + await self._replica_impl.perform_graceful_shutdown() + + def _save_cpu_profile_data(self) -> str: + """Saves CPU profiling data, if CPU profiling is enabled. + + Logs a warning if CPU profiling is disabled. + """ + + if self.cpu_profiler is not None: + import marshal + + self.cpu_profiler.snapshot_stats() + with open(self.cpu_profiler_log, "wb") as f: + marshal.dump(self.cpu_profiler.stats, f) + logger.info(f'Saved CPU profile data to file "{self.cpu_profiler_log}"') + return self.cpu_profiler_log + else: + logger.error( + "Attempted to save CPU profile data, but failed because no " + "CPU profiler was running! 
Enable CPU profiling by enabling " + "the RAY_SERVE_ENABLE_CPU_PROFILING env var." + ) + + class UserCallableWrapper: """Wraps a user-provided callable that is used to handle requests to a replica.""" @@ -835,6 +1005,7 @@ def __init__( init_kwargs: Dict, *, deployment_id: DeploymentID, + run_sync_methods_in_threadpool: bool, ): if not (inspect.isfunction(deployment_def) or inspect.isclass(deployment_def)): raise TypeError( @@ -848,6 +1019,8 @@ def __init__( self._is_function = inspect.isfunction(deployment_def) self._deployment_id = deployment_id self._destructor_called = False + self._run_sync_methods_in_threadpool = run_sync_methods_in_threadpool + self._warned_about_sync_method_change = False # Will be populated in `initialize_callable`. self._callable = None @@ -878,7 +1051,7 @@ def _run_on_user_code_event_loop(f: Callable) -> Callable: f ), "_run_on_user_code_event_loop can only be used on coroutine functions." - @wraps(f) + @functools.wraps(f) def wrapper(self, *args, **kwargs) -> concurrent.futures.Future: return asyncio.run_coroutine_threadsafe( f(self, *args, **kwargs), @@ -887,6 +1060,12 @@ def wrapper(self, *args, **kwargs) -> concurrent.futures.Future: return wrapper + @_run_on_user_code_event_loop + async def set_sync_method_threadpool_limit(self, limit: int): + # NOTE(edoakes): the limit is thread local, so this must + # be run on the user code event loop. + to_thread.current_default_thread_limiter().total_tokens = limit + def _get_user_callable_method(self, method_name: str) -> Callable: if self._is_function: return self._callable @@ -927,17 +1106,89 @@ async def _send_user_result_over_asgi( else: await Response(result).send(scope, receive, send) - async def _call_func_or_gen(self, callable: Callable, *args, **kwargs) -> Any: + async def _call_func_or_gen( + self, + callable: Callable, + *, + args: Optional[Tuple[Any]] = None, + kwargs: Optional[Dict[str, Any]] = None, + request_metadata: Optional[RequestMetadata] = None, + generator_result_callback: Optional[Callable] = None, + run_sync_methods_in_threadpool_override: Optional[bool] = None, + ) -> Tuple[Any, bool]: """Call the callable with the provided arguments. This is a convenience wrapper that will work for `def`, `async def`, generator, and async generator functions. + + Returns the result and a boolean indicating if the result was a sync generator + that has already been consumed. """ - result = callable(*args, **kwargs) - if inspect.iscoroutine(result): - result = await result + sync_gen_consumed = False + args = args if args is not None else tuple() + kwargs = kwargs if kwargs is not None else dict() + run_sync_in_threadpool = ( + self._run_sync_methods_in_threadpool + if run_sync_methods_in_threadpool_override is None + else run_sync_methods_in_threadpool_override + ) + is_sync_method = ( + inspect.isfunction(callable) or inspect.ismethod(callable) + ) and not ( + inspect.iscoroutinefunction(callable) + or inspect.isasyncgenfunction(callable) + ) - return result + if is_sync_method and run_sync_in_threadpool: + is_generator = inspect.isgeneratorfunction(callable) + if is_generator: + sync_gen_consumed = True + if request_metadata and not request_metadata.is_streaming: + # TODO(edoakes): make this check less redundant with the one in + # _handle_user_method_result. + raise TypeError( + f"Method '{callable.__name__}' returned a generator. " + "You must use `handle.options(stream=True)` to call " + "generators on a deployment." 
+ ) + + def run_callable(): + result = callable(*args, **kwargs) + if is_generator: + for r in result: + # TODO(edoakes): make this less redundant with the handling in + # _handle_user_method_result. + if request_metadata and request_metadata.is_grpc_request: + r = (request_metadata.grpc_context, r.SerializeToString()) + generator_result_callback(r) + + result = None + + return result + + # NOTE(edoakes): we use anyio.to_thread here because it's what Starlette + # uses (and therefore FastAPI too). The max size of the threadpool is + # set to max_ongoing_requests in the replica wrapper. + # anyio.to_thread propagates ContextVars to the worker thread automatically. + result = await to_thread.run_sync(run_callable) + else: + if ( + is_sync_method + and not self._warned_about_sync_method_change + and run_sync_methods_in_threadpool_override is None + ): + self._warned_about_sync_method_change = True + warnings.warn( + RAY_SERVE_RUN_SYNC_IN_THREADPOOL_WARNING.format( + method_name=callable.__name__, + ) + ) + + result = callable(*args, **kwargs) + if inspect.iscoroutine(result): + result = await result + + return result, sync_gen_consumed @property def user_callable(self) -> Optional[Callable]: @@ -974,8 +1225,10 @@ async def initialize_callable(self) -> Optional[ASGIApp]: self._callable = self._deployment_def.__new__(self._deployment_def) await self._call_func_or_gen( self._callable.__init__, - *self._init_args, - **self._init_kwargs, + args=self._init_args, + kwargs=self._init_kwargs, + # Always run the constructor on the main user code thread. + run_sync_methods_in_threadpool_override=False, ) if isinstance(self._callable, ASGIAppReplicaWrapper): @@ -997,7 +1250,7 @@ async def initialize_callable(self) -> Optional[ASGIApp]: def _raise_if_not_initialized(self, method_name: str): if self._callable is None: raise RuntimeError( - "`initialize_callable` must be called before `{method_name}`." + f"`initialize_callable` must be called before `{method_name}`." ) def call_user_health_check(self) -> Optional[concurrent.futures.Future]: @@ -1037,7 +1290,7 @@ async def call_reconfigure(self, user_config: Any): ) await self._call_func_or_gen( getattr(self._callable, RECONFIGURE_METHOD), - user_config, + args=(user_config,), ) def _prepare_args_for_http_request( @@ -1109,6 +1362,7 @@ async def _handle_user_method_result( user_method_name: str, request_metadata: RequestMetadata, *, + sync_gen_consumed: bool, generator_result_callback: Optional[Callable], is_asgi_app: bool, asgi_args: Optional[ASGIArgs], @@ -1142,7 +1396,7 @@ async def _handle_user_method_result( # For the FastAPI codepath, the response has already been sent over # ASGI, but for the vanilla deployment codepath we need to send it. await self._send_user_result_over_asgi(result, asgi_args) - elif not request_metadata.is_http_request: + elif not request_metadata.is_http_request and not sync_gen_consumed: # If a unary method is called with stream=True for anything EXCEPT # an HTTP request, raise an error. 
# HTTP requests are always streaming regardless of if the method @@ -1227,19 +1481,32 @@ async def call_user_method( request_args[0], request_metadata, user_method_params ) - result = await self._handle_user_method_result( - await self._call_func_or_gen( - user_method, *request_args, **request_kwargs - ), + result, sync_gen_consumed = await self._call_func_or_gen( + user_method, + args=request_args, + kwargs=request_kwargs, + request_metadata=request_metadata, + generator_result_callback=generator_result_callback + if request_metadata.is_streaming + else None, + ) + return await self._handle_user_method_result( + result, user_method_name, request_metadata, + sync_gen_consumed=sync_gen_consumed, generator_result_callback=generator_result_callback, is_asgi_app=is_asgi_app, asgi_args=asgi_args, ) except Exception: - if request_metadata.is_http_request and asgi_args is not None: + if ( + request_metadata.is_http_request + and asgi_args is not None + # If the callable is an ASGI app, it already sent a 500 status response. + and not is_asgi_app + ): await self._send_user_result_over_asgi( starlette.responses.Response( "Internal Server Error", status_code=500 @@ -1252,8 +1519,6 @@ async def call_user_method( if receive_task is not None and not receive_task.done(): receive_task.cancel() - return result - @_run_on_user_code_event_loop async def call_destructor(self): """Explicitly call the `__del__` method of the user callable. @@ -1277,7 +1542,11 @@ async def call_destructor(self): try: if hasattr(self._callable, "__del__"): # Make sure to accept `async def __del__(self)` as well. - await self._call_func_or_gen(self._callable.__del__) + await self._call_func_or_gen( + self._callable.__del__, + # Always run the destructor on the main user callable thread. + run_sync_methods_in_threadpool_override=False, + ) if hasattr(self._callable, "__serve_multiplex_wrapper"): await getattr(self._callable, "__serve_multiplex_wrapper").shutdown() diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 9cd8c10f5f82..85d391c95d52 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -327,7 +327,7 @@ def assign_request( pass @abstractmethod - def shutdown(self): + def shutdown(self) -> concurrent.futures.Future: pass @@ -680,7 +680,7 @@ def assign_request( loop=self._asyncio_loop, ) - def shutdown(self): - asyncio.run_coroutine_threadsafe( + def shutdown(self) -> concurrent.futures.Future: + return asyncio.run_coroutine_threadsafe( self._asyncio_router.shutdown(), loop=self._asyncio_loop - ).result() + ) diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 182795889d47..13b92c7fcaae 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -474,6 +474,7 @@ def _run( else: client = _private_api.serve_start( http_options={"location": "EveryNode"}, + global_logging_config=logging_config, ) # Record after Ray has been started. 
ServeUsageTag.API_VERSION.record("v2") diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 0eba1c5dc5ee..ead2e174948b 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -76,16 +76,6 @@ def __init__( extra={"log_to_stderr": False}, ) - def _get_or_create_router(self) -> Router: - if self._router is None: - self._router = self._create_router( - handle_id=self.handle_id, - deployment_id=self.deployment_id, - handle_options=self.init_options, - ) - - return self._router - @staticmethod def _gen_handle_tag(app_name: str, deployment_name: str, handle_id: str): if app_name: @@ -150,8 +140,13 @@ def _init(self, **kwargs): f"was initialized with {self.init_options}." ) - self.init_options = create_init_handle_options(**kwargs) - self._get_or_create_router() + init_options = create_init_handle_options(**kwargs) + self._router = self._create_router( + handle_id=self.handle_id, + deployment_id=self.deployment_id, + handle_options=init_options, + ) + self.init_options = init_options # Record handle api telemetry when not in the proxy if ( @@ -209,7 +204,13 @@ def __getattr__(self, name): def shutdown(self): if self._router: - self._router.shutdown() + shutdown_future = self._router.shutdown() + shutdown_future.result() + + async def shutdown_async(self): + if self._router: + shutdown_future = self._router.shutdown() + await asyncio.wrap_future(shutdown_future) def __repr__(self): return f"{self.__class__.__name__}" f"(deployment='{self.deployment_name}')" diff --git a/python/ray/serve/tests/BUILD b/python/ray/serve/tests/BUILD index 369b9a339c6f..9f3208084538 100644 --- a/python/ray/serve/tests/BUILD +++ b/python/ray/serve/tests/BUILD @@ -467,3 +467,25 @@ py_test_module_list( "//python/ray/serve:serve_lib", ], ) + + +# Test currently off-by-default behavior to run replica sync methods in a threadpool. +# TODO(edoakes): remove this once the FF is flipped on by default. +py_test_module_list( + size = "small", + env = {"RAY_SERVE_RUN_SYNC_IN_THREADPOOL": "1"}, + files = [ + "test_replica_sync_methods.py", + ], + name_suffix = "_with_run_sync_in_threadpool", + tags = [ + "exclusive", + "no_windows", + "team:serve", + ], + deps = [ + ":common", + ":conftest", + "//python/ray/serve:serve_lib", + ], +) diff --git a/python/ray/serve/tests/test_cli.py b/python/ray/serve/tests/test_cli.py index 943024cfbe38..2b2c0ff279dd 100644 --- a/python/ray/serve/tests/test_cli.py +++ b/python/ray/serve/tests/test_cli.py @@ -606,6 +606,34 @@ def check_for_failed_deployment(): wait_for_condition(check_for_failed_deployment) +@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") +def test_status_constructor_retry_error(ray_start_stop): + """Deploys Serve deployment that errors out in constructor, checks that the + retry message is surfaced. 
+ """ + + config_file_name = os.path.join( + os.path.dirname(__file__), "test_config_files", "deployment_fail_2.yaml" + ) + + subprocess.check_output(["serve", "deploy", config_file_name]) + + def check_for_failed_deployment(): + cli_output = subprocess.check_output( + ["serve", "status", "-a", "http://localhost:52365/"] + ) + status = yaml.safe_load(cli_output)["applications"][SERVE_DEFAULT_APP_NAME] + assert status["status"] == "DEPLOYING" + + deployment_status = status["deployments"]["A"] + assert deployment_status["status"] == "UPDATING" + assert deployment_status["status_trigger"] == "CONFIG_UPDATE_STARTED" + assert "ZeroDivisionError" in deployment_status["message"] + return True + + wait_for_condition(check_for_failed_deployment) + + @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") def test_status_package_unavailable_in_controller(ray_start_stop): """Test that exceptions raised from packages that are installed on deployment actors diff --git a/python/ray/serve/tests/test_config_files/deployment_fail_2.yaml b/python/ray/serve/tests/test_config_files/deployment_fail_2.yaml new file mode 100644 index 000000000000..38da015b0bb3 --- /dev/null +++ b/python/ray/serve/tests/test_config_files/deployment_fail_2.yaml @@ -0,0 +1,3 @@ +applications: + - name: default + import_path: ray.serve.tests.test_config_files.fail_2.node diff --git a/python/ray/serve/tests/test_config_files/fail_2.py b/python/ray/serve/tests/test_config_files/fail_2.py new file mode 100644 index 000000000000..2e95aa93d98f --- /dev/null +++ b/python/ray/serve/tests/test_config_files/fail_2.py @@ -0,0 +1,13 @@ +import time + +from ray import serve + + +@serve.deployment +class A: + def __init__(self): + time.sleep(5) + 1 / 0 + + +node = A.bind() diff --git a/python/ray/serve/tests/test_controller_recovery.py b/python/ray/serve/tests/test_controller_recovery.py index 0042323221b3..51d641dbedfc 100644 --- a/python/ray/serve/tests/test_controller_recovery.py +++ b/python/ray/serve/tests/test_controller_recovery.py @@ -64,7 +64,7 @@ def __call__(self, *args): replica_version_hash = None for replica in deployment_dict[id]: - ref = replica.actor_handle._get_metadata.remote() + ref = replica.actor_handle.initialize_and_get_metadata.remote() _, version, _, _ = ray.get(ref) if replica_version_hash is None: replica_version_hash = hash(version) @@ -116,7 +116,7 @@ def __call__(self, *args): # Ensure recovered replica version has are the same for replica_name in recovered_replica_names: actor_handle = ray.get_actor(replica_name, namespace=SERVE_NAMESPACE) - ref = actor_handle._get_metadata.remote() + ref = actor_handle.initialize_and_get_metadata.remote() _, version, _, _ = ray.get(ref) assert replica_version_hash == hash( version @@ -487,7 +487,7 @@ def check_proxy_handle_in_controller(): resp = requests.get("http://127.0.0.1:8000") assert resp.status_code == 200 wait_for_condition( - check_log_file, log_file=file_path, expected_regex=['.*"message":.*GET 200.*'] + check_log_file, log_file=file_path, expected_regex=['.*"message":.*GET / 200.*'] ) diff --git a/python/ray/serve/tests/test_gcs_failure.py b/python/ray/serve/tests/test_gcs_failure.py index cb582a3df51c..3dabb58a99fc 100644 --- a/python/ray/serve/tests/test_gcs_failure.py +++ b/python/ray/serve/tests/test_gcs_failure.py @@ -30,9 +30,14 @@ def serve_ha(external_redis, monkeypatch): # noqa: F811 ) serve.start() yield (address_info, _get_global_client()) - ray.shutdown() + + # When GCS is down, right now some core worker members are not 
cleared + # properly in ray.shutdown. + ray.worker._global_node.start_gcs_server() + # Clear cache and global serve client serve.shutdown() + ray.shutdown() @pytest.mark.skipif( @@ -127,6 +132,7 @@ def router_populated_with_replicas( else: replicas = get_replicas_func() + print(f"Replica set in router: {replicas}") assert len(replicas) >= threshold # Return early if we don't need to check cache @@ -299,7 +305,4 @@ def test_proxy_router_updated_replicas_then_gcs_failure(serve_ha): if __name__ == "__main__": - # When GCS is down, right now some core worker members are not cleared - # properly in ray.shutdown. Given that this is not hi-pri issue, - # using --forked for isolation. - sys.exit(pytest.main(["-v", "-s", "--forked", __file__])) + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_handle_2.py b/python/ray/serve/tests/test_handle_2.py index cc58f970f5b7..6b238d8211d9 100644 --- a/python/ray/serve/tests/test_handle_2.py +++ b/python/ray/serve/tests/test_handle_2.py @@ -472,5 +472,30 @@ async def _assert_one_waiter(): tasks = pending +def test_shutdown(serve_instance): + @serve.deployment + class Hi: + def __call__(self): + return "hi" + + h = serve.run(Hi.bind()) + assert h.remote().result() == "hi" + + h.shutdown() + + +@pytest.mark.asyncio +async def test_shutdown_async(serve_instance): + @serve.deployment + class Hi: + def __call__(self): + return "hi" + + h = serve.run(Hi.bind()) + assert await h.remote() == "hi" + + await h.shutdown_async() + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/test_logging.py b/python/ray/serve/tests/test_logging.py index b5b723a6d987..24f9a47a01a1 100644 --- a/python/ray/serve/tests/test_logging.py +++ b/python/ray/serve/tests/test_logging.py @@ -14,13 +14,15 @@ import pytest import requests import starlette +from fastapi import FastAPI +from starlette.responses import PlainTextResponse import ray import ray.util.state as state_api from ray import serve from ray._private.ray_logging.formatters import JSONFormatter from ray._private.test_utils import wait_for_condition -from ray.serve._private.common import ReplicaID, ServeComponentType +from ray.serve._private.common import DeploymentID, ReplicaID, ServeComponentType from ray.serve._private.constants import SERVE_LOG_EXTRA_FIELDS, SERVE_LOGGER_NAME from ray.serve._private.logging_utils import ( ServeComponentFilter, @@ -97,6 +99,97 @@ def __call__(self): assert rotation_config["backup_count"] == backup_count +def test_http_access_log(serve_instance): + name = "deployment_name" + + fastapi_app = FastAPI() + + @serve.deployment(name=name) + @serve.ingress(fastapi_app) + class Handler: + def __init__(self): + self._replica_unique_id = serve.get_replica_context().replica_id.unique_id + + @fastapi_app.get("/") + def get_root(self): + return PlainTextResponse(self._replica_unique_id) + + @fastapi_app.post("/") + def post_root(self): + return PlainTextResponse(self._replica_unique_id) + + @fastapi_app.get("/{status}") + def template(self, status: str): + return PlainTextResponse(self._replica_unique_id, status_code=int(status)) + + @fastapi_app.put("/fail") + def fail(self): + raise RuntimeError("OOPS!") + + serve.run(Handler.bind()) + + f = io.StringIO() + with redirect_stderr(f): + + def check_log( + replica_id: ReplicaID, + method: str, + route: str, + status_code: str, + fail: bool = False, + ): + s = f.getvalue() + return all( + [ + name in s, + _get_expected_replica_log_content(replica_id) in s, + f"-- 
{method} {route} {status_code}" in s, + "ms" in s, + ("OOPS!" in s and "RuntimeError" in s) + if fail + else True, # Check for stacktrace. + ] + ) + + r = requests.get("http://localhost:8000/") + assert r.status_code == 200 + replica_id = ReplicaID(unique_id=r.text, deployment_id=DeploymentID(name=name)) + wait_for_condition( + check_log, replica_id=replica_id, method="GET", route="/", status_code="200" + ) + + r = requests.post("http://localhost:8000/") + assert r.status_code == 200 + wait_for_condition( + check_log, + replica_id=replica_id, + method="POST", + route="/", + status_code="200", + ) + + r = requests.get("http://localhost:8000/350") + assert r.status_code == 350 + wait_for_condition( + check_log, + replica_id=replica_id, + method="GET", + route="/{status}", + status_code="350", + ) + + r = requests.put("http://localhost:8000/fail") + assert r.status_code == 500 + wait_for_condition( + check_log, + replica_id=replica_id, + method="PUT", + route="/fail", + status_code="500", + fail=True, + ) + + def test_handle_access_log(serve_instance): name = "handler" @@ -122,7 +215,7 @@ def check_log(replica_id: ReplicaID, method_name: str, fail: bool = False): [ name in s, _get_expected_replica_log_content(replica_id) in s, - method_name.upper() in s, + method_name in s, ("ERROR" if fail else "OK") in s, "ms" in s, ("blah blah blah" in s and "RuntimeError" in s) @@ -258,6 +351,9 @@ def fn(*args): "actor_id": ray.get_runtime_context().get_actor_id(), "worker_id": ray.get_runtime_context().get_worker_id(), "node_id": ray.get_runtime_context().get_node_id(), + "task_name": ray.get_runtime_context().get_task_name(), + "task_func_name": ray.get_runtime_context().get_task_function_name(), + "actor_name": ray.get_runtime_context().get_actor_name(), } @serve.deployment( @@ -276,6 +372,9 @@ def __call__(self, req: starlette.requests.Request): "actor_id": ray.get_runtime_context().get_actor_id(), "worker_id": ray.get_runtime_context().get_worker_id(), "node_id": ray.get_runtime_context().get_node_id(), + "task_name": ray.get_runtime_context().get_task_name(), + "task_func_name": ray.get_runtime_context().get_task_function_name(), + "actor_name": ray.get_runtime_context().get_actor_name(), } serve.run(fn.bind(), name="app1", route_prefix="/fn") @@ -288,15 +387,14 @@ def __call__(self, req: starlette.requests.Request): # Check the component log expected_log_infos = [ - f"{resp['request_id']} {resp['route']} replica.py", - f"{resp2['request_id']} {resp2['route']} replica.py", + f"{resp['request_id']} -- ", + f"{resp2['request_id']} -- ", ] # Check User log user_log_regexes = [ - f".*{resp['request_id']} {resp['route']}.* user func.*", - f".*{resp2['request_id']} {resp2['route']}.* user log " - "message from class method.*", + f".*{resp['request_id']} -- user func.*", + f".*{resp2['request_id']} -- user log.*" "message from class method.*", ] def check_log(): @@ -326,6 +424,9 @@ def check_log(): f'"worker_id": "{resp["worker_id"]}", ' f'"node_id": "{resp["node_id"]}", ' f'"actor_id": "{resp["actor_id"]}", ' + f'"task_name": "{resp["task_name"]}", ' + f'"task_func_name": "{resp["task_func_name"]}", ' + f'"actor_name": "{resp["actor_name"]}", ' f'"deployment": "{resp["app_name"]}_fn", ' f'"replica": "{method_replica_id}", ' f'"component_name": "replica".*' @@ -338,17 +439,17 @@ def check_log(): f'"worker_id": "{resp2["worker_id"]}", ' f'"node_id": "{resp2["node_id"]}", ' f'"actor_id": "{resp2["actor_id"]}", ' + f'"task_name": "{resp2["task_name"]}", ' + f'"task_func_name": "{resp2["task_func_name"]}", ' + 
f'"actor_name": "{resp2["actor_name"]}", ' f'"deployment": "{resp2["app_name"]}_Model", ' f'"replica": "{class_method_replica_id}", ' f'"component_name": "replica".*' ) else: - user_method_log_regex = ( - f".*{resp['request_id']} {resp['route']}.* user func.*" - ) + user_method_log_regex = f".*{resp['request_id']} -- user func.*" user_class_method_log_regex = ( - f".*{resp2['request_id']} {resp2['route']}.* " - "user log message from class method.*" + f".*{resp2['request_id']} -- .*" "user log message from class method.*" ) def check_log_file(log_file: str, expected_regex: list): diff --git a/python/ray/serve/tests/test_long_poll.py b/python/ray/serve/tests/test_long_poll.py index 86bf03880e33..2ba31d414e05 100644 --- a/python/ray/serve/tests/test_long_poll.py +++ b/python/ray/serve/tests/test_long_poll.py @@ -38,7 +38,7 @@ def test_notifier_events_cleared_without_update(serve_instance): host = ray.remote(LongPollHost).remote( listen_for_change_request_timeout_s=(0.1, 0.1) ) - ray.get(host.notify_changed.remote("key_1", 999)) + ray.get(host.notify_changed.remote({"key_1": 999})) # Get an initial object snapshot for the key. object_ref = host.listen_for_change.remote({"key_1": -1}) @@ -60,8 +60,8 @@ def test_host_standalone(serve_instance): host = ray.remote(LongPollHost).remote() # Write two values - ray.get(host.notify_changed.remote("key_1", 999)) - ray.get(host.notify_changed.remote("key_2", 999)) + ray.get(host.notify_changed.remote({"key_1": 999})) + ray.get(host.notify_changed.remote({"key_2": 999})) object_ref = host.listen_for_change.remote({"key_1": -1, "key_2": -1}) # We should be able to get the result immediately @@ -77,7 +77,7 @@ def test_host_standalone(serve_instance): assert len(not_done) == 1 # Now update the value, we should immediately get updated value - ray.get(host.notify_changed.remote("key_2", 999)) + ray.get(host.notify_changed.remote({"key_2": 999})) result = ray.get(object_ref) assert len(result) == 1 assert "key_2" in result @@ -88,13 +88,13 @@ def test_long_poll_wait_for_keys(serve_instance): # are set. 
host = ray.remote(LongPollHost).remote()
     object_ref = host.listen_for_change.remote({"key_1": -1, "key_2": -1})
-    ray.get(host.notify_changed.remote("key_1", 999))
-    ray.get(host.notify_changed.remote("key_2", 999))
-    # We should be able to get the one of the result immediately
+    ray.get(host.notify_changed.remote({"key_1": 123, "key_2": 456}))
+
+    # We should be able to get both results immediately
     result: Dict[str, UpdatedObject] = ray.get(object_ref)
-    assert set(result.keys()).issubset({"key_1", "key_2"})
-    assert {v.object_snapshot for v in result.values()} == {999}
+    assert result.keys() == {"key_1", "key_2"}
+    assert {v.object_snapshot for v in result.values()} == {123, 456}
 
 
 def test_long_poll_restarts(serve_instance):
@@ -106,7 +106,7 @@ class RestartableLongPollHost:
     def __init__(self) -> None:
         print("actor started")
         self.host = LongPollHost()
-        self.host.notify_changed("timer", time.time())
+        self.host.notify_changed({"timer": time.time()})
         self.should_exit = False
 
     async def listen_for_change(self, key_to_ids):
@@ -142,8 +142,8 @@ async def test_client_callbacks(serve_instance):
     host = ray.remote(LongPollHost).remote()
 
     # Write two values
-    ray.get(host.notify_changed.remote("key_1", 100))
-    ray.get(host.notify_changed.remote("key_2", 999))
+    ray.get(host.notify_changed.remote({"key_1": 100}))
+    ray.get(host.notify_changed.remote({"key_2": 999}))
 
     callback_results = dict()
 
@@ -167,7 +167,7 @@ def key_2_callback(result):
         timeout=1,
     )
 
-    ray.get(host.notify_changed.remote("key_2", 1999))
+    ray.get(host.notify_changed.remote({"key_2": 1999}))
 
     await async_wait_for_condition(
         lambda: callback_results == {"key_1": 100, "key_2": 999},
@@ -178,7 +178,7 @@ def key_2_callback(result):
 @pytest.mark.asyncio
 async def test_client_threadsafe(serve_instance):
     host = ray.remote(LongPollHost).remote()
-    ray.get(host.notify_changed.remote("key_1", 100))
+    ray.get(host.notify_changed.remote({"key_1": 100}))
 
     e = asyncio.Event()
 
@@ -198,7 +198,7 @@ def key_1_callback(_):
 
 def test_listen_for_change_java(serve_instance):
     host = ray.remote(LongPollHost).remote()
-    ray.get(host.notify_changed.remote("key_1", 999))
+    ray.get(host.notify_changed.remote({"key_1": 999}))
     request_1 = {"keys_to_snapshot_ids": {"key_1": -1}}
     object_ref = host.listen_for_change_java.remote(
         LongPollRequest(**request_1).SerializeToString()
     )
@@ -211,7 +211,7 @@ def test_listen_for_change_java(serve_instance):
     endpoints: Dict[DeploymentID, EndpointInfo] = dict()
     endpoints["deployment_name"] = EndpointInfo(route="/test/xlang/poll")
     endpoints["deployment_name1"] = EndpointInfo(route="/test/xlang/poll1")
-    ray.get(host.notify_changed.remote(LongPollNamespace.ROUTE_TABLE, endpoints))
+    ray.get(host.notify_changed.remote({LongPollNamespace.ROUTE_TABLE: endpoints}))
     object_ref_2 = host.listen_for_change_java.remote(
         LongPollRequest(**request_2).SerializeToString()
     )
@@ -240,7 +240,7 @@ def test_listen_for_change_java(serve_instance):
     ]
     ray.get(
         host.notify_changed.remote(
-            (LongPollNamespace.RUNNING_REPLICAS, "deployment_name"), replicas
+            {(LongPollNamespace.RUNNING_REPLICAS, "deployment_name"): replicas}
         )
    )
    object_ref_3 = host.listen_for_change_java.remote(
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index f93e37661394..6f64666a96ba 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -6,7 +6,11 @@ import grpc
 import pytest
 import requests
-from fastapi import FastAPI
+from fastapi import FastAPI, WebSocket
+from starlette.requests import Request
+from starlette.responses import PlainTextResponse
+from websockets.exceptions import ConnectionClosed
+from websockets.sync.client import connect
 
 import ray
 import ray.util.state as state_api
@@ -583,6 +587,161 @@ def f(*args):
     print("serve_grpc_request_latency_ms_sum working as expected.")
 
 
+def test_proxy_metrics_http_status_code_is_error(serve_start_shutdown):
+    """Verify that 2xx status codes aren't errors, others are."""
+
+    def check_request_count_metrics(
+        expected_error_count: int,
+        expected_success_count: int,
+    ):
+        resp = requests.get("http://127.0.0.1:9999").text
+        error_count = 0
+        success_count = 0
+        for line in resp.split("\n"):
+            if line.startswith("ray_serve_num_http_error_requests_total"):
+                error_count += int(float(line.split(" ")[-1]))
+            if line.startswith("ray_serve_num_http_requests_total"):
+                success_count += int(float(line.split(" ")[-1]))
+
+        assert error_count == expected_error_count
+        assert success_count == expected_success_count
+        return True
+
+    @serve.deployment
+    async def return_status_code(request: Request):
+        code = int((await request.body()).decode("utf-8"))
+        return PlainTextResponse("", status_code=code)
+
+    serve.run(return_status_code.bind())
+
+    # 200 is not an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"200")
+    assert r.status_code == 200
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=0,
+        expected_success_count=1,
+    )
+
+    # 2xx is not an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"250")
+    assert r.status_code == 250
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=0,
+        expected_success_count=2,
+    )
+
+    # 3xx is an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"300")
+    assert r.status_code == 300
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=1,
+        expected_success_count=3,
+    )
+
+    # 4xx is an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"400")
+    assert r.status_code == 400
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=2,
+        expected_success_count=4,
+    )
+
+    # 5xx is an error.
+    r = requests.get("http://127.0.0.1:8000/", data=b"500")
+    assert r.status_code == 500
+    wait_for_condition(
+        check_request_count_metrics,
+        expected_error_count=3,
+        expected_success_count=5,
+    )
+
+
+def test_proxy_metrics_websocket_status_code_is_error(serve_start_shutdown):
+    """Verify that status codes aside from 1000 or 1001 are errors."""
+
+    def check_request_count_metrics(
+        expected_error_count: int,
+        expected_success_count: int,
+    ):
+        resp = requests.get("http://127.0.0.1:9999").text
+        error_count = 0
+        success_count = 0
+        for line in resp.split("\n"):
+            if line.startswith("ray_serve_num_http_error_requests_total"):
+                error_count += int(float(line.split(" ")[-1]))
+            if line.startswith("ray_serve_num_http_requests_total"):
+                success_count += int(float(line.split(" ")[-1]))
+
+        assert error_count == expected_error_count
+        assert success_count == expected_success_count
+        return True
+
+    fastapi_app = FastAPI()
+
+    @serve.deployment
+    @serve.ingress(fastapi_app)
+    class WebSocketServer:
+        @fastapi_app.websocket("/")
+        async def accept_then_close(self, ws: WebSocket):
+            await ws.accept()
+            code = int(await ws.receive_text())
+            await ws.close(code=code)
+
+    serve.run(WebSocketServer.bind())
+
+    # Regular disconnect (1000) is not an error.
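The WebSocket checks that follow hinge on close-code semantics: 1000 (normal closure) and 1001 (going away) count as successful requests, while any other code increments the error counter. A tiny sketch of that classification rule, mirroring the assertions below rather than the proxy's actual implementation:

```python
def is_websocket_error(close_code: int) -> bool:
    # 1000 = normal closure, 1001 = going away; everything else is an error.
    return close_code not in (1000, 1001)


assert not is_websocket_error(1000)
assert not is_websocket_error(1001)
assert is_websocket_error(1011)  # internal error
assert is_websocket_error(3000)  # registered/custom codes also count as errors
```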
+ with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("1000") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=0, + expected_success_count=1, + ) + + # Goaway disconnect (1001) is not an error. + with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("1001") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=0, + expected_success_count=2, + ) + + # Other codes are errors. + with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("1011") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=1, + expected_success_count=3, + ) + + # Other codes are errors. + with connect("ws://localhost:8000/") as ws: + with pytest.raises(ConnectionClosed): + ws.send("3000") + ws.recv() + + wait_for_condition( + check_request_count_metrics, + expected_error_count=2, + expected_success_count=4, + ) + + def test_replica_metrics_fields(serve_start_shutdown): """Test replica metrics fields""" @@ -1581,7 +1740,7 @@ def test_long_poll_host_sends_counted(serve_instance): ) # Write a value. - ray.get(host.notify_changed.remote("key_1", 999)) + ray.get(host.notify_changed.remote({"key_1": 999})) object_ref = host.listen_for_change.remote({"key_1": -1}) # Check that the result's size is reported. @@ -1595,8 +1754,8 @@ def test_long_poll_host_sends_counted(serve_instance): ) # Write two new values. - ray.get(host.notify_changed.remote("key_1", 1000)) - ray.get(host.notify_changed.remote("key_2", 1000)) + ray.get(host.notify_changed.remote({"key_1": 1000})) + ray.get(host.notify_changed.remote({"key_2": 1000})) object_ref = host.listen_for_change.remote( {"key_1": result_1["key_1"].snapshot_id, "key_2": -1} ) diff --git a/python/ray/serve/tests/test_multiplex.py b/python/ray/serve/tests/test_multiplex.py index 994605565fd5..1da243af2212 100644 --- a/python/ray/serve/tests/test_multiplex.py +++ b/python/ray/serve/tests/test_multiplex.py @@ -21,7 +21,7 @@ def _get_replica_scheduler(handle: DeploymentHandle) -> ReplicaScheduler: # TODO(edoakes): we shouldn't be reaching into private fields, but better # to isolate it to one place (this function). 
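Both metrics tests above scrape Prometheus' line-oriented text exposition format and sum counters by name prefix, via their `check_request_count_metrics` helpers. A small standalone sketch of that parsing pattern (the sample payload is made up; the real tests fetch `http://127.0.0.1:9999`):

```python
def sum_counter(prometheus_text: str, metric_name: str) -> int:
    # Each sample is one line: `<name>{<labels>} <value>`; summing across
    # label sets gives the total count for the metric.
    total = 0
    for line in prometheus_text.splitlines():
        if line.startswith(metric_name):
            total += int(float(line.split(" ")[-1]))
    return total


sample = (
    'ray_serve_num_http_requests_total{route="/a"} 3.0\n'
    'ray_serve_num_http_requests_total{route="/b"} 2.0\n'
    'ray_serve_num_http_error_requests_total{route="/a"} 1.0\n'
)
assert sum_counter(sample, "ray_serve_num_http_requests_total") == 5
assert sum_counter(sample, "ray_serve_num_http_error_requests_total") == 1
```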
- return handle._get_or_create_router()._asyncio_router._replica_scheduler + return handle._router._asyncio_router._replica_scheduler @pytest.fixture() diff --git a/python/ray/serve/tests/test_replica_sync_methods.py b/python/ray/serve/tests/test_replica_sync_methods.py new file mode 100644 index 000000000000..d6485704138f --- /dev/null +++ b/python/ray/serve/tests/test_replica_sync_methods.py @@ -0,0 +1,127 @@ +import asyncio +import sys + +import pytest +import requests +from anyio import to_thread +from fastapi import FastAPI +from starlette.responses import PlainTextResponse + +import ray +from ray import serve +from ray._private.test_utils import SignalActor, wait_for_condition +from ray.serve._private.constants import RAY_SERVE_RUN_SYNC_IN_THREADPOOL + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +@pytest.mark.parametrize("use_fastapi", [False, True]) +def test_not_running_in_asyncio_loop(serve_instance, use_fastapi: bool): + if use_fastapi: + fastapi_app = FastAPI() + + @serve.deployment + @serve.ingress(fastapi_app) + class D: + @fastapi_app.get("/") + def root(self): + with pytest.raises(RuntimeError, match="no running event loop"): + asyncio.get_running_loop() + + else: + + @serve.deployment + class D: + def __call__(self) -> str: + with pytest.raises(RuntimeError, match="no running event loop"): + asyncio.get_running_loop() + + serve.run(D.bind()) + # Would error if the check fails. + requests.get("http://localhost:8000/").raise_for_status() + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +def test_concurrent_execution(serve_instance): + signal_actor = SignalActor.remote() + + @serve.deployment + class D: + def do_sync(self): + ray.get(signal_actor.wait.remote()) + + async def do_async(self): + await signal_actor.wait.remote() + + h = serve.run(D.bind()) + + sync_results = [h.do_sync.remote(), h.do_sync.remote()] + async_results = [h.do_async.remote(), h.do_async.remote()] + + wait_for_condition(lambda: ray.get(signal_actor.cur_num_waiters.remote()) == 4) + ray.get(signal_actor.send.remote()) + [r.result() for r in sync_results + async_results] + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +@pytest.mark.parametrize("use_fastapi", [False, True]) +def test_context_vars_propagated(serve_instance, use_fastapi: bool): + if use_fastapi: + fastapi_app = FastAPI() + + @serve.deployment + @serve.ingress(fastapi_app) + class D: + @fastapi_app.get("/") + def root(self): + return PlainTextResponse( + serve.context._serve_request_context.get().request_id + ) + + else: + + @serve.deployment + class D: + def __call__(self) -> str: + return PlainTextResponse( + serve.context._serve_request_context.get().request_id + ) + + serve.run(D.bind()) + + r = requests.get("http://localhost:8000/", headers={"X-Request-Id": "TEST-ID"}) + r.raise_for_status() + # If context vars weren't propagated, the request ID would be empty. + assert r.text == "TEST-ID" + + +@pytest.mark.skipif( + not RAY_SERVE_RUN_SYNC_IN_THREADPOOL, + reason="Run sync method in threadpool FF disabled.", +) +def test_thread_limit_set_to_max_ongoing_requests(serve_instance): + @serve.deployment + class D: + async def __call__(self): + return to_thread.current_default_thread_limiter().total_tokens + + h = serve.run(D.bind()) + + # Check that it's set if max_ongoing_requests is defaulted. 
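The new test file above exercises two properties of the sync-methods-in-threadpool feature flag: sync methods see no running event loop yet still see request-scoped ContextVars, and the worker pool's capacity follows `max_ongoing_requests` (the assertion just below checks the defaulted value of 5). A standalone sketch of both anyio behaviors, illustrative only and not Serve's internal code:

```python
import asyncio
import contextvars

from anyio import to_thread

request_id = contextvars.ContextVar("request_id", default="")


def sync_work() -> str:
    # Runs in a worker thread: there is no running asyncio loop here, but
    # anyio copies the caller's context, so the ContextVar is still visible.
    return request_id.get()


async def main():
    request_id.set("TEST-ID")
    # Serve pins this limiter to max_ongoing_requests; here we just show that
    # the default limiter's capacity is adjustable.
    to_thread.current_default_thread_limiter().total_tokens = 10
    assert await to_thread.run_sync(sync_work) == "TEST-ID"


asyncio.run(main())
```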
+ assert h.remote().result() == 5 + + # Update to a custom value, check again. + h = serve.run(D.options(max_ongoing_requests=10).bind()) + assert h.remote().result() == 10 + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py index dfeb9fc7524c..42facaf40282 100644 --- a/python/ray/serve/tests/unit/test_deployment_state.py +++ b/python/ray/serve/tests/unit/test_deployment_state.py @@ -2492,9 +2492,7 @@ def create_deployment_state( check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)]) assert ds1._replica_constructor_retry_counter == 3 - # An error message should show up after - # 3 * num_replicas startup failures. - assert "" == ds1.curr_status_info.message + assert "Retrying 6 more time(s)" in ds1.curr_status_info.message # Set all of ds1's replicas to stopped. for replica in ds1._replicas.get(): @@ -2512,7 +2510,7 @@ def create_deployment_state( assert ds1.curr_status_info.status == DeploymentStatus.UPDATING check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)]) assert ds1._replica_constructor_retry_counter == 6 - assert "" == ds1.curr_status_info.message + assert "Retrying 3 more time(s)" in ds1.curr_status_info.message # Set all of ds1's replicas to stopped. for replica in ds1._replicas.get(): @@ -2527,7 +2525,7 @@ def create_deployment_state( assert ds1.curr_status_info.status == DeploymentStatus.UPDATING check_counts(ds1, total=3, by_state=[(ReplicaState.STOPPING, 3, None)]) assert ds1._replica_constructor_retry_counter == 9 - assert "" == ds1.curr_status_info.message + assert "Retrying 0 more time(s)" in ds1.curr_status_info.message # Set all of ds1's replicas to stopped. 
for replica in ds1._replicas.get(): @@ -2540,7 +2538,7 @@ def create_deployment_state( assert ds1.curr_status_info.status == DeploymentStatus.UNHEALTHY check_counts(ds1, total=0) assert ds1._replica_constructor_retry_counter == 9 - assert "Replica scheduling failed" in ds1.curr_status_info.message + assert "The deployment failed to start" in ds1.curr_status_info.message def test_deploy_with_transient_constructor_failure(mock_deployment_state_manager): diff --git a/python/ray/serve/tests/unit/test_proxy_request_response.py b/python/ray/serve/tests/unit/test_proxy_request_response.py index 70e2fdb2d581..7b2c4388b657 100644 --- a/python/ray/serve/tests/unit/test_proxy_request_response.py +++ b/python/ray/serve/tests/unit/test_proxy_request_response.py @@ -57,7 +57,7 @@ def test_method(self): """ proxy_request = self.create_asgi_proxy_request(scope={}) assert isinstance(proxy_request, ProxyRequest) - assert proxy_request.method == "WEBSOCKET" + assert proxy_request.method == "WS" method = "fake-method" proxy_request = self.create_asgi_proxy_request(scope={"method": method}) diff --git a/python/ray/serve/tests/unit/test_user_callable_wrapper.py b/python/ray/serve/tests/unit/test_user_callable_wrapper.py index b03c9ca7e39e..39188f32c421 100644 --- a/python/ray/serve/tests/unit/test_user_callable_wrapper.py +++ b/python/ray/serve/tests/unit/test_user_callable_wrapper.py @@ -4,7 +4,7 @@ import sys import threading from dataclasses import dataclass -from typing import AsyncGenerator, Callable, Generator, Optional +from typing import Any, AsyncGenerator, Callable, Dict, Generator, Optional, Tuple import pytest from fastapi import FastAPI @@ -90,13 +90,18 @@ async def basic_async_generator(n: int, raise_exception: bool = False): def _make_user_callable_wrapper( - callable: Optional[Callable] = None, *init_args, **init_kwargs + callable: Optional[Callable] = None, + *, + init_args: Optional[Tuple[Any]] = None, + init_kwargs: Optional[Dict[str, Any]] = None, + run_sync_methods_in_threadpool: bool = False, ) -> UserCallableWrapper: return UserCallableWrapper( callable if callable is not None else BasicClass, - init_args, - init_kwargs, + init_args or tuple(), + init_kwargs or dict(), deployment_id=DeploymentID(name="test_name"), + run_sync_methods_in_threadpool=run_sync_methods_in_threadpool, ) @@ -144,8 +149,11 @@ def test_calling_methods_before_initialize(): user_callable_wrapper.call_reconfigure(None).result() -def test_basic_class_callable(): - user_callable_wrapper = _make_user_callable_wrapper() +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_basic_class_callable(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable().result() @@ -215,8 +223,11 @@ def test_basic_class_callable(): ).result() -def test_basic_class_callable_generators(): - user_callable_wrapper = _make_user_callable_wrapper() +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_basic_class_callable_generators(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable().result() result_list = [] @@ -291,9 +302,12 @@ def test_basic_class_callable_generators(): assert result_list == [0] +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) 
@pytest.mark.parametrize("fn", [basic_sync_function, basic_async_function])
-def test_basic_function_callable(fn: Callable):
-    user_callable_wrapper = _make_user_callable_wrapper(fn)
+def test_basic_function_callable(fn: Callable, run_sync_methods_in_threadpool: bool):
+    user_callable_wrapper = _make_user_callable_wrapper(
+        fn, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool
+    )
     user_callable_wrapper.initialize_callable().result()
 
     # Call non-generator function with is_streaming.
@@ -325,9 +339,14 @@ def test_basic_function_callable(fn: Callable):
     ).result()
 
 
+@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True])
 @pytest.mark.parametrize("fn", [basic_sync_generator, basic_async_generator])
-def test_basic_function_callable_generators(fn: Callable):
-    user_callable_wrapper = _make_user_callable_wrapper(fn)
+def test_basic_function_callable_generators(
+    fn: Callable, run_sync_methods_in_threadpool: bool
+):
+    user_callable_wrapper = _make_user_callable_wrapper(
+        fn, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool
+    )
     user_callable_wrapper.initialize_callable().result()
 
     result_list = []
@@ -366,36 +385,68 @@ def test_basic_function_callable_generators(fn: Callable):
 
 
 @pytest.mark.asyncio
-async def test_user_code_runs_on_separate_loop():
+@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True])
+async def test_user_code_runs_on_separate_loop(run_sync_methods_in_threadpool: bool):
     main_loop = asyncio.get_running_loop()
 
     class GetLoop:
         def __init__(self):
             self._constructor_loop = asyncio.get_running_loop()
 
-        def check_health(self):
+        async def check_health(self):
             check_health_loop = asyncio.get_running_loop()
             assert (
                 check_health_loop == self._constructor_loop
             ), "User constructor and health check should run on the same loop."
             return check_health_loop
 
-        def __call__(self) -> asyncio.AbstractEventLoop:
+        async def call_async(self) -> Optional[asyncio.AbstractEventLoop]:
             user_method_loop = asyncio.get_running_loop()
             assert (
                 user_method_loop == self._constructor_loop
             ), "User constructor and other methods should run on the same loop."
+
             return user_method_loop
 
-    user_callable_wrapper = _make_user_callable_wrapper(GetLoop)
+        def call_sync(self):
+            if run_sync_methods_in_threadpool:
+                with pytest.raises(RuntimeError, match="no running event loop"):
+                    asyncio.get_running_loop()
+
+                user_method_loop = None
+            else:
+                user_method_loop = asyncio.get_running_loop()
+                assert (
+                    user_method_loop == self._constructor_loop
+                ), "User constructor and other methods should run on the same loop."
+
+            return user_method_loop
+
+    user_callable_wrapper = _make_user_callable_wrapper(
+        GetLoop, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool
+    )
     user_callable_wrapper.initialize_callable().result()
-    request_metadata = _make_request_metadata()
+
+    # Async methods should all run on the same loop.
+    request_metadata = _make_request_metadata(call_method="call_async")
     user_code_loop = user_callable_wrapper.call_user_method(
         request_metadata, tuple(), dict()
     ).result()
     assert isinstance(user_code_loop, asyncio.AbstractEventLoop)
     assert user_code_loop != main_loop
 
+    # Sync methods should run on the same event loop when
+    # run_sync_methods_in_threadpool is off; otherwise they run in a worker
+    # thread with no asyncio loop at all.
+ request_metadata = _make_request_metadata(call_method="call_sync") + user_code_loop = user_callable_wrapper.call_user_method( + request_metadata, tuple(), dict() + ).result() + if run_sync_methods_in_threadpool: + assert user_code_loop is None + else: + assert isinstance(user_code_loop, asyncio.AbstractEventLoop) + assert user_code_loop != main_loop + # `check_health` method asserts that it runs on the correct loop. user_callable_wrapper.call_user_health_check().result() @@ -412,7 +463,7 @@ def __call__(self) -> str: msg = "hello world" user_callable_wrapper = _make_user_callable_wrapper( AsyncInitializer, - msg, + init_args=(msg,), ) user_callable_wrapper.initialize_callable().result() request_metadata = _make_request_metadata() @@ -498,8 +549,11 @@ def stream(self, msg: serve_pb2.UserDefinedMessage): yield serve_pb2.UserDefinedResponse(greeting=f"Hello {msg.greeting} {i}!") -def test_grpc_unary_request(): - user_callable_wrapper = _make_user_callable_wrapper(gRPCClass) +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_grpc_unary_request(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + gRPCClass, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable().result() grpc_request = gRPCRequest( @@ -518,8 +572,11 @@ def test_grpc_unary_request(): @pytest.mark.asyncio -def test_grpc_streaming_request(): - user_callable_wrapper = _make_user_callable_wrapper(gRPCClass) +@pytest.mark.parametrize("run_sync_methods_in_threadpool", [False, True]) +def test_grpc_streaming_request(run_sync_methods_in_threadpool: bool): + user_callable_wrapper = _make_user_callable_wrapper( + gRPCClass, run_sync_methods_in_threadpool=run_sync_methods_in_threadpool + ) user_callable_wrapper.initialize_callable() grpc_request = gRPCRequest( diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py index 47557bc36a32..d13c5a89b2d5 100644 --- a/python/ray/tests/conftest.py +++ b/python/ray/tests/conftest.py @@ -1,6 +1,7 @@ """ This file defines the common pytest fixtures used in current directory. 
""" + import json import logging import os @@ -34,6 +35,8 @@ redis_replicas, get_redis_cli, start_redis_instance, + start_redis_sentinel_instance, + redis_sentinel_replicas, find_available_port, wait_for_condition, find_free_port, @@ -201,6 +204,34 @@ def redis_alive(port, enable_tls): return False +def start_redis_with_sentinel(db_dir): + temp_dir = ray._private.utils.get_ray_temp_dir() + + redis_ports = find_available_port(49159, 55535, redis_sentinel_replicas() + 1) + sentinel_port = redis_ports[0] + master_port = redis_ports[1] + redis_processes = [ + start_redis_instance(temp_dir, p, listen_to_localhost_only=True, db_dir=db_dir)[ + 1 + ] + for p in redis_ports[1:] + ] + + # ensure all redis servers are up + for port in redis_ports[1:]: + wait_for_condition(redis_alive, 3, 100, port=port, enable_tls=False) + + # setup replicas of the master + for port in redis_ports[2:]: + redis_cli = get_redis_cli(port, False) + redis_cli.replicaof("127.0.0.1", master_port) + sentinel_process = start_redis_sentinel_instance( + temp_dir, sentinel_port, master_port + ) + address_str = f"127.0.0.1:{sentinel_port}" + return address_str, redis_processes + [sentinel_process] + + def start_redis(db_dir): retry_num = 0 while True: @@ -289,10 +320,14 @@ def kill_all_redis_server(): @contextmanager -def _setup_redis(request): +def _setup_redis(request, with_sentinel=False): with tempfile.TemporaryDirectory() as tmpdirname: kill_all_redis_server() - address_str, processes = start_redis(tmpdirname) + address_str, processes = ( + start_redis_with_sentinel(tmpdirname) + if with_sentinel + else start_redis(tmpdirname) + ) old_addr = os.environ.get("RAY_REDIS_ADDRESS") os.environ["RAY_REDIS_ADDRESS"] = address_str import uuid @@ -332,6 +367,12 @@ def external_redis(request): yield +@pytest.fixture +def external_redis_with_sentinel(request): + with _setup_redis(request, True): + yield + + @pytest.fixture def shutdown_only(maybe_external_redis): yield None @@ -535,6 +576,15 @@ def ray_start_cluster_head_with_external_redis(request, external_redis): yield res +@pytest.fixture +def ray_start_cluster_head_with_external_redis_sentinel( + request, external_redis_with_sentinel +): + param = getattr(request, "param", {}) + with _ray_start_cluster(do_init=True, num_nodes=1, **param) as res: + yield res + + @pytest.fixture def ray_start_cluster_head_with_env_vars(request, maybe_external_redis, monkeypatch): param = getattr(request, "param", {}) diff --git a/python/ray/tests/kuberay/test_autoscaling_config.py b/python/ray/tests/kuberay/test_autoscaling_config.py index 980f266bc9c5..699df522eb1a 100644 --- a/python/ray/tests/kuberay/test_autoscaling_config.py +++ b/python/ray/tests/kuberay/test_autoscaling_config.py @@ -71,7 +71,7 @@ def _get_basic_autoscaling_config() -> dict: "type": "kuberay", }, "available_node_types": { - "head-group": { + "headgroup": { "max_workers": 0, "min_workers": 0, "node_config": {}, @@ -84,7 +84,7 @@ def _get_basic_autoscaling_config() -> dict: }, "small-group": { "max_workers": 300, - "min_workers": 1, + "min_workers": 0, "node_config": {}, "resources": { "CPU": 1, @@ -97,7 +97,7 @@ def _get_basic_autoscaling_config() -> dict: # and modified max_workers. "gpu-group": { "max_workers": 200, - "min_workers": 1, + "min_workers": 0, "node_config": {}, "resources": { "CPU": 1, @@ -111,7 +111,7 @@ def _get_basic_autoscaling_config() -> dict: # and modified max_workers and node_config. 
"tpu-group": { "max_workers": 4, - "min_workers": 1, + "min_workers": 0, "node_config": {}, "resources": { "CPU": 1, @@ -127,7 +127,7 @@ def _get_basic_autoscaling_config() -> dict: "cluster_synced_files": [], "file_mounts": {}, "file_mounts_sync_continuously": False, - "head_node_type": "head-group", + "head_node_type": "headgroup", "head_setup_commands": [], "head_start_ray_commands": [], "idle_timeout_minutes": 1.0, diff --git a/python/ray/tests/kuberay/test_kuberay_node_provider.py b/python/ray/tests/kuberay/test_kuberay_node_provider.py index 4d2f94c1d086..3d5ea52009cb 100644 --- a/python/ray/tests/kuberay/test_kuberay_node_provider.py +++ b/python/ray/tests/kuberay/test_kuberay_node_provider.py @@ -126,7 +126,7 @@ def test_create_node_cap_at_max( { "raycluster-autoscaler-head-8zsc8": NodeData( kind="head", - type="head-group", + type="headgroup", replica_index=None, ip="10.4.2.6", status="up-to-date", @@ -149,7 +149,7 @@ def test_create_node_cap_at_max( { "raycluster-autoscaler-head-8zsc8": NodeData( kind="head", - type="head-group", + type="headgroup", replica_index=None, ip="10.4.2.6", status="up-to-date", @@ -217,7 +217,7 @@ def mock_get(node_provider, path): { "raycluster-autoscaler-head-8zsc8": NodeData( kind="head", - type="head-group", + type="headgroup", replica_index=None, ip="10.4.2.6", status="up-to-date", diff --git a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py index a3cc669c8c90..8642099b042b 100644 --- a/python/ray/tests/test_advanced.py +++ b/python/ray/tests/test_advanced.py @@ -125,11 +125,11 @@ def test_internal_get_local_ongoing_lineage_reconstruction_tasks( ray_start_cluster_enabled, ): cluster = ray_start_cluster_enabled - cluster.add_node(resources={"head": 1}) + cluster.add_node(resources={"head": 2}) ray.init(address=cluster.address) - worker1 = cluster.add_node(resources={"worker": 1}) + worker1 = cluster.add_node(resources={"worker": 2}) - @ray.remote(resources={"head": 1}) + @ray.remote(num_cpus=0, resources={"head": 1}) class Counter: def __init__(self): self.count = 0 @@ -138,7 +138,9 @@ def inc(self): self.count = self.count + 1 return self.count - @ray.remote(max_retries=-1, num_cpus=0, resources={"worker": 1}) + @ray.remote( + max_retries=-1, num_cpus=0, resources={"worker": 1}, _labels={"key1": "value1"} + ) def task(counter): count = ray.get(counter.inc.remote()) if count > 1: @@ -146,10 +148,31 @@ def task(counter): time.sleep(100000) return [1] * 1024 * 1024 - counter = Counter.remote() - obj = task.remote(counter) + @ray.remote( + max_restarts=-1, + max_task_retries=-1, + num_cpus=0, + resources={"worker": 1}, + _labels={"key2": "value2"}, + ) + class Actor: + def run(self, counter): + count = ray.get(counter.inc.remote()) + if count > 1: + # lineage reconstruction + time.sleep(100000) + return [1] * 1024 * 1024 + + counter1 = Counter.remote() + obj1 = task.remote(counter1) # Wait for task to finish - ray.wait([obj], fetch_local=False) + ray.wait([obj1], fetch_local=False) + + counter2 = Counter.remote() + actor = Actor.remote() + obj2 = actor.run.remote(counter2) + # Wait for actor task to finish + ray.wait([obj2], fetch_local=False) assert len(get_local_ongoing_lineage_reconstruction_tasks()) == 0 @@ -158,16 +181,27 @@ def task(counter): def verify(expected_task_status): lineage_reconstruction_tasks = get_local_ongoing_lineage_reconstruction_tasks() - return ( - len(lineage_reconstruction_tasks) == 1 - and lineage_reconstruction_tasks[0][0].name == "task" - and lineage_reconstruction_tasks[0][0].resources == {"worker": 
1.0} + lineage_reconstruction_tasks.sort(key=lambda task: task[0].name) + assert len(lineage_reconstruction_tasks) == 2 + assert [ + lineage_reconstruction_tasks[0][0].name, + lineage_reconstruction_tasks[1][0].name, + ] == ["Actor.run", "task"] + assert ( + lineage_reconstruction_tasks[0][0].labels == {"key2": "value2"} and lineage_reconstruction_tasks[0][0].status == expected_task_status and lineage_reconstruction_tasks[0][1] == 1 ) + assert ( + lineage_reconstruction_tasks[1][0].labels == {"key1": "value1"} + and lineage_reconstruction_tasks[1][0].status == expected_task_status + and lineage_reconstruction_tasks[1][1] == 1 + ) + + return True wait_for_condition(lambda: verify(common_pb2.TaskStatus.PENDING_NODE_ASSIGNMENT)) - cluster.add_node(resources={"worker": 1}) + cluster.add_node(resources={"worker": 2}) wait_for_condition(lambda: verify(common_pb2.TaskStatus.SUBMITTED_TO_WORKER)) diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 001f38393390..55b6ec73f21d 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -581,7 +581,7 @@ def foo(): # TODO(suquark): The current implementation of `.options()` is so bad that we # cannot even access its options from outside. Here we hack the closures to # achieve our goal. Need futher efforts to clean up the tech debt. - assert f2.remote.__closure__[1].cell_contents == { + assert f2.remote.__closure__[2].cell_contents == { "_metadata": {"namespace": {"a": 11, "b": 2, "c": 3}}, "num_cpus": 1, "num_gpus": 1, @@ -593,7 +593,7 @@ def __init__(self, **options): f3 = foo.options(num_cpus=1, num_gpus=1, **mock_options2(a=11, c=3)) - assert f3.remote.__closure__[1].cell_contents == { + assert f3.remote.__closure__[2].cell_contents == { "_metadata": {"namespace": {"a": 1, "b": 2}, "namespace2": {"a": 11, "c": 3}}, "num_cpus": 1, "num_gpus": 1, diff --git a/python/ray/tests/test_cli_logger.py b/python/ray/tests/test_cli_logger.py index b164f5dc3725..bced27abee01 100644 --- a/python/ray/tests/test_cli_logger.py +++ b/python/ray/tests/test_cli_logger.py @@ -1,4 +1,6 @@ from ray.autoscaler._private import cli_logger +import io +from unittest.mock import patch import pytest @@ -14,6 +16,14 @@ def test_colorful_mock_random_function(): assert cm.bold("abc") == "abc" +def test_pathname(): + # Ensure that the `pathname` of the `LogRecord` points to the + # caller of `cli_logger`, not `cli_logger` itself. 
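The comment above describes the contract; mechanically, Python's logging fills LogRecord.pathname from the stack frame that issued the logging call, so a wrapper layer like cli_logger has to skip its own frames or every record points at the wrapper file. A minimal stdlib-only sketch of that frame-skipping technique (the log_info wrapper is hypothetical, not part of this patch):

import logging

logging.basicConfig(
    level=logging.INFO, format="%(pathname)s:%(lineno)d %(message)s"
)
_logger = logging.getLogger("demo")


def log_info(msg: str) -> None:
    # stacklevel=2 (Python 3.8+) attributes the LogRecord to the frame
    # that called log_info(), not to log_info() itself.
    _logger.info(msg, stacklevel=2)


log_info("attributed to the caller's file and line, not the wrapper's")

However cli_logger implements it internally, the test below asserts only the observable behavior: the record's pathname names the calling test file.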
+ with patch("sys.stdout", new=io.StringIO()) as mock_stdout: + cli_logger.cli_logger.info("123") + assert "test_cli_logger.py" in mock_stdout.getvalue() + + if __name__ == "__main__": import os import sys diff --git a/python/ray/tests/test_gcs_fault_tolerance.py b/python/ray/tests/test_gcs_fault_tolerance.py index bca9b83021de..eaecd0c3a073 100644 --- a/python/ray/tests/test_gcs_fault_tolerance.py +++ b/python/ray/tests/test_gcs_fault_tolerance.py @@ -20,6 +20,7 @@ wait_for_condition, wait_for_pid_to_exit, run_string_as_driver, + redis_sentinel_replicas, ) from ray.job_submission import JobSubmissionClient, JobStatus from ray._raylet import GcsClient @@ -871,6 +872,120 @@ def f(): wait_for_pid_to_exit(gcs_server_pid, 10000) +@pytest.mark.parametrize( + "ray_start_cluster_head_with_external_redis_sentinel", + [ + generate_system_config_map( + gcs_rpc_server_reconnect_timeout_s=60, + gcs_server_request_timeout_seconds=10, + redis_db_connect_retries=50, + ) + ], + indirect=True, +) +def test_redis_with_sentinel_failureover( + ray_start_cluster_head_with_external_redis_sentinel, +): + """This test is to cover ray cluster's behavior with Redis sentinel. + The expectation is Redis sentinel should manage failover + automatically, and GCS can continue talking to the same address + without any human intervention on Redis. + For this test we ensure: + - When Redis master failed, Ray should crash (TODO: GCS should + autommatically try re-connect to sentinel). + - When restart Ray, it should continue talking to sentinel, which + should return information about new master. + """ + cluster = ray_start_cluster_head_with_external_redis_sentinel + import redis + + redis_addr = os.environ.get("RAY_REDIS_ADDRESS") + ip, port = redis_addr.split(":") + redis_cli = redis.Redis(ip, port) + print(redis_cli.info("sentinel")) + redis_name = redis_cli.info("sentinel")["master0"]["name"] + + def get_sentinel_nodes(): + leader_address = ( + redis_cli.sentinel_master(redis_name)["ip"], + redis_cli.sentinel_master(redis_name)["port"], + ) + follower_addresses = [ + (x["ip"], x["port"]) for x in redis_cli.sentinel_slaves(redis_name) + ] + return [leader_address] + follower_addresses + + wait_for_condition(lambda: len(get_sentinel_nodes()) == redis_sentinel_replicas()) + + @ray.remote(max_restarts=-1) + class Counter: + def r(self, v): + return v + + def pid(self): + import os + + return os.getpid() + + c = Counter.options(name="c", namespace="test", lifetime="detached").remote() + c_pid = ray.get(c.pid.remote()) + c_process = psutil.Process(pid=c_pid) + r = ray.get(c.r.remote(10)) + assert r == 10 + + head_node = cluster.head_node + gcs_server_process = head_node.all_processes["gcs_server"][0].process + gcs_server_pid = gcs_server_process.pid + + leader_cli = redis.Redis(*get_sentinel_nodes()[0]) + leader_pid = leader_cli.info()["process_id"] + follower_cli = [redis.Redis(*x) for x in get_sentinel_nodes()[1:]] + + # Wait until all data is updated in the replica + leader_cli.set("_hole", "0") + wait_for_condition(lambda: all([b"_hole" in f.keys("*") for f in follower_cli])) + current_leader = get_sentinel_nodes()[0] + + # Now kill pid + leader_process = psutil.Process(pid=leader_pid) + leader_process.kill() + + print(">>> Waiting gcs server to exit", gcs_server_pid) + wait_for_pid_to_exit(gcs_server_pid, 1000) + print("GCS killed") + + wait_for_condition(lambda: current_leader != get_sentinel_nodes()[0]) + + # Kill Counter actor. 
It should restart after GCS is back + c_process.kill() + # Cleanup the in memory data and then start gcs + cluster.head_node.kill_gcs_server(False) + + print("Start gcs") + sleep(2) + cluster.head_node.start_gcs_server() + + assert len(ray.nodes()) == 1 + assert ray.nodes()[0]["alive"] + + driver_script = f""" +import ray +ray.init('{cluster.address}') +@ray.remote +def f(): + return 10 +assert ray.get(f.remote()) == 10 + +c = ray.get_actor("c", namespace="test") +v = ray.get(c.r.remote(10)) +assert v == 10 +print("DONE") +""" + + # Make sure the cluster is usable + wait_for_condition(lambda: "DONE" in run_string_as_driver(driver_script)) + + @pytest.mark.parametrize( "ray_start_regular", [ diff --git a/python/ray/tests/test_gcs_utils.py b/python/ray/tests/test_gcs_utils.py index c25beac6e598..50862f386346 100644 --- a/python/ray/tests/test_gcs_utils.py +++ b/python/ray/tests/test_gcs_utils.py @@ -267,7 +267,7 @@ async def test_gcs_aio_client_is_async(ray_start_regular): gcs_client = gcs_utils.GcsAioClient(address=gcs_address, nums_reconnect_retry=0) await gcs_client.internal_kv_put(b"A", b"B", False, b"NS", timeout=2) - with async_timeout.timeout(3): + async with async_timeout.timeout(3): none, result = await asyncio.gather( asyncio.sleep(2), gcs_client.internal_kv_get(b"A", b"NS", timeout=2) ) diff --git a/python/ray/tests/test_logging_2.py b/python/ray/tests/test_logging_2.py index b48b04e44a59..9d5be165f9ac 100644 --- a/python/ray/tests/test_logging_2.py +++ b/python/ray/tests/test_logging_2.py @@ -54,11 +54,14 @@ def f(): "worker_id": runtime_context.get_worker_id(), "node_id": runtime_context.get_node_id(), "task_id": runtime_context.get_task_id(), + "task_name": runtime_context.get_task_name(), + "task_func_name": runtime_context.get_task_function_name(), } for attr in should_exist: assert hasattr(record, attr) assert getattr(record, attr) == expected_values[attr] assert not hasattr(record, "actor_id") + assert not hasattr(record, "actor_name") obj_ref = f.remote() ray.get(obj_ref) @@ -77,7 +80,10 @@ def f(self): "worker_id": runtime_context.get_worker_id(), "node_id": runtime_context.get_node_id(), "actor_id": runtime_context.get_actor_id(), + "actor_name": runtime_context.get_actor_name(), "task_id": runtime_context.get_task_id(), + "task_name": runtime_context.get_task_name(), + "task_func_name": runtime_context.get_task_function_name(), } for attr in should_exist: assert hasattr(record, attr) diff --git a/python/ray/tests/test_output.py b/python/ray/tests/test_output.py index 4b84b8285534..505a02e84226 100644 --- a/python/ray/tests/test_output.py +++ b/python/ray/tests/test_output.py @@ -575,26 +575,27 @@ def test_disable_driver_logs_breakpoint(): @ray.remote def f(): while True: - start_time = time.time() - while time.time() - start_time < 1: + start_time = time.monotonic() + while time.monotonic() - start_time < 1: time.sleep(0.1) + print(f"slept {time.monotonic() - start_time} seconds") print("hello there") sys.stdout.flush() def kill(): - start_time = time.time() - while time.time() - start_time < 5: + start_time = time.monotonic() + while time.monotonic() - start_time < 5: time.sleep(0.1) sys.stdout.flush() - start_time = time.time() - while time.time() - start_time < 1: + start_time = time.monotonic() + while time.monotonic() - start_time < 1: time.sleep(0.1) os._exit(0) t = threading.Thread(target=kill) t.start() x = f.remote() -time.sleep(2) # Enough time to print one hello. +time.sleep(3) # Enough time to print one hello. breakpoint() # This should disable worker logs. 
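The Redis Sentinel fixtures and failover test above rely on clients discovering the master through a sentinel rather than through a fixed address, which is what lets the GCS keep one configured address across failovers. A minimal sketch of that client-side pattern with the redis-py package (the service name "mymaster" and the port are illustrative placeholders, and a sentinel is assumed to be running):

from redis.sentinel import Sentinel

# One or more sentinel endpoints; the sentinel tracks the master and replicas.
sentinel = Sentinel([("127.0.0.1", 26379)], socket_timeout=0.5)

# Ask the sentinel where the current master lives.
host, port = sentinel.discover_master("mymaster")

# Or get a client that transparently re-resolves the master after a failover.
master = sentinel.master_for("mymaster", socket_timeout=0.5)
master.set("key", "value")

The test fixtures do the equivalent by hand with sentinel_master() and sentinel_slaves() so that they can address and kill individual Redis processes.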
""" @@ -602,7 +603,7 @@ def kill(): out_str = proc.stdout.read().decode("ascii") num_hello = out_str.count("hello") assert num_hello >= 1, out_str - assert num_hello < 3, out_str + assert num_hello <= 3, out_str assert "Temporarily disabling Ray worker logs" in out_str, out_str # TODO(ekl) nice to test resuming logs too, but it's quite complicated diff --git a/python/ray/tests/test_runtime_context.py b/python/ray/tests/test_runtime_context.py index 3835a7e7eb3a..23bb39c6ab4d 100644 --- a/python/ray/tests/test_runtime_context.py +++ b/python/ray/tests/test_runtime_context.py @@ -264,6 +264,136 @@ def test_auto_init(shutdown_only): assert ray.is_initialized() +def test_get_task_name(shutdown_only): + ray.init() + + # for a normal task + @ray.remote + def get_task_name_for_normal_task(): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "normal_task_name" + task_name = ray.get( + get_task_name_for_normal_task.options(name=expected_task_name).remote() + ) + assert ( + task_name == expected_task_name + ), f"Check normal task name failed. expected={expected_task_name}, \ +actual={task_name}" + + # for an actor task + @ray.remote + class Actor: + def get_task_name_for_actor_task(self): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "Actor.get_task_name_for_actor_task" + actor = Actor.remote() + task_name = ray.get(actor.get_task_name_for_actor_task.remote()) + assert ( + task_name == expected_task_name + ), f"Check actor task name failed. expected={expected_task_name}, \ +actual={task_name}" + + # for a threaded actor task + @ray.remote + class ThreadedActor: + def get_task_name_for_threaded_actor_task(self): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "ThreadedActor.get_task_name_for_threaded_actor_task" + threaded_actor = ThreadedActor.options(max_concurrency=2).remote() + task_name = ray.get(threaded_actor.get_task_name_for_threaded_actor_task.remote()) + assert ( + task_name == expected_task_name + ), f"Check actor task name failed. expected={expected_task_name}, \ +actual={task_name}" + + # for a async actor task + @ray.remote + class AsyncActor: + async def get_task_name_for_async_actor_task(self): + return ray.get_runtime_context().get_task_name() + + expected_task_name = "AsyncActor.get_task_name_for_async_actor_task" + async_actor = AsyncActor.remote() + task_name = ray.get(async_actor.get_task_name_for_async_actor_task.remote()) + assert ( + task_name == expected_task_name + ), f"Check actor task name failed. expected={expected_task_name}, \ +actual={task_name}" + + +def test_get_task_function_name(shutdown_only): + ray.init() + + # for a normal task + @ray.remote + def get_task_function_name_for_normal_task(): + return ray.get_runtime_context().get_task_function_name() + + expected_task_function_name = __name__ + ".get_task_function_name_for_normal_task" + task_function_name = ray.get(get_task_function_name_for_normal_task.remote()) + assert ( + task_function_name == expected_task_function_name + ), f"Check normal task function failed. 
expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+    # for an actor task
+    @ray.remote
+    class Actor:
+        def get_task_function_name_for_actor_task(self):
+            return ray.get_runtime_context().get_task_function_name()
+
+    expected_task_function_name = (
+        __name__ + ".Actor.get_task_function_name_for_actor_task"
+    )
+    actor = Actor.remote()
+    task_function_name = ray.get(actor.get_task_function_name_for_actor_task.remote())
+    assert (
+        task_function_name == expected_task_function_name
+    ), f"Check actor task function failed. expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+    # for a threaded actor task
+    @ray.remote
+    class ThreadedActor:
+        def get_task_function_name_for_threaded_actor_task(self):
+            return ray.get_runtime_context().get_task_function_name()
+
+    expected_task_function_name = (
+        __name__ + ".ThreadedActor.get_task_function_name_for_threaded_actor_task"
+    )
+    threaded_actor = ThreadedActor.options(max_concurrency=2).remote()
+    task_function_name = ray.get(
+        threaded_actor.get_task_function_name_for_threaded_actor_task.remote()
+    )
+    assert (
+        task_function_name == expected_task_function_name
+    ), f"Check actor task function failed. expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+    # for an async actor task
+    @ray.remote
+    class AsyncActor:
+        async def get_task_function_name_for_async_actor_task(self):
+            return ray.get_runtime_context().get_task_function_name()
+
+    expected_task_function_name = (
+        __name__
+        + ".test_get_task_function_name.<locals>.AsyncActor.\
+get_task_function_name_for_async_actor_task"
+    )
+    async_actor = AsyncActor.remote()
+    task_function_name = ray.get(
+        async_actor.get_task_function_name_for_async_actor_task.remote()
+    )
+    assert (
+        task_function_name == expected_task_function_name
+    ), f"Check actor task function failed. expected={expected_task_function_name}, \
+actual={task_function_name}"
+
+
 def test_async_actor_task_id(shutdown_only):
     ray.init()
diff --git a/python/ray/util/collective/collective_group/nccl_util.py b/python/ray/util/collective/collective_group/nccl_util.py
index 05b05ef33a27..221d5885c411 100644
--- a/python/ray/util/collective/collective_group/nccl_util.py
+++ b/python/ray/util/collective/collective_group/nccl_util.py
@@ -63,7 +63,7 @@
 }
 
 # Older versions of cupy don't support bfloat16.
-    if hasattr(nccl, "NCCL_BFlOAT16"):
+    if hasattr(nccl, "NCCL_BFLOAT16"):
        TORCH_NCCL_DTYPE_MAP[torch.bfloat16] = nccl.NCCL_BFLOAT16
 
 TORCH_NUMPY_DTYPE_MAP = {
diff --git a/python/requirements/ml/data-requirements.txt b/python/requirements/ml/data-requirements.txt
index 6e2baa5592fe..de91b5010c7c 100644
--- a/python/requirements/ml/data-requirements.txt
+++ b/python/requirements/ml/data-requirements.txt
@@ -1,8 +1,8 @@
 # Used by CI for datasets and docs. 
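The <locals> component in the async-actor expectation above is ordinary Python qualified-name behavior for classes defined inside a function, not anything Ray-specific. A quick stdlib-only illustration:

def make_actor_class():
    class AsyncActor:
        async def method(self):
            return "ok"

    return AsyncActor


cls = make_actor_class()
# Classes (and functions) defined inside a function carry "<locals>"
# in their qualified name:
print(cls.__qualname__)         # make_actor_class.<locals>.AsyncActor
print(cls.method.__qualname__)  # make_actor_class.<locals>.AsyncActor.method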
# https://github.com/ray-project/ray/pull/29448#discussion_r1006256498 -dask[complete]==2022.10.1; python_version < '3.12' -distributed==2022.10.1; python_version < '3.12' +dask[complete]==2022.10.2; python_version < '3.12' +distributed==2022.10.2; python_version < '3.12' dask[complete]==2024.6.0; python_version >= '3.12' distributed==2024.6.0; python_version >= '3.12' aioboto3==11.2.0 diff --git a/python/requirements/ml/data-test-requirements.txt b/python/requirements/ml/data-test-requirements.txt index d2d435b09d88..9ad22340d031 100644 --- a/python/requirements/ml/data-test-requirements.txt +++ b/python/requirements/ml/data-test-requirements.txt @@ -18,4 +18,5 @@ delta-sharing pytest-mock decord snowflake-connector-python -pyiceberg[sql-sqlite]==0.7.0 \ No newline at end of file +pyiceberg[sql-sqlite]==0.7.0 +hudi==0.2.0rc1 diff --git a/python/requirements/ml/rllib-test-requirements.txt b/python/requirements/ml/rllib-test-requirements.txt index 027c57446e60..c67bf2cec445 100644 --- a/python/requirements/ml/rllib-test-requirements.txt +++ b/python/requirements/ml/rllib-test-requirements.txt @@ -5,7 +5,7 @@ # Atari ale_py==0.10.1 imageio==2.34.2 -opencv-python==4.8.1.78 +opencv-python-headless==4.8.1.78 # For testing MuJoCo envs with gymnasium. mujoco==3.2.4 diff --git a/python/requirements/test-requirements.txt b/python/requirements/test-requirements.txt index b73f554ec524..175affd3e7f7 100644 --- a/python/requirements/test-requirements.txt +++ b/python/requirements/test-requirements.txt @@ -14,8 +14,7 @@ beautifulsoup4==4.11.1 boto3==1.26.76 # Todo: investigate if we can get rid of this and exchange for ray.cloudpickle cloudpickle==2.2.0 -# Keep in sync with `ci/build/upload_build_info.sh` -cryptography==38.0.1 +cryptography==42.0.5 cython==0.29.37 fastapi==0.109.2 feather-format==0.4.1 @@ -45,8 +44,7 @@ Pillow==10.3.0; platform_system != "Windows" proxy.py==2.4.3 pydantic==2.5.0 pydot==1.4.2 -# Keep in sync with `ci/build/upload_build_info.sh` -PyOpenSSL==23.0.0 +pyopenssl==24.2.1 pygame==2.5.2 Pygments==2.18.0 pymongo==4.3.2 diff --git a/python/requirements_compiled.txt b/python/requirements_compiled.txt index 1347afee24c5..f3b39647b0e8 100644 --- a/python/requirements_compiled.txt +++ b/python/requirements_compiled.txt @@ -4,24 +4,18 @@ absl-py==1.4.0 # via # array-record - # chex # dm-control # dm-env - # dopamine-rl # etils # labmaze # ml-collections # mujoco # open-spiel - # optax - # orbax-checkpoint - # recsim # tensorboard # tensorflow # tensorflow-datasets # tensorflow-metadata # tensorflow-probability - # tf-slim accelerate==0.28.0 # via -r /ray/ci/../python/requirements/ml/core-requirements.txt adagio==0.2.4 @@ -76,9 +70,7 @@ aiosqlite==0.19.0 alabaster==0.7.13 # via sphinx ale-py==0.10.1 - # via - # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt - # gymnasium + # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt alembic==1.12.1 # via # aim @@ -147,10 +139,6 @@ attrs==21.4.0 # open-spiel # sarif-om # semgrep -autorom==0.6.1 ; platform_machine != "arm64" - # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt -autorom-accept-rom-license==0.6.1 - # via autorom aws-sam-translator==1.81.0 # via cfn-lint aws-xray-sdk==2.12.1 @@ -242,8 +230,6 @@ braceexpand==0.1.7 # via webdataset bracex==2.4 # via wcmatch -cached-property==1.5.2 - # via orbax-checkpoint cachetools==5.3.2 # via # aim @@ -272,16 +258,12 @@ charset-normalizer==3.3.2 # via # requests # snowflake-connector-python -chex==0.1.7 - # via optax clang-format==12.0.1 # via 
-r /ray/ci/../python/requirements/lint-requirements.txt click==8.1.7 # via # -r /ray/ci/../python/requirements.txt # aim - # autorom - # autorom-accept-rom-license # black # click-option-group # dask @@ -352,7 +334,7 @@ crc32c==2.3 # via -r /ray/ci/../python/requirements/ml/data-requirements.txt crcmod==1.7 # via gsutil -cryptography==38.0.1 +cryptography==42.0.5 # via # -r /ray/ci/../python/requirements/test-requirements.txt # adal @@ -378,7 +360,7 @@ cython==0.29.37 # via # -r /ray/ci/../python/requirements/test-requirements.txt # gpy -dask==2022.10.1 ; python_version < "3.12" +dask==2022.10.2 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/data-requirements.txt # distributed @@ -416,7 +398,7 @@ dill==0.3.7 # multiprocess distlib==0.3.7 # via virtualenv -distributed==2022.10.1 ; python_version < "3.12" +distributed==2022.10.2 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/data-requirements.txt # dask @@ -429,7 +411,6 @@ dm-env==1.6 dm-tree==0.1.8 # via # -r /ray/ci/../python/requirements.txt - # chex # dm-control # dm-env # tensorflow-datasets @@ -450,10 +431,6 @@ docutils==0.19 # -r /ray/ci/../python/requirements/lint-requirements.txt # myst-parser # sphinx -dopamine-rl==4.0.5 ; (sys_platform != "darwin" or platform_machine != "arm64") and python_version < "3.12" - # via - # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt - # recsim dulwich==0.21.6 # via comet-ml ecdsa==0.18.0 @@ -472,7 +449,7 @@ etils==1.5.2 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt # array-record - # orbax-checkpoint + # mujoco # tensorflow-datasets evaluate==0.4.0 # via -r /ray/ci/../python/requirements/ml/train-test-requirements.txt @@ -545,8 +522,6 @@ flatbuffers==23.5.26 # onnxruntime # tensorflow # tf2onnx -flax==0.7.2 - # via dopamine-rl fonttools==4.45.1 # via matplotlib fqdn==1.5.1 @@ -590,10 +565,6 @@ gast==0.4.0 # tensorflow-probability gcs-oauth2-boto-plugin==3.0 # via gsutil -gin-config==0.5.0 - # via - # dopamine-rl - # recsim gitdb==4.0.11 # via gitpython gitpython==3.1.40 @@ -677,6 +648,8 @@ gradio-client==0.6.1 # via gradio graphql-core==3.2.3 # via moto +graphviz==0.20.3 + # via -r /ray/ci/../python/requirements/test-requirements.txt greenlet==3.0.1 # via sqlalchemy grpcio==1.66.2 ; sys_platform != "darwin" @@ -704,7 +677,6 @@ gunicorn==20.1.0 gymnasium==1.0.0 # via # -r /ray/ci/../python/requirements.txt - # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt # minigrid # pettingzoo # shimmy @@ -745,6 +717,8 @@ httpx==0.24.1 # -r /ray/ci/../python/requirements/test-requirements.txt # gradio # gradio-client +hudi==0.2.0rc1 + # via -r /ray/ci/../python/requirements/ml/data-test-requirements.txt huggingface-hub==0.19.4 # via # accelerate @@ -771,7 +745,7 @@ idna==3.7 # snowflake-connector-python # trustme # yarl -imageio==2.34.2 ; python_version < "3.12" +imageio==2.34.2 # via # -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt # moviepy @@ -788,10 +762,8 @@ importlib-metadata==6.11.0 # myst-nb importlib-resources==5.13.0 # via - # ale-py # etils # gradio - # orbax-checkpoint # prophet iniconfig==2.0.0 # via pytest @@ -829,21 +801,6 @@ isort==5.10.1 # via -r /ray/ci/../python/requirements/lint-requirements.txt itsdangerous==2.1.2 # via flask -jax==0.4.13 - # via - # chex - # dopamine-rl - # flax - # optax - # orbax-checkpoint -jax-jumpy==1.0.0 - # via gymnasium -jaxlib==0.4.13 - # via - # chex - # dopamine-rl - # optax - # orbax-checkpoint jedi==0.19.1 # via 
ipython jinja2==3.1.2 @@ -908,7 +865,6 @@ jsonschema==4.17.3 # jsonschema-spec # jupyter-events # jupyterlab-server - # kaggle-environments # nbformat # openapi-schema-validator # openapi-spec-validator @@ -966,8 +922,6 @@ jupyterlab-widgets==3.0.11 # via ipywidgets jupytext==1.16.3 # via -r /ray/ci/../python/requirements/test-requirements.txt -kaggle-environments==1.7.11 - # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt keras==2.15.0 # via tensorflow kiwisolver==1.4.5 @@ -1060,11 +1014,7 @@ mistune==0.8.4 ml-collections==0.1.1 # via open-spiel ml-dtypes==0.3.2 - # via - # jax - # jaxlib - # tensorflow - # tensorstore + # via tensorflow mlagents-envs==0.28.0 # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt mlflow==2.9.2 @@ -1105,9 +1055,7 @@ msgpack==1.0.7 # -r /ray/ci/../python/requirements.txt # -r /ray/ci/../python/requirements/ml/rllib-requirements.txt # distributed - # flax # msgpack-numpy - # orbax-checkpoint # ray msgpack-numpy==0.4.8 # via -r /ray/ci/../python/requirements/ml/rllib-requirements.txt @@ -1177,7 +1125,6 @@ nest-asyncio==1.5.8 # nbclassic # nbclient # notebook - # orbax-checkpoint netifaces==0.11.0 # via # hpbandster @@ -1218,7 +1165,6 @@ numpy==1.26.4 # altair # bayesian-optimization # bokeh - # chex # cma # cmaes # cmdstanpy @@ -1231,10 +1177,8 @@ numpy==1.26.4 # deepspeed # dm-control # dm-env - # dopamine-rl # etils # evaluate - # flax # gpy # gradio # gymnasium @@ -1242,9 +1186,6 @@ numpy==1.26.4 # hpbandster # hyperopt # imageio - # jax - # jax-jumpy - # jaxlib # labmaze # lightgbm # matplotlib @@ -1266,9 +1207,7 @@ numpy==1.26.4 # open-spiel # opencv-python # opt-einsum - # optax # optuna - # orbax-checkpoint # pandas # paramz # patsy @@ -1293,7 +1232,6 @@ numpy==1.26.4 # tensorflow # tensorflow-datasets # tensorflow-probability - # tensorstore # tf2onnx # tifffile # tinyscaler @@ -1336,7 +1274,7 @@ opencensus-context==0.1.3 opencensus-proto==0.1.0 # via opentelemetry-exporter-opencensus opencv-python==4.8.1.78 - # via dopamine-rl + # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt openpyxl==3.0.10 # via -r /ray/ci/../python/requirements/test-requirements.txt opentelemetry-api==1.1.0 @@ -1369,15 +1307,10 @@ opentelemetry-semantic-conventions==0.20b0 # via opentelemetry-sdk opt-einsum==3.3.0 # via - # jax # pyro-ppl # tensorflow -optax==0.1.7 - # via flax optuna==3.2.0 # via -r /ray/ci/../python/requirements/ml/tune-requirements.txt -orbax-checkpoint==0.2.3 - # via flax orjson==3.9.10 # via gradio packaging==23.0 @@ -1439,7 +1372,6 @@ pandas==1.5.3 ; python_version < "3.12" # dask # datasets # delta-sharing - # dopamine-rl # evaluate # gradio # mlflow @@ -1491,7 +1423,6 @@ pillow==10.3.0 ; platform_system != "Windows" # -r /ray/ci/../python/requirements/test-requirements.txt # aim # bokeh - # dopamine-rl # gradio # imageio # matplotlib @@ -1638,7 +1569,6 @@ pyflakes==2.3.1 pygame==2.5.2 # via # -r /ray/ci/../python/requirements/test-requirements.txt - # dopamine-rl # minigrid pyglet==1.5.15 # via -r /ray/ci/../python/requirements/ml/rllib-requirements.txt @@ -1675,7 +1605,7 @@ pyopengl==3.1.7 # via # dm-control # mujoco -pyopenssl==23.0.0 +pyopenssl==24.2.1 # via # -r /ray/ci/../python/requirements.txt # -r /ray/ci/../python/requirements/anyscale-requirements.txt @@ -1809,7 +1739,6 @@ pyyaml==6.0.1 # dask # datasets # distributed - # flax # gradio # huggingface-hub # jsonschema-spec @@ -1826,7 +1755,6 @@ pyyaml==6.0.1 # myst-nb # myst-parser # optuna - # orbax-checkpoint # pymars # 
pytorch-lightning # ray @@ -1862,8 +1790,6 @@ requests==2.31.0 # -r /ray/ci/../python/requirements.txt # adal # aim - # autorom - # autorom-accept-rom-license # azure-cli-core # azure-core # comet-ml @@ -1937,7 +1863,6 @@ rich==13.3.2 # -r /ray/ci/../python/requirements.txt # -r /ray/ci/../python/requirements/ml/rllib-requirements.txt # comet-ml - # flax # memray # pyiceberg # semgrep @@ -1989,8 +1914,6 @@ scipy==1.11.4 # gpy # hpbandster # hyperopt - # jax - # jaxlib # lightgbm # linear-operator # medpy @@ -1998,7 +1921,6 @@ scipy==1.11.4 # open-spiel # paramz # pymars - # recsim # scikit-image # scikit-learn # statsforecast @@ -2154,6 +2076,8 @@ statsmodels==0.14.0 # via # hpbandster # statsforecast +strictyaml==1.7.3 + # via pyiceberg supersuit==3.9.3 # via -r /ray/ci/../python/requirements/ml/rllib-test-requirements.txt sympy==1.13.1 @@ -2188,10 +2112,7 @@ tensorboardx==2.6.2.2 # -r /ray/ci/../python/requirements/test-requirements.txt # pytorch-lightning tensorflow==2.15.1 ; python_version < "3.12" and (sys_platform != "darwin" or platform_machine != "arm64") - # via - # -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt - # dopamine-rl - # recsim + # via -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt tensorflow-datasets==4.9.3 ; python_version < "3.12" # via # -r /ray/ci/../python/requirements/ml/data-test-requirements.txt @@ -2207,13 +2128,7 @@ tensorflow-io-gcs-filesystem==0.31.0 ; python_version < "3.12" tensorflow-metadata==1.14.0 # via tensorflow-datasets tensorflow-probability==0.23.0 ; python_version < "3.12" - # via - # -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt - # dopamine-rl -tensorstore==0.1.63 - # via - # flax - # orbax-checkpoint + # via -r /ray/ci/../python/requirements/ml/dl-cpu-requirements.txt termcolor==2.4.0 # via # pytest-shutil @@ -2227,8 +2142,6 @@ terminado==0.18.1 # notebook testfixtures==7.0.0 # via -r /ray/ci/../python/requirements/test-requirements.txt -tf-slim==1.1.0 - # via dopamine-rl tf2onnx==1.15.1 ; sys_platform != "darwin" or platform_machine != "arm64" # via -r /ray/ci/../python/requirements/ml/rllib-requirements.txt threadpoolctl==3.1.0 @@ -2258,7 +2171,6 @@ tomlkit==0.13.0 toolz==0.12.1 # via # altair - # chex # dask # distributed # partd @@ -2394,7 +2306,6 @@ typing-extensions==4.8.0 # configspace # etils # fastapi - # flax # gradio # gradio-client # gymnasium @@ -2403,7 +2314,6 @@ typing-extensions==4.8.0 # mypy # myst-nb # nevergrad - # orbax-checkpoint # pydantic # pydantic-core # pytorch-lightning diff --git a/python/setup.py b/python/setup.py index 2e6958d021da..16017fa5447a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -228,7 +228,7 @@ def get_packages(self): pandas_dep = "pandas >= 1.3" numpy_dep = "numpy >= 1.20" pyarrow_deps = [ - "pyarrow >= 6.0.1", + "pyarrow >= 9.0.0", "pyarrow <18; sys_platform == 'darwin' and platform_machine == 'x86_64'", ] setup_spec.extras = { diff --git a/release/BUILD.bazel b/release/BUILD.bazel index a09070d9b313..f269add55138 100644 --- a/release/BUILD.bazel +++ b/release/BUILD.bazel @@ -309,7 +309,6 @@ py_library( bk_require("pybuildkite"), bk_require("pygithub"), bk_require("requests"), - bk_require("retry"), ], ) @@ -624,3 +623,18 @@ py_test( bk_require("pytest"), ], ) + +py_test( + name = "test_retry", + size = "small", + srcs = ["ray_release/tests/test_retry.py"], + exec_compatible_with = ["//:hermetic_python"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":ray_release", + bk_require("pytest"), + ], +) diff --git 
a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml index 7966578a31b1..9e6cabef573d 100644 --- a/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml +++ b/release/air_examples/dolly_v2_lightning_fsdp_finetuning/dolly_v2_fsdp_compute_aws.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 15 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml index e315fc0b9f88..6ed2aa738ed9 100644 --- a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_aws.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 7 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml index 7be4f970f0b1..be93c6d0aac6 100644 --- a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute_gce.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 7 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml index 1ac93d59eb91..a2b79c9cc489 100644 --- a/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml +++ b/release/air_examples/opt_deepspeed_batch_inference/30b_deepspeed_compute.yaml @@ -7,7 +7,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml index 17f69c81a906..3fe5ec7c083e 100644 --- a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml +++ b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml @@ -12,7 +12,7 @@ worker_node_types: max_workers: 15 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml index df7c2a8958a0..150990710680 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml index ee7d1436e7cf..c543315e24f3 100644 --- a/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml +++ b/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml @@ -14,7 +14,7 @@ 
worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml b/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml index be1577e57401..b45a2c038d78 100644 --- a/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml +++ b/release/air_tests/air_benchmarks/mlperf-train/compute_cpu_16.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/air_tests/horovod/compute_tpl_aws.yaml b/release/air_tests/horovod/compute_tpl_aws.yaml index d14997e25f0b..2ef09f059167 100644 --- a/release/air_tests/horovod/compute_tpl_aws.yaml +++ b/release/air_tests/horovod/compute_tpl_aws.yaml @@ -15,7 +15,7 @@ worker_node_types: min_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml b/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml index cddae87016f6..f377139d6f22 100644 --- a/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml +++ b/release/benchmark-worker-startup/only_head_node_1gpu_64cpu.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 0 -aws: +advanced_configurations_json: # Fix the volume size so that IOPS is constant even if the default changes. BlockDeviceMappings: - DeviceName: /dev/sda1 diff --git a/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml b/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml index 47f435a873ac..2a8de6119ea3 100644 --- a/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml +++ b/release/benchmarks/distributed/many_nodes_tests/compute_config.yaml @@ -4,7 +4,7 @@ region: us-west-2 # NFS needs to be disabled for this test, since the test spawns too many nodes # and may hit the limit on the # of clients. 
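The same aws: to advanced_configurations_json: key rename recurs across all of the release compute configs in this patch. A hedged sketch of a one-shot migration, assuming PyYAML and using only key names visible in this diff:

import yaml


def migrate_compute_config(text: str) -> str:
    """Rename the legacy `aws` section, keeping its contents intact."""
    config = yaml.safe_load(text)
    if "aws" in config:
        config["advanced_configurations_json"] = config.pop("aws")
    return yaml.safe_dump(config, sort_keys=False)


example = "region: us-west-2\naws:\n  TagSpecifications: []\n"
print(migrate_compute_config(example))
# region: us-west-2
# advanced_configurations_json:
#   TagSpecifications: []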
-aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: @@ -23,7 +23,7 @@ head_node_type: worker_node_types: - name: worker_node - instance_type: m5.large + instance_type: m6i.large min_workers: 500 max_workers: 2000 use_spot: false diff --git a/release/benchmarks/object_store.yaml b/release/benchmarks/object_store.yaml index 6908c9e9bf7f..5353a1009c6b 100644 --- a/release/benchmarks/object_store.yaml +++ b/release/benchmarks/object_store.yaml @@ -5,14 +5,14 @@ max_workers: 49 head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: custom_resources: node: 1 worker_node_types: - name: worker_node - instance_type: m4.2xlarge + instance_type: m6i.2xlarge min_workers: 49 max_workers: 49 use_spot: false diff --git a/release/benchmarks/single_node.yaml b/release/benchmarks/single_node.yaml index 94297cbfbb8e..d94ea397f073 100644 --- a/release/benchmarks/single_node.yaml +++ b/release/benchmarks/single_node.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 0 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/dashboard/agent_stress_compute.yaml b/release/dashboard/agent_stress_compute.yaml index 340b63778529..5e3859905a07 100644 --- a/release/dashboard/agent_stress_compute.yaml +++ b/release/dashboard/agent_stress_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/jobs_tests/compute_tpl_4_xlarge.yaml b/release/jobs_tests/compute_tpl_4_xlarge.yaml index 7da54ffb0d6c..5f21711662d7 100644 --- a/release/jobs_tests/compute_tpl_4_xlarge.yaml +++ b/release/jobs_tests/compute_tpl_4_xlarge.yaml @@ -16,7 +16,7 @@ worker_node_types: max_workers: 4 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml b/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml index b2c81c32c83c..176c282a50f5 100644 --- a/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml +++ b/release/jobs_tests/compute_tpl_gce_4_xlarge.yaml @@ -1,6 +1,6 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 4 @@ -16,7 +16,7 @@ worker_node_types: max_workers: 4 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/jobs_tests/compute_tpl_gpu_node.yaml b/release/jobs_tests/compute_tpl_gpu_node.yaml index 36a2a3e0ce8c..27700e794664 100644 --- a/release/jobs_tests/compute_tpl_gpu_node.yaml +++ b/release/jobs_tests/compute_tpl_gpu_node.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/jobs_tests/compute_tpl_gpu_worker.yaml b/release/jobs_tests/compute_tpl_gpu_worker.yaml index 15955339513f..a98edd4945dc 100644 --- a/release/jobs_tests/compute_tpl_gpu_worker.yaml +++ b/release/jobs_tests/compute_tpl_gpu_worker.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/k8s_tests/compute_tpl.yaml b/release/k8s_tests/compute_tpl.yaml index c29a684f9efd..a5a788a2eec7 100644 --- a/release/k8s_tests/compute_tpl.yaml +++ b/release/k8s_tests/compute_tpl.yaml @@ -9,7 +9,7 @@ head_node_type: 
worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml b/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml index c47e9572ceb1..7bed4fc1066c 100644 --- a/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml +++ b/release/k8s_tests/ray_v1alpha1_rayservice_template.yaml @@ -126,7 +126,7 @@ spec: serviceType: ClusterIP # the pod replicas in this group typed head (assuming there could be more than 1 in the future) replicas: 1 - # logical group name, for this called head-group, also can be functional + # logical group name, for this called headgroup, also can be functional # pod type head or worker # rayNodeType: head # Not needed since it is under the headgroup # the following params are used to complete the ray start: ray start --head --block --redis-port=6379 ... diff --git a/release/long_running_distributed_tests/compute_tpl.yaml b/release/long_running_distributed_tests/compute_tpl.yaml index 1fe5db1b9f87..68c144d651d3 100644 --- a/release/long_running_distributed_tests/compute_tpl.yaml +++ b/release/long_running_distributed_tests/compute_tpl.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/many_ppo.yaml b/release/long_running_tests/many_ppo.yaml index 63c8145d992e..941dd0fa091d 100644 --- a/release/long_running_tests/many_ppo.yaml +++ b/release/long_running_tests/many_ppo.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/many_ppo_gce.yaml b/release/long_running_tests/many_ppo_gce.yaml index 7be96bd65462..9981b725f770 100644 --- a/release/long_running_tests/many_ppo_gce.yaml +++ b/release/long_running_tests/many_ppo_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_1.yaml b/release/long_running_tests/tpl_cpu_1.yaml index 0798e51be0d3..f09553f38347 100644 --- a/release/long_running_tests/tpl_cpu_1.yaml +++ b/release/long_running_tests/tpl_cpu_1.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_1_c5.yaml b/release/long_running_tests/tpl_cpu_1_c5.yaml index 6711e8ea7b66..d6cd546b8951 100644 --- a/release/long_running_tests/tpl_cpu_1_c5.yaml +++ b/release/long_running_tests/tpl_cpu_1_c5.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_1_c5_gce.yaml b/release/long_running_tests/tpl_cpu_1_c5_gce.yaml index 1f6a428ca7ec..52c35b2508e6 100644 --- a/release/long_running_tests/tpl_cpu_1_c5_gce.yaml +++ b/release/long_running_tests/tpl_cpu_1_c5_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_1_gce.yaml b/release/long_running_tests/tpl_cpu_1_gce.yaml index 807ac26cac60..48eef743adfb 100644 --- a/release/long_running_tests/tpl_cpu_1_gce.yaml +++ 
b/release/long_running_tests/tpl_cpu_1_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_1_large.yaml b/release/long_running_tests/tpl_cpu_1_large.yaml index ebe2058ab8b9..87e8548fc087 100644 --- a/release/long_running_tests/tpl_cpu_1_large.yaml +++ b/release/long_running_tests/tpl_cpu_1_large.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_1_large_gce.yaml b/release/long_running_tests/tpl_cpu_1_large_gce.yaml index f9392a87032a..93c00da4d01e 100644 --- a/release/long_running_tests/tpl_cpu_1_large_gce.yaml +++ b/release/long_running_tests/tpl_cpu_1_large_gce.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_2.yaml b/release/long_running_tests/tpl_cpu_2.yaml index 7e249f7ec82e..94b54c63b6d1 100644 --- a/release/long_running_tests/tpl_cpu_2.yaml +++ b/release/long_running_tests/tpl_cpu_2.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_3.yaml b/release/long_running_tests/tpl_cpu_3.yaml index 54ac50b78e38..4821923fe71c 100644 --- a/release/long_running_tests/tpl_cpu_3.yaml +++ b/release/long_running_tests/tpl_cpu_3.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_3_gce.yaml b/release/long_running_tests/tpl_cpu_3_gce.yaml index e08b73838512..c9d24ec1dd71 100644 --- a/release/long_running_tests/tpl_cpu_3_gce.yaml +++ b/release/long_running_tests/tpl_cpu_3_gce.yaml @@ -16,7 +16,7 @@ worker_node_types: max_workers: 2 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/long_running_tests/tpl_cpu_4.yaml b/release/long_running_tests/tpl_cpu_4.yaml index c08501e94c67..43e2adbdad8a 100644 --- a/release/long_running_tests/tpl_cpu_4.yaml +++ b/release/long_running_tests/tpl_cpu_4.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/long_running_tests/tpl_cpu_4_gce.yaml b/release/long_running_tests/tpl_cpu_4_gce.yaml index 4525893848e1..3a56f551ccc3 100644 --- a/release/long_running_tests/tpl_cpu_4_gce.yaml +++ b/release/long_running_tests/tpl_cpu_4_gce.yaml @@ -16,7 +16,7 @@ worker_node_types: max_workers: 3 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py b/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py index f440e72752fb..895d43bdcdab 100644 --- a/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py +++ b/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py @@ -58,13 +58,16 @@ class TorchTensorWorker: def __init__(self): self.device = torch_utils.get_devices()[0] - def send(self, shape, dtype, value: int): 
- t = torch.ones(shape, dtype=dtype, device=self.device) * value + def send(self, shape, dtype, _): + t = torch.ones(shape, dtype=dtype, device=self.device) * 1 return t def recv(self, tensor): + # This benchmark tests the overhead of sending a tensor between + # actors. To minimize the overhead of shared memory transfer, + # we return only a byte string. assert tensor.device == self.device - return (tensor[0].item(), tensor.shape, tensor.dtype) + return b"x" @ray.remote(num_gpus=1) @@ -139,17 +142,15 @@ def exec_ray_dag( dag = dag.experimental_compile() def _run(): - i = np.random.randint(100) - ref = dag.execute(i) + ref = dag.execute(b"x") result = ray.get(ref) - assert result == (i, SHAPE, DTYPE) + assert result == b"x" else: def _run(): - i = np.random.randint(100) - result = ray.get(dag.execute(i)) - assert result == (i, SHAPE, DTYPE) + result = ray.get(dag.execute(b"x")) + assert result == b"x" results = timeit(label, _run) diff --git a/release/ml_user_tests/horovod/compute_tpl_aws.yaml b/release/ml_user_tests/horovod/compute_tpl_aws.yaml index 6c518ba272c1..61999ce38e8c 100644 --- a/release/ml_user_tests/horovod/compute_tpl_aws.yaml +++ b/release/ml_user_tests/horovod/compute_tpl_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/ml_user_tests/horovod/compute_tpl_gce.yaml b/release/ml_user_tests/horovod/compute_tpl_gce.yaml index 2cad8d220fba..d2d46997fd4f 100644 --- a/release/ml_user_tests/horovod/compute_tpl_gce.yaml +++ b/release/ml_user_tests/horovod/compute_tpl_gce.yaml @@ -16,7 +16,7 @@ worker_node_types: min_workers: 3 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml b/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml index c4166af67959..376fd90539c7 100644 --- a/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml +++ b/release/ml_user_tests/tune_rllib/compute_tpl_aws.yaml @@ -20,7 +20,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/chaos_test/compute_template.yaml b/release/nightly_tests/chaos_test/compute_template.yaml index 4421ed956e29..f91504fb6937 100644 --- a/release/nightly_tests/chaos_test/compute_template.yaml +++ b/release/nightly_tests/chaos_test/compute_template.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: IamInstanceProfile: {"Name": "ray-autoscaler-v1"} head_node_type: diff --git a/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml b/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml index 80c5ea1325da..7bc19c0bf2f9 100644 --- a/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml +++ b/release/nightly_tests/dask_on_ray/1tb_sort_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml b/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml index e249486f0377..838abd890c33 100644 --- a/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml +++ b/release/nightly_tests/dask_on_ray/chaos_dask_on_ray_stress_compute.yaml @@ -1,7 +1,7 @@ cloud_id: 
{{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml index da67eec060c4..e87043b3d435 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_sort_compute_template.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml index e249486f0377..838abd890c33 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml index 387ce28c725a..6e891770737b 100644 --- a/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml +++ b/release/nightly_tests/dask_on_ray/dask_on_ray_stress_compute_k8s.yaml @@ -1,7 +1,7 @@ cloud_id: cld_HSrCZdMCYDe1NmMCJhYRgQ4p region: us-west-2 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml b/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml index aea2d4f78b07..67838b838f5d 100644 --- a/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml +++ b/release/nightly_tests/dask_on_ray/large_scale_dask_on_ray_compute_template.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dataset/aggregate_benchmark.py b/release/nightly_tests/dataset/aggregate_benchmark.py deleted file mode 100644 index 8085ed0ca6d9..000000000000 --- a/release/nightly_tests/dataset/aggregate_benchmark.py +++ /dev/null @@ -1,141 +0,0 @@ -from typing import Tuple - -import ray -from ray.data._internal.aggregate import ( - _AggregateOnKeyBase, - Max, - Mean, - Min, - Sum, -) -from ray.data.block import Block -from ray.data.dataset import Dataset -import pyarrow.compute as pac - -from benchmark import Benchmark - - -def run_h2oai(benchmark: Benchmark): - """This benchmark is originally from https://github.com/h2oai/db-benchmark - - Here we run all group-by queries from the benchmark on Ray Datasets. - The input files are pre-generated and stored in AWS S3 beforehand. - """ - - # Test input file schema={ - # id1: string, id2: string, id3: string, id4: int64, id5: int64, id6: int64, - # v1: int64, v2: int64, v3: double - # }) - test_input = [ - ("s3://air-example-data/h2oai_benchmark/G1_1e7_1e2_0_0.csv", "h2oai-500M") - ] - for path, test_name in test_input: - input_ds = ray.data.read_csv(path) - # Number of blocks (parallelism) should be set as number of available CPUs - # to get best performance. 
- num_blocks = int(ray.cluster_resources().get("CPU", 1)) - input_ds = input_ds.repartition(num_blocks).materialize() - - q_list = [ - (h2oai_q1, "q1"), - (h2oai_q3, "q3"), - (h2oai_q4, "q4"), - (h2oai_q5, "q5"), - (h2oai_q7, "q7"), - (h2oai_q8, "q8"), - ] - - for q, name in q_list: - benchmark.run_materialize_ds(f"{test_name}-{name}", q, ds=input_ds) - - -def h2oai_q1(ds: Dataset) -> Dataset: - return ds.groupby("id1").sum("v1") - - -def h2oai_q2(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. - # return ds.groupby(["id1", "id2"]).sum("v1") - raise NotImplementedError - - -def h2oai_q3(ds: Dataset) -> Dataset: - return ds.groupby("id3").aggregate(Sum("v1"), Mean("v3")) - - -def h2oai_q4(ds: Dataset) -> Dataset: - return ds.groupby("id4").aggregate(Mean("v1"), Mean("v2"), Mean("v3")) - - -def h2oai_q5(ds: Dataset) -> Dataset: - return ds.groupby("id6").aggregate(Sum("v1"), Sum("v2"), Sum("v3")) - - -def h2oai_q6(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. - # return ds.groupby(["id4", "id5"]).aggregate(Median("v3"), Std("v3")) - raise NotImplementedError - - -def h2oai_q7(ds: Dataset) -> Dataset: - ds = ds.groupby("id3").aggregate(Max("v1"), Min("v2")) - ds = ds.map_batches( - lambda df: df.assign(result=df["max(v1)"] - df["min(v2)"]), - batch_format="pandas", - ) - return ds - - -def h2oai_q8(ds: Dataset) -> Dataset: - def accumulate_block(agg: Tuple[float, float], block: Block) -> Tuple[float, float]: - column = block["v3"] - top_k_indices = pac.top_k_unstable(column, k=2) - top_k_result = pac.take(column, top_k_indices).to_pylist() - top_k_result.extend([float("-inf")] * (2 - len(top_k_result))) - top_k_result = (top_k_result[0], top_k_result[1]) - return merge(agg, top_k_result) - - def merge( - agg1: Tuple[float, float], - agg2: Tuple[float, float], - ) -> Tuple[float, float]: - if agg1[0] >= agg2[0]: - value1 = agg1[0] - value2 = max(agg1[1], agg2[0]) - else: - value1 = agg2[0] - value2 = max(agg1[0], agg2[1]) - return (value1, value2) - - class Top2(_AggregateOnKeyBase): - def __init__(self, on): - self._set_key_fn(on) - super().__init__( - init=lambda _: (float("-inf"), float("-inf")), - merge=merge, - accumulate_block=accumulate_block, - name=(f"top2({str(on)})"), - ) - - return ds.groupby("id6").aggregate(Top2("v3")) - - -def h2oai_q9(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. - # return ds.groupby(["id2", "id4"]).aggregate(pow(corr("v1", "v2"), 2)) - raise NotImplementedError - - -def h2oai_q10(ds: Dataset) -> Dataset: - # TODO(chengsu): Run this after dataset supports multiple group-by keys. 
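Several of the deleted queries above are marked as blocked on multi-key group-by support. For reference, this is roughly what a q2-style query looks like once multiple keys are supported (a sketch assuming a Ray Data version whose groupby accepts a list of keys):

import ray

ds = ray.data.from_items(
    [{"id1": f"a{i % 2}", "id2": f"b{i % 3}", "v1": i} for i in range(12)]
)
# Group by two keys at once and sum v1 within each (id1, id2) pair.
print(ds.groupby(["id1", "id2"]).sum("v1").take_all())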
- # return ds.groupby(["id1", "id2", "id3", "id4", "id5", "id6"]) - # .aggregate(Count(), Sum("v3")) - raise NotImplementedError - - -if __name__ == "__main__": - benchmark = Benchmark("aggregate") - - run_h2oai(benchmark) - - benchmark.write_result() diff --git a/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml b/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml index df0eb98234ea..e56edf8bbf28 100644 --- a/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml +++ b/release/nightly_tests/dataset/compute_gpu_4x4_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dataset/multi_node_autoscaling_compute.yaml b/release/nightly_tests/dataset/multi_node_autoscaling_compute.yaml new file mode 100644 index 000000000000..7b3612d3b4b8 --- /dev/null +++ b/release/nightly_tests/dataset/multi_node_autoscaling_compute.yaml @@ -0,0 +1,18 @@ +# This config matches the default config for Anyscale workspaces with autoscaling. +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +max_workers: 0 + +head_node_type: + name: head_node + instance_type: m5.2xlarge + resources: + cpu: 0 + +worker_node_types: + - name: worker_node + instance_type: m5.2xlarge + min_workers: 0 + max_workers: 10 + use_spot: false diff --git a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute.yaml b/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute.yaml deleted file mode 100644 index 9655daad50cd..000000000000 --- a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute.yaml +++ /dev/null @@ -1,15 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west-2 - -max_workers: 19 - -head_node_type: - name: head_node - instance_type: m5.16xlarge - -worker_node_types: - - name: worker_node - instance_type: m5.4xlarge - max_workers: 19 - min_workers: 19 - use_spot: false diff --git a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute_gce.yaml b/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute_gce.yaml deleted file mode 100644 index bca10d5c5447..000000000000 --- a/release/nightly_tests/dataset/multi_node_read_images_benchmark_compute_gce.yaml +++ /dev/null @@ -1,17 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west1 -allowed_azs: - - us-west1-c - -max_workers: 19 - -head_node_type: - name: head_node - instance_type: n2-standard-64 # aws m5.16xlarge - -worker_node_types: - - name: worker_node - instance_type: n2-standard-16 # aws m5.4xlarge - max_workers: 19 - min_workers: 19 - use_spot: false diff --git a/release/nightly_tests/dataset/parquet_metadata_resolution.py b/release/nightly_tests/dataset/parquet_metadata_resolution.py deleted file mode 100644 index b9f634f3f17f..000000000000 --- a/release/nightly_tests/dataset/parquet_metadata_resolution.py +++ /dev/null @@ -1,36 +0,0 @@ -import argparse -import os - -from benchmark import Benchmark - -parser = argparse.ArgumentParser(description="Parquet Metadata Read") -parser.add_argument("--num-files", type=int, default=30) -parser.add_argument("--cloud", type=str, choices=["aws", "gcp"]) - - -if __name__ == "__main__": - args = parser.parse_args() - import ray - - print("Connecting to Ray cluster...") - ray.init(address="auto") - - num = args.num_files - - assert args.cloud in {"aws", "gcp"}, args.cloud - if args.cloud == "aws": - prefix = "s3://shuffling-data-loader-benchmarks/data/r10_000_000_000-f1000" - if 
args.cloud == "gcp": - # NOTE(@bveeramani): I made a mistake while transferring the files from S3 to - # GCS, so there's an extra "r10_000_000_000-f1000" in the URI. Don't worry about - # it. The files are the same. - prefix = "gs://shuffling-data-loader-benchmarks/data/r10_000_000_000-f1000/r10_000_000_000-f1000" # noqa: E501 - files = [f"{prefix}/input_data_{i}.parquet.snappy" for i in range(args.num_files)] - - def _trigger_parquet_metadata_load(): - # This should only read Parquet metadata. - ray.data.read_parquet(files).count() - - benchmark = Benchmark("parquet_metadata_resolution") - benchmark.run_fn("read_metadata", _trigger_parquet_metadata_load) - benchmark.write_result(os.environ["TEST_OUTPUT_JSON"]) diff --git a/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml b/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml index b8b25b2def6c..0ed874893d1d 100644 --- a/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml +++ b/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 999 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/dataset/read_and_consume_benchmark.py b/release/nightly_tests/dataset/read_and_consume_benchmark.py new file mode 100644 index 000000000000..f833d27035ce --- /dev/null +++ b/release/nightly_tests/dataset/read_and_consume_benchmark.py @@ -0,0 +1,69 @@ +import ray + +from benchmark import Benchmark + +import argparse +from typing import Callable + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("path", type=str) + parser.add_argument( + "--format", + choices=["image", "parquet"], + required=True, + ) + + consume_group = parser.add_mutually_exclusive_group() + consume_group.add_argument("--count", action="store_true") + consume_group.add_argument("--iterate", action="store_true") + + return parser.parse_args() + + +def main(args): + benchmark = Benchmark("read-and-consume") + read_fn = get_read_fn(args) + consume_fn = get_consume_fn(args) + + def benchmark_fn(): + ds = read_fn(args.path) + consume_fn(ds) + + benchmark.run_fn(str(vars(args)), benchmark_fn) + benchmark.write_result() + + +def get_read_fn(args: argparse.Namespace) -> Callable[[str], ray.data.Dataset]: + if args.format == "image": + read_fn = ray.data.read_images + elif args.format == "parquet": + read_fn = ray.data.read_parquet + else: + assert False, f"Invalid data format argument: {args}" + + return read_fn + + +def get_consume_fn(args: argparse.Namespace) -> Callable[[ray.data.Dataset], None]: + if args.count: + + def consume_fn(ds): + ds.count() + + elif args.iterate: + + def consume_fn(ds): + for _ in ds.iter_internal_ref_bundles(): + pass + + else: + assert False, f"Invalid consume arguments: {args}" + + return consume_fn + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/release/nightly_tests/dataset/read_images_benchmark.py b/release/nightly_tests/dataset/read_images_benchmark.py deleted file mode 100644 index 2a5f68db6e2f..000000000000 --- a/release/nightly_tests/dataset/read_images_benchmark.py +++ /dev/null @@ -1,148 +0,0 @@ -import argparse -import os -import random -import shutil -import tempfile -from typing import List, Tuple - -from PIL import Image - -import ray - -from benchmark import Benchmark - - -def parse_args(): - parser = argparse.ArgumentParser() - group = parser.add_mutually_exclusive_group() - group.add_argument( - 
"--single-node", - action="store_true", - help="Run single-node read_images benchmark.", - ) - group.add_argument( - "--multi-node", - action="store_true", - help="Run multi-node read_images benchmark.", - ) - return parser.parse_args() - - -def main(args): - ray.init() - - benchmark = Benchmark("read-images") - if args.single_node: - run_images_benchmark_single_node(benchmark) - elif args.multi_node: - run_images_benchmark_multi_node(benchmark) - - benchmark.write_result() - - -def generate_images( - num_images: int, sizes: List[Tuple[int, int]], modes: List[str], formats: List[str] -) -> str: - - dimensions = [] - for mode in modes: - if mode in ["1", "L", "P"]: - dimension = 1 - elif mode in ["RGB", "YCbCr", "LAB", "HSV"]: - dimension = 3 - elif mode in ["RGBA", "CMYK", "I", "F"]: - dimension = 4 - else: - raise ValueError(f"Found unknown image mode: {mode}.") - dimensions.append(dimension) - - images_dir = tempfile.mkdtemp() - - for image_idx in range(num_images): - size = random.choice(sizes) - file_format = random.choice(formats) - mode_idx = random.randrange(len(modes)) - mode = modes[mode_idx] - dimension = dimensions[mode_idx] - - width, height = size - file_name = f"{images_dir}/{image_idx}.{file_format}" - pixels_per_dimension = [] - for _ in range(dimension): - pixels = os.urandom(width * height) - pixels_per_dimension.append(pixels) - - image = Image.new(mode, size) - if len(pixels_per_dimension) == 1: - image.putdata(pixels_per_dimension[0]) - else: - image.putdata(list(zip(*pixels_per_dimension))) - image.save(file_name) - - return images_dir - - -def run_images_benchmark_single_node(benchmark: Benchmark): - # Set global random seed. - random.seed(42) - - test_input = [ - generate_images(100, [(256, 256)], ["RGB"], ["jpg"]), - generate_images(100, [(2048, 2048)], ["RGB"], ["jpg"]), - generate_images( - 1000, [(64, 64), (256, 256)], ["RGB", "L"], ["jpg", "jpeg", "png"] - ), - ] - - benchmark.run_materialize_ds( - "images-100-256-rbg-jpg", ray.data.read_images, test_input[0] - ) - benchmark.run_materialize_ds( - "images-100-2048-rbg-jpg", ray.data.read_images, test_input[1] - ) - benchmark.run_materialize_ds( - "images-100-2048-to-256-rbg-jpg", - ray.data.read_images, - test_input[1], - size=(256, 256), - ) - benchmark.run_materialize_ds( - "images-1000-mix", - ray.data.read_images, - test_input[2], - size=(256, 256), - mode="RGB", - ) - - for root in test_input: - shutil.rmtree(root) - - # TODO(chengsu): run benchmark on 20G and 100G imagenet data in multi-nodes - # cluster. 
- benchmark.run_materialize_ds( - "images-imagenet-1g", - ray.data.read_images, - "s3://air-example-data-2/1G-image-data-synthetic-raw", - ) - - -def run_images_benchmark_multi_node(benchmark: Benchmark): - hundred_thousand_image_paths = [ - f"s3://air-example-data-2/100k-images-data-synthetic-raw/dog_{i}/dog_0.jpg" - for i in range(100_000) - ] - hundred_million_image_paths = [] - for _ in range(100_000_000 // 100_000): - hundred_million_image_paths.extend(hundred_thousand_image_paths) - - def fn(): - ds = ray.data.read_images(hundred_million_image_paths) - for _ in ds.iter_batches(batch_size=None, batch_format="pyarrow"): - pass - - benchmark.run_fn("images-100M", fn) - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/release/nightly_tests/dataset/read_parquet_benchmark.py b/release/nightly_tests/dataset/read_parquet_benchmark.py deleted file mode 100644 index 96ceff9ff55c..000000000000 --- a/release/nightly_tests/dataset/read_parquet_benchmark.py +++ /dev/null @@ -1,120 +0,0 @@ -import ray -from ray.data.dataset import Dataset - -from benchmark import Benchmark -from parquet_data_generator import generate_data - -import shutil -import tempfile -from typing import Optional - - -def read_parquet( - root: str, - override_num_blocks: Optional[int] = None, - use_threads: bool = False, - filter=None, - columns=None, -) -> Dataset: - return ray.data.read_parquet( - paths=root, - override_num_blocks=override_num_blocks, - use_threads=use_threads, - filter=filter, - columns=columns, - ) - - -def run_read_parquet_benchmark(benchmark: Benchmark): - # Test with different override_num_blocks (multi-processing for single node) - # and threading. - for override_num_blocks in [1, 2, 4]: - for use_threads in [True, False]: - test_name = f"read-parquet-downsampled-nyc-taxi-2009-{override_num_blocks}-{use_threads}" # noqa: E501 - benchmark.run_materialize_ds( - test_name, - read_parquet, - root="s3://anonymous@air-example-data/ursa-labs-taxi-data/downsampled_2009_full_year_data.parquet", # noqa: E501 - override_num_blocks=override_num_blocks, - use_threads=use_threads, - ) - - # TODO: Test below is currently excluded, due to failure around - # pickling the Dataset involving the filter expression. - # The error is present on Python < 3.8, and involves the pickle/pickle5 - # libraries. `pickle` is included as a default library from Python 3.8+, - # whereas Python versions before this must import the backported `pickle5` library - # to maintain the same functionality. - - # Test with projection and filter pushdowns. - # Since we have projection and filter pushdown, we can run the read on the full - # size of one year data fast enough on a single node. - # test_name = "read-parquet-nyc-taxi-2018-pushdown" - # filter_expr = (pa.dataset.field("passenger_count") <= 10) & ( - # pa.dataset.field("passenger_count") > 0 - # ) - # benchmark.run( - # test_name, - # read_parquet, - # root="s3://anonymous@air-example-data/ursa-labs-taxi-data/by_year/2018", - # columns=["passenger_count", "trip_distance"], - # filter=filter_expr, - # ) - - # Test with different number files to handle: from a few to many. - data_dirs = [] - # Each test set has same total number of rows, which are distributed - # to different number of files. 
- total_rows = 1024 * 1024 * 8 - for num_files in [8, 128, 1024]: - for compression in ["snappy", "gzip"]: - data_dirs.append(tempfile.mkdtemp()) - generate_data( - num_rows=total_rows, - num_files=num_files, - num_row_groups_per_file=16, - compression=compression, - data_dir=data_dirs[-1], - ) - test_name = f"read-parquet-random-data-{num_files}-{compression}" - benchmark.run_materialize_ds( - test_name, - read_parquet, - root=data_dirs[-1], - override_num_blocks=1, # We are testing one task to handle N files - ) - for dir in data_dirs: - shutil.rmtree(dir) - - # Test reading many small files. - num_files = 1000 - num_row_groups_per_file = 2 - total_rows = num_files * num_row_groups_per_file - compression = "gzip" - - many_files_dir = "s3://air-example-data-2/read-many-parquet-files/" - # If needed, use the following utility to generate files on S3. - # Otherwise, the benchmark will read pre-generated files in the above bucket. - # generate_data( - # num_rows=total_rows, - # num_files=num_files, - # num_row_groups_per_file=num_row_groups_per_file, - # compression=compression, - # data_dir=many_files_dir, - # ) - test_name = f"read-many-parquet-files-s3-{num_files}-{compression}" - benchmark.run_materialize_ds( - test_name, - read_parquet, - root=many_files_dir, - ) - - -if __name__ == "__main__": - ray.init() - - benchmark = Benchmark("read-parquet") - - run_read_parquet_benchmark(benchmark) - - benchmark.write_result() diff --git a/release/nightly_tests/dataset/read_tfrecords_benchmark.py b/release/nightly_tests/dataset/read_tfrecords_benchmark.py index 48d1bb229195..34fecb02fd41 100644 --- a/release/nightly_tests/dataset/read_tfrecords_benchmark.py +++ b/release/nightly_tests/dataset/read_tfrecords_benchmark.py @@ -1,3 +1,4 @@ +import os import random import shutil import tempfile @@ -7,11 +8,47 @@ from ray.data.dataset import Dataset from benchmark import Benchmark -from read_images_benchmark import generate_images +from PIL import Image import pyarrow as pa import numpy as np +def generate_images( + num_images: int, sizes: List[Tuple[int, int]], modes: List[str], formats: List[str] +) -> str: + dimensions = [] + for mode in modes: + if mode in ["1", "L", "P"]: + dimension = 1 + elif mode in ["RGB", "YCbCr", "LAB", "HSV"]: + dimension = 3 + elif mode in ["RGBA", "CMYK", "I", "F"]: + dimension = 4 + else: + raise ValueError(f"Found unknown image mode: {mode}.") + dimensions.append(dimension) + images_dir = tempfile.mkdtemp() + for image_idx in range(num_images): + size = random.choice(sizes) + file_format = random.choice(formats) + mode_idx = random.randrange(len(modes)) + mode = modes[mode_idx] + dimension = dimensions[mode_idx] + width, height = size + file_name = f"{images_dir}/{image_idx}.{file_format}" + pixels_per_dimension = [] + for _ in range(dimension): + pixels = os.urandom(width * height) + pixels_per_dimension.append(pixels) + image = Image.new(mode, size) + if len(pixels_per_dimension) == 1: + image.putdata(pixels_per_dimension[0]) + else: + image.putdata(list(zip(*pixels_per_dimension))) + image.save(file_name) + return images_dir + + def read_tfrecords(path: str) -> Dataset: return ray.data.read_tfrecords(paths=path).materialize() diff --git a/release/nightly_tests/dataset/shuffle_compute.yaml b/release/nightly_tests/dataset/shuffle_compute.yaml index eb7aacc0b8e7..b776f7edfa4c 100644 --- a/release/nightly_tests/dataset/shuffle_compute.yaml +++ b/release/nightly_tests/dataset/shuffle_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 999 -aws: 
+advanced_configurations_json: IamInstanceProfile: {"Name": "ray-autoscaler-v1"} BlockDeviceMappings: - DeviceName: /dev/sda1 diff --git a/release/nightly_tests/decision_tree/autoscaling_compute.yaml b/release/nightly_tests/decision_tree/autoscaling_compute.yaml index 4eb361e1bcae..3031267a1bd8 100644 --- a/release/nightly_tests/decision_tree/autoscaling_compute.yaml +++ b/release/nightly_tests/decision_tree/autoscaling_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 10 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml b/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml index b54d1d50f812..5b7072d5b30f 100644 --- a/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml +++ b/release/nightly_tests/decision_tree/autoscaling_compute_gce.yaml @@ -1,11 +1,11 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 10 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/nightly_tests/placement_group_tests/compute.yaml b/release/nightly_tests/placement_group_tests/compute.yaml index d0fe68b4c17c..3baa53f9f36c 100644 --- a/release/nightly_tests/placement_group_tests/compute.yaml +++ b/release/nightly_tests/placement_group_tests/compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -20,7 +20,7 @@ worker_node_types: use_spot: false - name: fake_gpu_node instance_type: m5.4xlarge - min_workers: 0 + min_workers: 0 max_workers: 2 use_spot: false resources: diff --git a/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml b/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml index cc3005c09c5b..d990178123f0 100644 --- a/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml +++ b/release/nightly_tests/placement_group_tests/long_running_test_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml b/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml index 8764e0f6c4df..a3e1852cc568 100644 --- a/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml +++ b/release/nightly_tests/placement_group_tests/pg_perf_test_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml b/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml index 14a02cfa030e..17ed94f5d623 100644 --- a/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml +++ b/release/nightly_tests/shuffle/100tb_shuffle_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml index b6f95b050839..f6a658058a5e 100644 --- a/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml +++ 
b/release/nightly_tests/shuffle/datasets_large_scale_compute_small_instances.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml b/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml index 38091a3f12b6..57653d15408a 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_autoscaling.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml b/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml index 4a7af41b667c..30d141e8b544 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_autoscaling_gce.yaml @@ -1,9 +1,9 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml b/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml index 27268db12a0d..7b2a779f6cd0 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_large_scale.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml index a726988aeda0..841d019c9545 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_multi.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_multi.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 3 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml b/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml index 1332e3390e97..ff95d850892d 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_multi_gce.yaml @@ -1,11 +1,11 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 3 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/nightly_tests/shuffle/shuffle_compute_single.yaml b/release/nightly_tests/shuffle/shuffle_compute_single.yaml index df8d84edc81f..16b4bec73a91 100644 --- a/release/nightly_tests/shuffle/shuffle_compute_single.yaml +++ b/release/nightly_tests/shuffle/shuffle_compute_single.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 0 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml b/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml index 275b0a0a7d36..9b5476d95624 100644 --- a/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml +++ b/release/nightly_tests/stress_tests/placement_group_tests_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 5 -aws: 
+advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -12,18 +12,17 @@ aws: head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: cpu: 64 worker_node_types: - name: worker_node - instance_type: m4.large + instance_type: m6i.large min_workers: 5 max_workers: 5 use_spot: false resources: - cpu: 2 + cpu: 2 custom_resources: pg_custom: 666 - diff --git a/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml b/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml index 0e0285848708..b22a790f1b18 100644 --- a/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml +++ b/release/nightly_tests/stress_tests/placement_group_tests_compute_gce.yaml @@ -1,11 +1,11 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c max_workers: 5 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: @@ -25,7 +25,7 @@ worker_node_types: max_workers: 5 use_spot: false resources: - cpu: 2 + cpu: 2 custom_resources: pg_custom: 666 diff --git a/release/nightly_tests/stress_tests/smoke_test_compute.yaml b/release/nightly_tests/stress_tests/smoke_test_compute.yaml index 0af96e62373b..9ae9ea54cfe4 100644 --- a/release/nightly_tests/stress_tests/smoke_test_compute.yaml +++ b/release/nightly_tests/stress_tests/smoke_test_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 4 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -12,11 +12,11 @@ aws: head_node_type: name: head_node - instance_type: m4.4xlarge + instance_type: m6i.4xlarge worker_node_types: - name: worker_node - instance_type: m4.large + instance_type: m6i.large min_workers: 4 max_workers: 4 use_spot: false diff --git a/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml b/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml index 93a576e66333..88ba8049b087 100644 --- a/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml +++ b/release/nightly_tests/stress_tests/stress_test_threaded_actor_compute.yaml @@ -1,7 +1,7 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/nightly_tests/stress_tests/stress_tests_compute.yaml b/release/nightly_tests/stress_tests/stress_tests_compute.yaml index 5e9acaf7f7e8..9b85c0723e98 100644 --- a/release/nightly_tests/stress_tests/stress_tests_compute.yaml +++ b/release/nightly_tests/stress_tests/stress_tests_compute.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 100 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -12,13 +12,13 @@ aws: head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: cpu: 64 worker_node_types: - name: worker_node - instance_type: m4.large + instance_type: m6i.large min_workers: 100 max_workers: 100 use_spot: false diff --git a/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml b/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml index 3784bac975be..53aa4e77c3d6 100644 --- a/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml +++ b/release/nightly_tests/stress_tests/stress_tests_compute_large.yaml @@ -3,7 +3,7 @@ region: us-west-2 max_workers: 6 -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ 
-12,13 +12,13 @@ aws: head_node_type: name: head_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge resources: cpu: 64 worker_node_types: - name: worker_node - instance_type: m4.16xlarge + instance_type: m6i.16xlarge min_workers: 6 max_workers: 6 use_spot: false diff --git a/release/perf_metrics/metadata.json b/release/perf_metrics/metadata.json index 2ef9c5cd543e..d4a423494576 100644 --- a/release/perf_metrics/metadata.json +++ b/release/perf_metrics/metadata.json @@ -1 +1 @@ -{"release_version": "2.39.0"} \ No newline at end of file +{"release_version": "2.39.0"} diff --git a/release/ray_release/byod/requirements_byod_3.9.txt b/release/ray_release/byod/requirements_byod_3.9.txt index adff2b611647..f1dcf9ee13a8 100644 --- a/release/ray_release/byod/requirements_byod_3.9.txt +++ b/release/ray_release/byod/requirements_byod_3.9.txt @@ -144,6 +144,7 @@ ale-py==0.10.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in + # gymnasium annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d @@ -574,33 +575,39 @@ crcmod==1.7 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gsutil -cryptography==38.0.1 \ - --hash=sha256:0297ffc478bdd237f5ca3a7dc96fc0d315670bfa099c04dc3a4a2172008a405a \ - --hash=sha256:10d1f29d6292fc95acb597bacefd5b9e812099d75a6469004fd38ba5471a977f \ - --hash=sha256:16fa61e7481f4b77ef53991075de29fc5bacb582a1244046d2e8b4bb72ef66d0 \ - --hash=sha256:194044c6b89a2f9f169df475cc167f6157eb9151cc69af8a2a163481d45cc407 \ - --hash=sha256:1db3d807a14931fa317f96435695d9ec386be7b84b618cc61cfa5d08b0ae33d7 \ - --hash=sha256:3261725c0ef84e7592597606f6583385fed2a5ec3909f43bc475ade9729a41d6 \ - --hash=sha256:3b72c360427889b40f36dc214630e688c2fe03e16c162ef0aa41da7ab1455153 \ - --hash=sha256:3e3a2599e640927089f932295a9a247fc40a5bdf69b0484532f530471a382750 \ - --hash=sha256:3fc26e22840b77326a764ceb5f02ca2d342305fba08f002a8c1f139540cdfaad \ - --hash=sha256:5067ee7f2bce36b11d0e334abcd1ccf8c541fc0bbdaf57cdd511fdee53e879b6 \ - --hash=sha256:52e7bee800ec869b4031093875279f1ff2ed12c1e2f74923e8f49c916afd1d3b \ - --hash=sha256:64760ba5331e3f1794d0bcaabc0d0c39e8c60bf67d09c93dc0e54189dfd7cfe5 \ - --hash=sha256:765fa194a0f3372d83005ab83ab35d7c5526c4e22951e46059b8ac678b44fa5a \ - --hash=sha256:79473cf8a5cbc471979bd9378c9f425384980fcf2ab6534b18ed7d0d9843987d \ - --hash=sha256:896dd3a66959d3a5ddcfc140a53391f69ff1e8f25d93f0e2e7830c6de90ceb9d \ - --hash=sha256:89ed49784ba88c221756ff4d4755dbc03b3c8d2c5103f6d6b4f83a0fb1e85294 \ - --hash=sha256:ac7e48f7e7261207d750fa7e55eac2d45f720027d5703cd9007e9b37bbb59ac0 \ - --hash=sha256:ad7353f6ddf285aeadfaf79e5a6829110106ff8189391704c1d8801aa0bae45a \ - --hash=sha256:b0163a849b6f315bf52815e238bc2b2346604413fa7c1601eea84bcddb5fb9ac \ - --hash=sha256:b6c9b706316d7b5a137c35e14f4103e2115b088c412140fdbd5f87c73284df61 \ - --hash=sha256:c2e5856248a416767322c8668ef1845ad46ee62629266f84a8f007a317141013 \ - --hash=sha256:ca9f6784ea96b55ff41708b92c3f6aeaebde4c560308e5fbbd3173fbc466e94e \ - --hash=sha256:d1a5bd52d684e49a36582193e0b89ff267704cd4025abefb9e26803adeb3e5fb \ - --hash=sha256:d3971e2749a723e9084dd507584e2a2761f78ad2c638aa31e80bc7a15c9db4f9 \ - --hash=sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd \ - --hash=sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818 +cryptography==42.0.5 \ + 
--hash=sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee \ + --hash=sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576 \ + --hash=sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d \ + --hash=sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30 \ + --hash=sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413 \ + --hash=sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb \ + --hash=sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da \ + --hash=sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4 \ + --hash=sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd \ + --hash=sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc \ + --hash=sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8 \ + --hash=sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1 \ + --hash=sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc \ + --hash=sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e \ + --hash=sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8 \ + --hash=sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940 \ + --hash=sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400 \ + --hash=sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7 \ + --hash=sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16 \ + --hash=sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278 \ + --hash=sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74 \ + --hash=sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec \ + --hash=sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1 \ + --hash=sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2 \ + --hash=sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c \ + --hash=sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922 \ + --hash=sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a \ + --hash=sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6 \ + --hash=sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1 \ + --hash=sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e \ + --hash=sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac \ + --hash=sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7 # via # -c release/ray_release/byod/requirements_compiled.txt # pyopenssl @@ -650,9 +657,9 @@ cython==0.29.37 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in -dask[complete]==2022.10.1 ; python_version < "3.12" \ - --hash=sha256:2e6765bb6011c97c59fd4792540df679c703100443fcd99c82b98d8697295822 \ - --hash=sha256:79d283326045700af0de7e2be57fd663499958c63638bf5076839cbcde64aa3f +dask[complete]==2022.10.2 ; python_version < "3.12" \ + --hash=sha256:42cb43f601709575fa46ce09e74bea83fdd464187024f56954e09d9b428ceaab \ + --hash=sha256:928003a97b890a14c8a09a01f15320d261053bda530a8bf191d84f33db4a63b8 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in @@ -673,9 +680,9 @@ diskcache==5.6.3 \ --hash=sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc 
\ --hash=sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19 # via petastorm -distributed==2022.10.1 ; python_version < "3.12" \ - --hash=sha256:31abab8ecc76951875828a3689d47dc4f20226b3ec99a0dc1af6183d02dbe5fe \ - --hash=sha256:42c6fe7d3bea491e23ce020879c411f2ecfecdb4914a6cb6b4a63530a7b0fa70 +distributed==2022.10.2 ; python_version < "3.12" \ + --hash=sha256:53f0a5bf6efab9a5ab3345cd913f6d3f3d4ea444ee2edbea331c7fef96fd67d0 \ + --hash=sha256:ae4fffdb55c6cb510ba1cbdf2856563af80ebf93e5ceacb91c1ce79e7da108d8 # via # -c release/ray_release/byod/requirements_compiled.txt # dask @@ -1264,7 +1271,7 @@ gsutil==5.27 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_byod_3.9.in -gymnasium==1.0.0 \ +gymnasium[atari]==1.0.0 \ --hash=sha256:9d2b66f30c1b34fe3c2ce7fae65ecf365d0e9982d2b3d860235e773328a3b403 \ --hash=sha256:b6f40e1e24c5bd419361e1a5b86a9117d2499baecc3a660d44dfff4c465393ad # via @@ -2203,9 +2210,9 @@ pygments==2.18.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # rich -pyopenssl==23.0.0 \ - --hash=sha256:c1cc5f86bcacefc84dada7d31175cae1b1518d5f60d3d0bb595a67822a868a6f \ - --hash=sha256:df5fc28af899e74e19fccb5510df423581047e10ab6f1f4ba1763ff5fde844c0 +pyopenssl==24.2.1 \ + --hash=sha256:4247f0dbe3748d560dcbb2ff3ea01af0f9a1a001ef5f7c4c647956ed8cbf0e95 \ + --hash=sha256:967d5719b12b243588573f39b0c677637145c7a1ffedcd495a487e58177fbb8d # via # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.in b/release/ray_release/byod/requirements_ml_byod_3.9.in index 7ef915a8a698..69ca2cc2c734 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.in +++ b/release/ray_release/byod/requirements_ml_byod_3.9.in @@ -6,7 +6,6 @@ bitsandbytes boto3 cmake crc32c -dataset datasets decord deepspeed>=0.12.3 diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.txt b/release/ray_release/byod/requirements_ml_byod_3.9.txt index 7eeec76c2886..8d9e5a044f47 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.txt +++ b/release/ray_release/byod/requirements_ml_byod_3.9.txt @@ -1,4 +1,3 @@ - # # This file is autogenerated by pip-compile with python 3.9 # To update, run: @@ -118,12 +117,6 @@ aiosignal==1.3.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp -alembic==1.12.1 \ - --hash=sha256:47d52e3dfb03666ed945becb723d6482e52190917fdb47071440cfdba05d92cb \ - --hash=sha256:bca5877e9678b454706347bc10b97cb7d67f300320fa5c3a94423e8266e2823f - # via - # -c release/ray_release/byod/requirements_compiled.txt - # dataset annotated-types==0.6.0 \ --hash=sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43 \ --hash=sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d @@ -175,17 +168,12 @@ attrs==21.4.0 \ # aiohttp # jsonlines # jsonschema - # markdown-it-py backcall==0.2.0 \ --hash=sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e \ --hash=sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255 # via # -c release/ray_release/byod/requirements_compiled.txt # ipython -banal==1.0.6 \ - --hash=sha256:2fe02c9305f53168441948f4a03dfbfa2eacc73db30db4a93309083cb0e250a5 \ - --hash=sha256:877aacb16b17f8fa4fd29a7c44515c5a23dc1a7b26078bc41dd34829117d85e1 - # via dataset bitsandbytes==0.43.1 \ --hash=sha256:52c1c7189a6ca006555a9663e544e75f40520a97a26e075411f9f9aca0771fcd \ 
--hash=sha256:a81c826d576d6d691c7b4a7491c8fdc0f37f769795d6ca2e54afa605d2c260a3 @@ -510,12 +498,6 @@ comm==0.2.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # ipywidgets -commonmark==0.9.1 \ - --hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \ - --hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9 - # via - # -c release/ray_release/byod/requirements_compiled.txt - # rich configargparse==1.7 \ --hash=sha256:d249da6591465c6c26df64a9f73d2536e743be2f244eb3ebe61114af2f94f86b \ --hash=sha256:e7067471884de5478c58a511e529f0f9bd1c66bfef1dea90935438d6c23306d1 @@ -652,33 +634,39 @@ crcmod==1.7 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gsutil -cryptography==38.0.1 \ - --hash=sha256:0297ffc478bdd237f5ca3a7dc96fc0d315670bfa099c04dc3a4a2172008a405a \ - --hash=sha256:10d1f29d6292fc95acb597bacefd5b9e812099d75a6469004fd38ba5471a977f \ - --hash=sha256:16fa61e7481f4b77ef53991075de29fc5bacb582a1244046d2e8b4bb72ef66d0 \ - --hash=sha256:194044c6b89a2f9f169df475cc167f6157eb9151cc69af8a2a163481d45cc407 \ - --hash=sha256:1db3d807a14931fa317f96435695d9ec386be7b84b618cc61cfa5d08b0ae33d7 \ - --hash=sha256:3261725c0ef84e7592597606f6583385fed2a5ec3909f43bc475ade9729a41d6 \ - --hash=sha256:3b72c360427889b40f36dc214630e688c2fe03e16c162ef0aa41da7ab1455153 \ - --hash=sha256:3e3a2599e640927089f932295a9a247fc40a5bdf69b0484532f530471a382750 \ - --hash=sha256:3fc26e22840b77326a764ceb5f02ca2d342305fba08f002a8c1f139540cdfaad \ - --hash=sha256:5067ee7f2bce36b11d0e334abcd1ccf8c541fc0bbdaf57cdd511fdee53e879b6 \ - --hash=sha256:52e7bee800ec869b4031093875279f1ff2ed12c1e2f74923e8f49c916afd1d3b \ - --hash=sha256:64760ba5331e3f1794d0bcaabc0d0c39e8c60bf67d09c93dc0e54189dfd7cfe5 \ - --hash=sha256:765fa194a0f3372d83005ab83ab35d7c5526c4e22951e46059b8ac678b44fa5a \ - --hash=sha256:79473cf8a5cbc471979bd9378c9f425384980fcf2ab6534b18ed7d0d9843987d \ - --hash=sha256:896dd3a66959d3a5ddcfc140a53391f69ff1e8f25d93f0e2e7830c6de90ceb9d \ - --hash=sha256:89ed49784ba88c221756ff4d4755dbc03b3c8d2c5103f6d6b4f83a0fb1e85294 \ - --hash=sha256:ac7e48f7e7261207d750fa7e55eac2d45f720027d5703cd9007e9b37bbb59ac0 \ - --hash=sha256:ad7353f6ddf285aeadfaf79e5a6829110106ff8189391704c1d8801aa0bae45a \ - --hash=sha256:b0163a849b6f315bf52815e238bc2b2346604413fa7c1601eea84bcddb5fb9ac \ - --hash=sha256:b6c9b706316d7b5a137c35e14f4103e2115b088c412140fdbd5f87c73284df61 \ - --hash=sha256:c2e5856248a416767322c8668ef1845ad46ee62629266f84a8f007a317141013 \ - --hash=sha256:ca9f6784ea96b55ff41708b92c3f6aeaebde4c560308e5fbbd3173fbc466e94e \ - --hash=sha256:d1a5bd52d684e49a36582193e0b89ff267704cd4025abefb9e26803adeb3e5fb \ - --hash=sha256:d3971e2749a723e9084dd507584e2a2761f78ad2c638aa31e80bc7a15c9db4f9 \ - --hash=sha256:d4ef6cc305394ed669d4d9eebf10d3a101059bdcf2669c366ec1d14e4fb227bd \ - --hash=sha256:d9e69ae01f99abe6ad646947bba8941e896cb3aa805be2597a0400e0764b5818 +cryptography==42.0.5 \ + --hash=sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee \ + --hash=sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576 \ + --hash=sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d \ + --hash=sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30 \ + --hash=sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413 \ + --hash=sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb \ + --hash=sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da \ + 
--hash=sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4 \ + --hash=sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd \ + --hash=sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc \ + --hash=sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8 \ + --hash=sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1 \ + --hash=sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc \ + --hash=sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e \ + --hash=sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8 \ + --hash=sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940 \ + --hash=sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400 \ + --hash=sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7 \ + --hash=sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16 \ + --hash=sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278 \ + --hash=sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74 \ + --hash=sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec \ + --hash=sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1 \ + --hash=sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2 \ + --hash=sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c \ + --hash=sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922 \ + --hash=sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a \ + --hash=sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6 \ + --hash=sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1 \ + --hash=sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e \ + --hash=sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac \ + --hash=sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7 # via # -c release/ray_release/byod/requirements_compiled.txt # pyopenssl @@ -694,10 +682,6 @@ dataproperty==1.0.1 \ # via # pytablewriter # tabledata -dataset==1.6.2 \ - --hash=sha256:77d362118f67a8cbb4848dbd30ab362b9fa7cfebdbfaf426c9c500cb38969a99 \ - --hash=sha256:dcca9ba7658473d3082b1adf87a650252a1cd665705b73fa7d4ee32116a107b9 - # via -r release/ray_release/byod/requirements_ml_byod_3.9.in datasets==2.14.0 \ --hash=sha256:1bb3d1c992a593949a8d3e445b358ac1db4ead00e6619ea2e5e7b6dfc222dde1 \ --hash=sha256:93081cc3d9d0ce860c81f950a3ba23d24704da2eacbe2722092ef4f6ae0ada96 @@ -867,68 +851,84 @@ fonttools==4.45.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # matplotlib -frozenlist==1.4.0 \ - --hash=sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6 \ - --hash=sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01 \ - --hash=sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251 \ - --hash=sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9 \ - --hash=sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b \ - --hash=sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87 \ - --hash=sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf \ - --hash=sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f \ - --hash=sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0 \ - 
--hash=sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2 \ - --hash=sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b \ - --hash=sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc \ - --hash=sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c \ - --hash=sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467 \ - --hash=sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9 \ - --hash=sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1 \ - --hash=sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a \ - --hash=sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79 \ - --hash=sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167 \ - --hash=sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300 \ - --hash=sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf \ - --hash=sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea \ - --hash=sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2 \ - --hash=sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab \ - --hash=sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3 \ - --hash=sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb \ - --hash=sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087 \ - --hash=sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc \ - --hash=sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8 \ - --hash=sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62 \ - --hash=sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f \ - --hash=sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326 \ - --hash=sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c \ - --hash=sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431 \ - --hash=sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963 \ - --hash=sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7 \ - --hash=sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef \ - --hash=sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3 \ - --hash=sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956 \ - --hash=sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781 \ - --hash=sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472 \ - --hash=sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc \ - --hash=sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839 \ - --hash=sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672 \ - --hash=sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3 \ - --hash=sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503 \ - --hash=sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d \ - --hash=sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8 \ - --hash=sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b \ - --hash=sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc \ - --hash=sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f \ - 
--hash=sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559 \ - --hash=sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b \ - --hash=sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95 \ - --hash=sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb \ - --hash=sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963 \ - --hash=sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919 \ - --hash=sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f \ - --hash=sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3 \ - --hash=sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1 \ - --hash=sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e +frozenlist==1.4.1 \ + --hash=sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7 \ + --hash=sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98 \ + --hash=sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad \ + --hash=sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5 \ + --hash=sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae \ + --hash=sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e \ + --hash=sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a \ + --hash=sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701 \ + --hash=sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d \ + --hash=sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6 \ + --hash=sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6 \ + --hash=sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106 \ + --hash=sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75 \ + --hash=sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868 \ + --hash=sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a \ + --hash=sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0 \ + --hash=sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1 \ + --hash=sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826 \ + --hash=sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec \ + --hash=sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6 \ + --hash=sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950 \ + --hash=sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19 \ + --hash=sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0 \ + --hash=sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8 \ + --hash=sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a \ + --hash=sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09 \ + --hash=sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86 \ + --hash=sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c \ + --hash=sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5 \ + --hash=sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b \ + --hash=sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b \ + --hash=sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d \ + 
--hash=sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0 \ + --hash=sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea \ + --hash=sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776 \ + --hash=sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a \ + --hash=sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897 \ + --hash=sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7 \ + --hash=sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09 \ + --hash=sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9 \ + --hash=sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe \ + --hash=sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd \ + --hash=sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742 \ + --hash=sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09 \ + --hash=sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0 \ + --hash=sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932 \ + --hash=sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1 \ + --hash=sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a \ + --hash=sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49 \ + --hash=sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d \ + --hash=sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7 \ + --hash=sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480 \ + --hash=sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89 \ + --hash=sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e \ + --hash=sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b \ + --hash=sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82 \ + --hash=sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb \ + --hash=sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068 \ + --hash=sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8 \ + --hash=sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b \ + --hash=sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb \ + --hash=sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2 \ + --hash=sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11 \ + --hash=sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b \ + --hash=sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc \ + --hash=sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0 \ + --hash=sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497 \ + --hash=sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17 \ + --hash=sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0 \ + --hash=sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2 \ + --hash=sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439 \ + --hash=sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5 \ + --hash=sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac \ + --hash=sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825 \ + 
--hash=sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887 \ + --hash=sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced \ + --hash=sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74 # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp @@ -964,8 +964,9 @@ fugue-sql-antlr==0.2.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # fugue -future==0.18.3 \ - --hash=sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307 +future==1.0.0 \ + --hash=sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216 \ + --hash=sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05 # via # -c release/ray_release/byod/requirements_compiled.txt # petastorm @@ -1295,7 +1296,6 @@ greenlet==3.0.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gevent - # sqlalchemy gsutil==5.27 \ --hash=sha256:681a2d844acdf05fac989da6dd406944ae11cb27a4cf3c9edef74d2585ab5f05 # via @@ -1431,9 +1431,9 @@ jupyterlab-widgets==3.0.11 \ # via # -c release/ray_release/byod/requirements_compiled.txt # ipywidgets -jupytext==1.13.6 \ - --hash=sha256:2160774e30587fb427213231f0267ed070ba4ede41cf6121dbb2b14225eb83ba \ - --hash=sha256:c6c25918ddb6403d0d8504e08d35f6efc447baf0dbeb6a28b73adf39e866a0c4 +jupytext==1.16.3 \ + --hash=sha256:1ebac990461dd9f477ff7feec9e3003fa1acc89f3c16ba01b73f79fd76f01a98 \ + --hash=sha256:870e0d7a716dcb1303df6ad1cec65e3315a20daedd808a55cb3dae2d56e4ed20 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -1680,19 +1680,14 @@ lxml==4.9.4 \ # via # -c release/ray_release/byod/requirements_compiled.txt # sacrebleu -mako==1.3.0 \ - --hash=sha256:57d4e997349f1a92035aa25c17ace371a4213f2ca42f99bee9a602500cfd54d9 \ - --hash=sha256:e3a9d388fd00e87043edbe8792f45880ac0114e9c4adc69f6e9bfb2c55e3b11b - # via - # -c release/ray_release/byod/requirements_compiled.txt - # alembic -markdown-it-py==1.1.0 \ - --hash=sha256:36be6bb3ad987bfdb839f5ba78ddf094552ca38ccbd784ae4f74a4e1419fc6e3 \ - --hash=sha256:98080fc0bc34c4f2bcf0846a096a9429acbd9d5d8e67ed34026c03c61c464389 +markdown-it-py==2.2.0 \ + --hash=sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30 \ + --hash=sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1 # via # -c release/ray_release/byod/requirements_compiled.txt # jupytext # mdit-py-plugins + # rich markupsafe==2.1.3 \ --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ @@ -1757,7 +1752,6 @@ markupsafe==2.1.3 \ # via # -c release/ray_release/byod/requirements_compiled.txt # jinja2 - # mako # werkzeug matplotlib==3.7.4 \ --hash=sha256:0037d066cca1f4bda626c507cddeb6f7da8283bc6a214da2db13ff2162933c52 \ @@ -1829,6 +1823,12 @@ mdit-py-plugins==0.3.5 \ # via # -c release/ray_release/byod/requirements_compiled.txt # jupytext +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via + # -c release/ray_release/byod/requirements_compiled.txt + # markdown-it-py memray==1.10.0 ; platform_system != "Windows" and sys_platform != "darwin" and platform_machine != "aarch64" \ --hash=sha256:0a21745fb516b7a6efcd40aa7487c59e9313fcfc782d0193fcfcf00b48426874 \ --hash=sha256:22f2a47871c172a0539bd72737bb6b294fc10c510464066b825d90fcd3bb4916 \ @@ 
-1868,7 +1868,7 @@ memray==1.10.0 ; platform_system != "Windows" and sys_platform != "darwin" and p # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in -modin==0.22.2 \ +modin==0.22.2 ; python_version < "3.12" \ --hash=sha256:532fe0bfb2dcf06c0ad2d467721ef489fd58bb3ef7150bcf4a7ddd1069be1e4d \ --hash=sha256:fa897dc59d5b9a8496be044185689fdd337b9f26cc81c4144b217a2a94d029bc # via @@ -1952,81 +1952,97 @@ msgpack==1.0.7 \ # via # -c release/ray_release/byod/requirements_compiled.txt # locust -multidict==6.0.4 \ - --hash=sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9 \ - --hash=sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8 \ - --hash=sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03 \ - --hash=sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710 \ - --hash=sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161 \ - --hash=sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664 \ - --hash=sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569 \ - --hash=sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067 \ - --hash=sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313 \ - --hash=sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706 \ - --hash=sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2 \ - --hash=sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636 \ - --hash=sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49 \ - --hash=sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93 \ - --hash=sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603 \ - --hash=sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0 \ - --hash=sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60 \ - --hash=sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4 \ - --hash=sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e \ - --hash=sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1 \ - --hash=sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60 \ - --hash=sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951 \ - --hash=sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc \ - --hash=sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe \ - --hash=sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95 \ - --hash=sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d \ - --hash=sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8 \ - --hash=sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed \ - --hash=sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2 \ - --hash=sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775 \ - --hash=sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87 \ - --hash=sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c \ - --hash=sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2 \ - --hash=sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98 \ - --hash=sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3 \ - 
--hash=sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe \ - --hash=sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78 \ - --hash=sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660 \ - --hash=sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176 \ - --hash=sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e \ - --hash=sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988 \ - --hash=sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c \ - --hash=sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c \ - --hash=sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0 \ - --hash=sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449 \ - --hash=sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f \ - --hash=sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde \ - --hash=sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5 \ - --hash=sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d \ - --hash=sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac \ - --hash=sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a \ - --hash=sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9 \ - --hash=sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca \ - --hash=sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11 \ - --hash=sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35 \ - --hash=sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063 \ - --hash=sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b \ - --hash=sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982 \ - --hash=sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258 \ - --hash=sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1 \ - --hash=sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52 \ - --hash=sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480 \ - --hash=sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7 \ - --hash=sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461 \ - --hash=sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d \ - --hash=sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc \ - --hash=sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779 \ - --hash=sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a \ - --hash=sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547 \ - --hash=sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0 \ - --hash=sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171 \ - --hash=sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf \ - --hash=sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d \ - --hash=sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba +multidict==6.0.5 \ + --hash=sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556 \ + --hash=sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c \ + --hash=sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29 \ + 
--hash=sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b \ + --hash=sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8 \ + --hash=sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7 \ + --hash=sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd \ + --hash=sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40 \ + --hash=sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6 \ + --hash=sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3 \ + --hash=sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c \ + --hash=sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9 \ + --hash=sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5 \ + --hash=sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae \ + --hash=sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442 \ + --hash=sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9 \ + --hash=sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc \ + --hash=sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c \ + --hash=sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea \ + --hash=sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5 \ + --hash=sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50 \ + --hash=sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182 \ + --hash=sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453 \ + --hash=sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e \ + --hash=sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600 \ + --hash=sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733 \ + --hash=sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda \ + --hash=sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241 \ + --hash=sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461 \ + --hash=sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e \ + --hash=sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e \ + --hash=sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b \ + --hash=sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e \ + --hash=sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7 \ + --hash=sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386 \ + --hash=sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd \ + --hash=sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9 \ + --hash=sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf \ + --hash=sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee \ + --hash=sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5 \ + --hash=sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a \ + --hash=sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271 \ + --hash=sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54 \ + --hash=sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4 \ + --hash=sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496 \ + 
--hash=sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb \ + --hash=sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319 \ + --hash=sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3 \ + --hash=sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f \ + --hash=sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527 \ + --hash=sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed \ + --hash=sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604 \ + --hash=sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef \ + --hash=sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8 \ + --hash=sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5 \ + --hash=sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5 \ + --hash=sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626 \ + --hash=sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c \ + --hash=sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d \ + --hash=sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c \ + --hash=sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc \ + --hash=sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc \ + --hash=sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b \ + --hash=sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38 \ + --hash=sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450 \ + --hash=sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1 \ + --hash=sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f \ + --hash=sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3 \ + --hash=sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755 \ + --hash=sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226 \ + --hash=sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a \ + --hash=sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046 \ + --hash=sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf \ + --hash=sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479 \ + --hash=sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e \ + --hash=sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1 \ + --hash=sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a \ + --hash=sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83 \ + --hash=sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929 \ + --hash=sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93 \ + --hash=sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a \ + --hash=sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c \ + --hash=sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44 \ + --hash=sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89 \ + --hash=sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba \ + --hash=sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e \ + --hash=sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da \ + 
--hash=sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24 \ + --hash=sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423 \ + --hash=sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp @@ -2219,6 +2235,12 @@ numpy==1.26.4 \ # triad # utilsforecast # xgboost +nvidia-nccl-cu12==2.20.5 \ + --hash=sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56 \ + --hash=sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01 + # via + # -c release/ray_release/byod/requirements_compiled.txt + # xgboost oauth2client==4.1.3 \ --hash=sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac \ --hash=sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6 @@ -2250,6 +2272,7 @@ packaging==23.0 \ # evaluate # fugue-sql-antlr # huggingface-hub + # jupytext # lightning-utilities # matplotlib # modin @@ -2263,7 +2286,7 @@ packaging==23.0 \ # transformers # typepy # utilsforecast -pandas==1.5.3 \ +pandas==1.5.3 ; python_version < "3.12" \ --hash=sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813 \ --hash=sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792 \ --hash=sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406 \ @@ -2710,9 +2733,9 @@ pynvml==11.5.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # deepspeed -pyopenssl==23.0.0 \ - --hash=sha256:c1cc5f86bcacefc84dada7d31175cae1b1518d5f60d3d0bb595a67822a868a6f \ - --hash=sha256:df5fc28af899e74e19fccb5510df423581047e10ab6f1f4ba1763ff5fde844c0 +pyopenssl==24.2.1 \ + --hash=sha256:4247f0dbe3748d560dcbb2ff3ea01af0f9a1a001ef5f7c4c647956ed8cbf0e95 \ + --hash=sha256:967d5719b12b243588573f39b0c677637145c7a1ffedcd495a487e58177fbb8d # via # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin @@ -2963,95 +2986,86 @@ qpd==0.4.4 \ # via # -c release/ray_release/byod/requirements_compiled.txt # fugue -regex==2023.10.3 \ - --hash=sha256:00ba3c9818e33f1fa974693fb55d24cdc8ebafcb2e4207680669d8f8d7cca79a \ - --hash=sha256:00e871d83a45eee2f8688d7e6849609c2ca2a04a6d48fba3dff4deef35d14f07 \ - --hash=sha256:06e9abc0e4c9ab4779c74ad99c3fc10d3967d03114449acc2c2762ad4472b8ca \ - --hash=sha256:0b9ac09853b2a3e0d0082104036579809679e7715671cfbf89d83c1cb2a30f58 \ - --hash=sha256:0d47840dc05e0ba04fe2e26f15126de7c755496d5a8aae4a08bda4dd8d646c54 \ - --hash=sha256:0f649fa32fe734c4abdfd4edbb8381c74abf5f34bc0b3271ce687b23729299ed \ - --hash=sha256:107ac60d1bfdc3edb53be75e2a52aff7481b92817cfdddd9b4519ccf0e54a6ff \ - --hash=sha256:11175910f62b2b8c055f2b089e0fedd694fe2be3941b3e2633653bc51064c528 \ - --hash=sha256:12bd4bc2c632742c7ce20db48e0d99afdc05e03f0b4c1af90542e05b809a03d9 \ - --hash=sha256:16f8740eb6dbacc7113e3097b0a36065a02e37b47c936b551805d40340fb9971 \ - --hash=sha256:1c0e8fae5b27caa34177bdfa5a960c46ff2f78ee2d45c6db15ae3f64ecadde14 \ - --hash=sha256:2c54e23836650bdf2c18222c87f6f840d4943944146ca479858404fedeb9f9af \ - --hash=sha256:3367007ad1951fde612bf65b0dffc8fd681a4ab98ac86957d16491400d661302 \ - --hash=sha256:36362386b813fa6c9146da6149a001b7bd063dabc4d49522a1f7aa65b725c7ec \ - --hash=sha256:39807cbcbe406efca2a233884e169d056c35aa7e9f343d4e78665246a332f597 \ - --hash=sha256:39cdf8d141d6d44e8d5a12a8569d5a227f645c87df4f92179bd06e2e2705e76b \ - --hash=sha256:3b2c3502603fab52d7619b882c25a6850b766ebd1b18de3df23b2f939360e1bd \ - 
--hash=sha256:3ccf2716add72f80714b9a63899b67fa711b654be3fcdd34fa391d2d274ce767 \ - --hash=sha256:3fef4f844d2290ee0ba57addcec17eec9e3df73f10a2748485dfd6a3a188cc0f \ - --hash=sha256:4023e2efc35a30e66e938de5aef42b520c20e7eda7bb5fb12c35e5d09a4c43f6 \ - --hash=sha256:4a3ee019a9befe84fa3e917a2dd378807e423d013377a884c1970a3c2792d293 \ - --hash=sha256:4a8bf76e3182797c6b1afa5b822d1d5802ff30284abe4599e1247be4fd6b03be \ - --hash=sha256:4a992f702c9be9c72fa46f01ca6e18d131906a7180950958f766c2aa294d4b41 \ - --hash=sha256:4c34d4f73ea738223a094d8e0ffd6d2c1a1b4c175da34d6b0de3d8d69bee6bcc \ - --hash=sha256:4cd1bccf99d3ef1ab6ba835308ad85be040e6a11b0977ef7ea8c8005f01a3c29 \ - --hash=sha256:4ef80829117a8061f974b2fda8ec799717242353bff55f8a29411794d635d964 \ - --hash=sha256:58837f9d221744d4c92d2cf7201c6acd19623b50c643b56992cbd2b745485d3d \ - --hash=sha256:5a8f91c64f390ecee09ff793319f30a0f32492e99f5dc1c72bc361f23ccd0a9a \ - --hash=sha256:5addc9d0209a9afca5fc070f93b726bf7003bd63a427f65ef797a931782e7edc \ - --hash=sha256:6239d4e2e0b52c8bd38c51b760cd870069f0bdf99700a62cd509d7a031749a55 \ - --hash=sha256:66e2fe786ef28da2b28e222c89502b2af984858091675044d93cb50e6f46d7af \ - --hash=sha256:69c0771ca5653c7d4b65203cbfc5e66db9375f1078689459fe196fe08b7b4930 \ - --hash=sha256:6ac965a998e1388e6ff2e9781f499ad1eaa41e962a40d11c7823c9952c77123e \ - --hash=sha256:6c56c3d47da04f921b73ff9415fbaa939f684d47293f071aa9cbb13c94afc17d \ - --hash=sha256:6f85739e80d13644b981a88f529d79c5bdf646b460ba190bffcaf6d57b2a9863 \ - --hash=sha256:706e7b739fdd17cb89e1fbf712d9dc21311fc2333f6d435eac2d4ee81985098c \ - --hash=sha256:741ba2f511cc9626b7561a440f87d658aabb3d6b744a86a3c025f866b4d19e7f \ - --hash=sha256:7434a61b158be563c1362d9071358f8ab91b8d928728cd2882af060481244c9e \ - --hash=sha256:76066d7ff61ba6bf3cb5efe2428fc82aac91802844c022d849a1f0f53820502d \ - --hash=sha256:7979b834ec7a33aafae34a90aad9f914c41fd6eaa8474e66953f3f6f7cbd4368 \ - --hash=sha256:7eece6fbd3eae4a92d7c748ae825cbc1ee41a89bb1c3db05b5578ed3cfcfd7cb \ - --hash=sha256:7ef1e014eed78ab650bef9a6a9cbe50b052c0aebe553fb2881e0453717573f52 \ - --hash=sha256:81dce2ddc9f6e8f543d94b05d56e70d03a0774d32f6cca53e978dc01e4fc75b8 \ - --hash=sha256:82fcc1f1cc3ff1ab8a57ba619b149b907072e750815c5ba63e7aa2e1163384a4 \ - --hash=sha256:8d1f21af4c1539051049796a0f50aa342f9a27cde57318f2fc41ed50b0dbc4ac \ - --hash=sha256:90a79bce019c442604662d17bf69df99090e24cdc6ad95b18b6725c2988a490e \ - --hash=sha256:9145f092b5d1977ec8c0ab46e7b3381b2fd069957b9862a43bd383e5c01d18c2 \ - --hash=sha256:91dc1d531f80c862441d7b66c4505cd6ea9d312f01fb2f4654f40c6fdf5cc37a \ - --hash=sha256:979c24cbefaf2420c4e377ecd1f165ea08cc3d1fbb44bdc51bccbbf7c66a2cb4 \ - --hash=sha256:994645a46c6a740ee8ce8df7911d4aee458d9b1bc5639bc968226763d07f00fa \ - --hash=sha256:9b98b7681a9437262947f41c7fac567c7e1f6eddd94b0483596d320092004533 \ - --hash=sha256:9c6b4d23c04831e3ab61717a707a5d763b300213db49ca680edf8bf13ab5d91b \ - --hash=sha256:9c6d0ced3c06d0f183b73d3c5920727268d2201aa0fe6d55c60d68c792ff3588 \ - --hash=sha256:9fd88f373cb71e6b59b7fa597e47e518282455c2734fd4306a05ca219a1991b0 \ - --hash=sha256:a8f4e49fc3ce020f65411432183e6775f24e02dff617281094ba6ab079ef0915 \ - --hash=sha256:a9e908ef5889cda4de038892b9accc36d33d72fb3e12c747e2799a0e806ec841 \ - --hash=sha256:ad08a69728ff3c79866d729b095872afe1e0557251da4abb2c5faff15a91d19a \ - --hash=sha256:adbccd17dcaff65704c856bd29951c58a1bd4b2b0f8ad6b826dbd543fe740988 \ - --hash=sha256:b0c7d2f698e83f15228ba41c135501cfe7d5740181d5903e250e47f617eb4292 \ - 
--hash=sha256:b3ab05a182c7937fb374f7e946f04fb23a0c0699c0450e9fb02ef567412d2fa3 \ - --hash=sha256:b6104f9a46bd8743e4f738afef69b153c4b8b592d35ae46db07fc28ae3d5fb7c \ - --hash=sha256:ba7cd6dc4d585ea544c1412019921570ebd8a597fabf475acc4528210d7c4a6f \ - --hash=sha256:bc72c231f5449d86d6c7d9cc7cd819b6eb30134bb770b8cfdc0765e48ef9c420 \ - --hash=sha256:bce8814b076f0ce5766dc87d5a056b0e9437b8e0cd351b9a6c4e1134a7dfbda9 \ - --hash=sha256:be5e22bbb67924dea15039c3282fa4cc6cdfbe0cbbd1c0515f9223186fc2ec5f \ - --hash=sha256:be6b7b8d42d3090b6c80793524fa66c57ad7ee3fe9722b258aec6d0672543fd0 \ - --hash=sha256:bfe50b61bab1b1ec260fa7cd91106fa9fece57e6beba05630afe27c71259c59b \ - --hash=sha256:bff507ae210371d4b1fe316d03433ac099f184d570a1a611e541923f78f05037 \ - --hash=sha256:c148bec483cc4b421562b4bcedb8e28a3b84fcc8f0aa4418e10898f3c2c0eb9b \ - --hash=sha256:c15ad0aee158a15e17e0495e1e18741573d04eb6da06d8b84af726cfc1ed02ee \ - --hash=sha256:c2169b2dcabf4e608416f7f9468737583ce5f0a6e8677c4efbf795ce81109d7c \ - --hash=sha256:c55853684fe08d4897c37dfc5faeff70607a5f1806c8be148f1695be4a63414b \ - --hash=sha256:c65a3b5330b54103e7d21cac3f6bf3900d46f6d50138d73343d9e5b2900b2353 \ - --hash=sha256:c7964c2183c3e6cce3f497e3a9f49d182e969f2dc3aeeadfa18945ff7bdd7051 \ - --hash=sha256:cc3f1c053b73f20c7ad88b0d1d23be7e7b3901229ce89f5000a8399746a6e039 \ - --hash=sha256:ce615c92d90df8373d9e13acddd154152645c0dc060871abf6bd43809673d20a \ - --hash=sha256:d29338556a59423d9ff7b6eb0cb89ead2b0875e08fe522f3e068b955c3e7b59b \ - --hash=sha256:d8a993c0a0ffd5f2d3bda23d0cd75e7086736f8f8268de8a82fbc4bd0ac6791e \ - --hash=sha256:d9c727bbcf0065cbb20f39d2b4f932f8fa1631c3e01fcedc979bd4f51fe051c5 \ - --hash=sha256:dac37cf08fcf2094159922edc7a2784cfcc5c70f8354469f79ed085f0328ebdf \ - --hash=sha256:dd829712de97753367153ed84f2de752b86cd1f7a88b55a3a775eb52eafe8a94 \ - --hash=sha256:e54ddd0bb8fb626aa1f9ba7b36629564544954fff9669b15da3610c22b9a0991 \ - --hash=sha256:e77c90ab5997e85901da85131fd36acd0ed2221368199b65f0d11bca44549711 \ - --hash=sha256:ebedc192abbc7fd13c5ee800e83a6df252bec691eb2c4bedc9f8b2e2903f5e2a \ - --hash=sha256:ef71561f82a89af6cfcbee47f0fabfdb6e63788a9258e913955d89fdd96902ab \ - --hash=sha256:f0a47efb1dbef13af9c9a54a94a0b814902e547b7f21acb29434504d18f36e3a \ - --hash=sha256:f4f2ca6df64cbdd27f27b34f35adb640b5d2d77264228554e68deda54456eb11 \ - --hash=sha256:fb02e4257376ae25c6dd95a5aec377f9b18c09be6ebdefa7ad209b9137b73d48 +regex==2024.5.15 \ + --hash=sha256:0721931ad5fe0dda45d07f9820b90b2148ccdd8e45bb9e9b42a146cb4f695649 \ + --hash=sha256:10002e86e6068d9e1c91eae8295ef690f02f913c57db120b58fdd35a6bb1af35 \ + --hash=sha256:10e4ce0dca9ae7a66e6089bb29355d4432caed736acae36fef0fdd7879f0b0cb \ + --hash=sha256:119af6e56dce35e8dfb5222573b50c89e5508d94d55713c75126b753f834de68 \ + --hash=sha256:1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5 \ + --hash=sha256:13cdaf31bed30a1e1c2453ef6015aa0983e1366fad2667657dbcac7b02f67133 \ + --hash=sha256:1595f2d10dff3d805e054ebdc41c124753631b6a471b976963c7b28543cf13b0 \ + --hash=sha256:16093f563098448ff6b1fa68170e4acbef94e6b6a4e25e10eae8598bb1694b5d \ + --hash=sha256:1878b8301ed011704aea4c806a3cadbd76f84dece1ec09cc9e4dc934cfa5d4da \ + --hash=sha256:19068a6a79cf99a19ccefa44610491e9ca02c2be3305c7760d3831d38a467a6f \ + --hash=sha256:19dfb1c504781a136a80ecd1fff9f16dddf5bb43cec6871778c8a907a085bb3d \ + --hash=sha256:1b5269484f6126eee5e687785e83c6b60aad7663dafe842b34691157e5083e53 \ + --hash=sha256:1c1c174d6ec38d6c8a7504087358ce9213d4332f6293a94fbf5249992ba54efa \ + 
--hash=sha256:2431b9e263af1953c55abbd3e2efca67ca80a3de8a0437cb58e2421f8184717a \ + --hash=sha256:287eb7f54fc81546346207c533ad3c2c51a8d61075127d7f6d79aaf96cdee890 \ + --hash=sha256:2b4c884767504c0e2401babe8b5b7aea9148680d2e157fa28f01529d1f7fcf67 \ + --hash=sha256:35cb514e137cb3488bce23352af3e12fb0dbedd1ee6e60da053c69fb1b29cc6c \ + --hash=sha256:391d7f7f1e409d192dba8bcd42d3e4cf9e598f3979cdaed6ab11288da88cb9f2 \ + --hash=sha256:3ad070b823ca5890cab606c940522d05d3d22395d432f4aaaf9d5b1653e47ced \ + --hash=sha256:3cd7874d57f13bf70078f1ff02b8b0aa48d5b9ed25fc48547516c6aba36f5741 \ + --hash=sha256:3e507ff1e74373c4d3038195fdd2af30d297b4f0950eeda6f515ae3d84a1770f \ + --hash=sha256:455705d34b4154a80ead722f4f185b04c4237e8e8e33f265cd0798d0e44825fa \ + --hash=sha256:4a605586358893b483976cffc1723fb0f83e526e8f14c6e6614e75919d9862cf \ + --hash=sha256:4babf07ad476aaf7830d77000874d7611704a7fcf68c9c2ad151f5d94ae4bfc4 \ + --hash=sha256:4eee78a04e6c67e8391edd4dad3279828dd66ac4b79570ec998e2155d2e59fd5 \ + --hash=sha256:5397de3219a8b08ae9540c48f602996aa6b0b65d5a61683e233af8605c42b0f2 \ + --hash=sha256:5b5467acbfc153847d5adb21e21e29847bcb5870e65c94c9206d20eb4e99a384 \ + --hash=sha256:5eaa7ddaf517aa095fa8da0b5015c44d03da83f5bd49c87961e3c997daed0de7 \ + --hash=sha256:632b01153e5248c134007209b5c6348a544ce96c46005d8456de1d552455b014 \ + --hash=sha256:64c65783e96e563103d641760664125e91bd85d8e49566ee560ded4da0d3e704 \ + --hash=sha256:64f18a9a3513a99c4bef0e3efd4c4a5b11228b48aa80743be822b71e132ae4f5 \ + --hash=sha256:673b5a6da4557b975c6c90198588181029c60793835ce02f497ea817ff647cb2 \ + --hash=sha256:68811ab14087b2f6e0fc0c2bae9ad689ea3584cad6917fc57be6a48bbd012c49 \ + --hash=sha256:6e8d717bca3a6e2064fc3a08df5cbe366369f4b052dcd21b7416e6d71620dca1 \ + --hash=sha256:71a455a3c584a88f654b64feccc1e25876066c4f5ef26cd6dd711308aa538694 \ + --hash=sha256:72d7a99cd6b8f958e85fc6ca5b37c4303294954eac1376535b03c2a43eb72629 \ + --hash=sha256:7b59138b219ffa8979013be7bc85bb60c6f7b7575df3d56dc1e403a438c7a3f6 \ + --hash=sha256:7dbe2467273b875ea2de38ded4eba86cbcbc9a1a6d0aa11dcf7bd2e67859c435 \ + --hash=sha256:833616ddc75ad595dee848ad984d067f2f31be645d603e4d158bba656bbf516c \ + --hash=sha256:87e2a9c29e672fc65523fb47a90d429b70ef72b901b4e4b1bd42387caf0d6835 \ + --hash=sha256:8fe45aa3f4aa57faabbc9cb46a93363edd6197cbc43523daea044e9ff2fea83e \ + --hash=sha256:9e717956dcfd656f5055cc70996ee2cc82ac5149517fc8e1b60261b907740201 \ + --hash=sha256:9efa1a32ad3a3ea112224897cdaeb6aa00381627f567179c0314f7b65d354c62 \ + --hash=sha256:9ff11639a8d98969c863d4617595eb5425fd12f7c5ef6621a4b74b71ed8726d5 \ + --hash=sha256:a094801d379ab20c2135529948cb84d417a2169b9bdceda2a36f5f10977ebc16 \ + --hash=sha256:a0981022dccabca811e8171f913de05720590c915b033b7e601f35ce4ea7019f \ + --hash=sha256:a0bd000c6e266927cb7a1bc39d55be95c4b4f65c5be53e659537537e019232b1 \ + --hash=sha256:a32b96f15c8ab2e7d27655969a23895eb799de3665fa94349f3b2fbfd547236f \ + --hash=sha256:a81e3cfbae20378d75185171587cbf756015ccb14840702944f014e0d93ea09f \ + --hash=sha256:ac394ff680fc46b97487941f5e6ae49a9f30ea41c6c6804832063f14b2a5a145 \ + --hash=sha256:ada150c5adfa8fbcbf321c30c751dc67d2f12f15bd183ffe4ec7cde351d945b3 \ + --hash=sha256:b2b6f1b3bb6f640c1a92be3bbfbcb18657b125b99ecf141fb3310b5282c7d4ed \ + --hash=sha256:b802512f3e1f480f41ab5f2cfc0e2f761f08a1f41092d6718868082fc0d27143 \ + --hash=sha256:ba68168daedb2c0bab7fd7e00ced5ba90aebf91024dea3c88ad5063c2a562cca \ + --hash=sha256:bfc4f82cabe54f1e7f206fd3d30fda143f84a63fe7d64a81558d6e5f2e5aaba9 \ + 
--hash=sha256:c0c18345010870e58238790a6779a1219b4d97bd2e77e1140e8ee5d14df071aa \ + --hash=sha256:c3bea0ba8b73b71b37ac833a7f3fd53825924165da6a924aec78c13032f20850 \ + --hash=sha256:c486b4106066d502495b3025a0a7251bf37ea9540433940a23419461ab9f2a80 \ + --hash=sha256:c49e15eac7c149f3670b3e27f1f28a2c1ddeccd3a2812cba953e01be2ab9b5fe \ + --hash=sha256:c6a2b494a76983df8e3d3feea9b9ffdd558b247e60b92f877f93a1ff43d26656 \ + --hash=sha256:cab12877a9bdafde5500206d1020a584355a97884dfd388af3699e9137bf7388 \ + --hash=sha256:cac27dcaa821ca271855a32188aa61d12decb6fe45ffe3e722401fe61e323cd1 \ + --hash=sha256:cdd09d47c0b2efee9378679f8510ee6955d329424c659ab3c5e3a6edea696294 \ + --hash=sha256:cf2430df4148b08fb4324b848672514b1385ae3807651f3567871f130a728cc3 \ + --hash=sha256:d0a3d8d6acf0c78a1fff0e210d224b821081330b8524e3e2bc5a68ef6ab5803d \ + --hash=sha256:d0c0c0003c10f54a591d220997dd27d953cd9ccc1a7294b40a4be5312be8797b \ + --hash=sha256:d1f059a4d795e646e1c37665b9d06062c62d0e8cc3c511fe01315973a6542e40 \ + --hash=sha256:d347a741ea871c2e278fde6c48f85136c96b8659b632fb57a7d1ce1872547600 \ + --hash=sha256:d3ee02d9e5f482cc8309134a91eeaacbdd2261ba111b0fef3748eeb4913e6a2c \ + --hash=sha256:d99ceffa25ac45d150e30bd9ed14ec6039f2aad0ffa6bb87a5936f5782fc1569 \ + --hash=sha256:e38a7d4e8f633a33b4c7350fbd8bad3b70bf81439ac67ac38916c4a86b465456 \ + --hash=sha256:e4682f5ba31f475d58884045c1a97a860a007d44938c4c0895f41d64481edbc9 \ + --hash=sha256:e5bb9425fe881d578aeca0b2b4b3d314ec88738706f66f219c194d67179337cb \ + --hash=sha256:e64198f6b856d48192bf921421fdd8ad8eb35e179086e99e99f711957ffedd6e \ + --hash=sha256:e6662686aeb633ad65be2a42b4cb00178b3fbf7b91878f9446075c404ada552f \ + --hash=sha256:ec54d5afa89c19c6dd8541a133be51ee1017a38b412b1321ccb8d6ddbeb4cf7d \ + --hash=sha256:f5b1dff3ad008dccf18e652283f5e5339d70bf8ba7c98bf848ac33db10f7bc7a \ + --hash=sha256:f8ec0c2fea1e886a19c3bee0cd19d862b3aa75dcdfb42ebe8ed30708df64687a \ + --hash=sha256:f9ebd0a36102fcad2f03696e8af4ae682793a5d30b46c647eaf280d6cfb32796 # via # -c release/ray_release/byod/requirements_compiled.txt # diffusers @@ -3080,9 +3094,9 @@ requests==2.31.0 \ # torchtext # transformers # wandb -requests-oauthlib==1.3.1 \ - --hash=sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5 \ - --hash=sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a +requests-oauthlib==2.0.0 \ + --hash=sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36 \ + --hash=sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9 # via # -c release/ray_release/byod/requirements_compiled.txt # google-auth-oauthlib @@ -3098,12 +3112,13 @@ retry-decorator==1.1.1 \ # -c release/ray_release/byod/requirements_compiled.txt # gcs-oauth2-boto-plugin # gsutil -rich==12.6.0 \ - --hash=sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e \ - --hash=sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0 +rich==13.3.2 \ + --hash=sha256:91954fe80cfb7985727a467ca98a7618e5dd15178cc2da10f553b36a93859001 \ + --hash=sha256:a104f37270bf677148d8acb07d33be1569eeee87e2d1beb286a4e9113caf6f2f # via # -c release/ray_release/byod/requirements_compiled.txt # memray + # typer rouge-score==0.1.2 \ --hash=sha256:c7d4da2683e68c9abf0135ef915d63a46643666f848e558a1b9f7ead17ff0f04 # via lm-eval @@ -3127,105 +3142,107 @@ s3transfer==0.6.2 \ sacrebleu==2.4.2 \ --hash=sha256:611a581d205828912f0b05f806b110180087184d3be2dc650fda7a729d6ecb89 # via lm-eval -safetensors==0.4.1 \ - 
--hash=sha256:04157d008385bea66d12fe90844a80d4a76dc25ec5230b5bd9a630496d1b7c03 \ - --hash=sha256:04dd14f53f5500eb4c4149674216ba1000670efbcf4b1b5c2643eb244e7882ea \ - --hash=sha256:097e9af2efa8778cd2f0cba451784253e62fa7cc9fc73c0744d27212f7294e25 \ - --hash=sha256:0bd0afd95c1e497f520e680ea01e0397c0868a3a3030e128438cf6e9e3fcd671 \ - --hash=sha256:0ddd050e01f3e843aa8c1c27bf68675b8a08e385d0045487af4d70418c3cb356 \ - --hash=sha256:16d8bbb7344e39cb9d4762e85c21df94ebeb03edac923dd94bb9ed8c10eac070 \ - --hash=sha256:1a45dbf03e8334d3a5dc93687d98b6dc422f5d04c7d519dac09b84a3c87dd7c6 \ - --hash=sha256:1d568628e9c43ca15eb96c217da73737c9ccb07520fafd8a1eba3f2750614105 \ - --hash=sha256:1faf5111c66a6ba91f85dff2e36edaaf36e6966172703159daeef330de4ddc7b \ - --hash=sha256:2297b359d91126c0f9d4fd17bae3cfa2fe3a048a6971b8db07db746ad92f850c \ - --hash=sha256:2304658e6ada81a5223225b4efe84748e760c46079bffedf7e321763cafb36c9 \ - --hash=sha256:2536b11ce665834201072e9397404170f93f3be10cca9995b909f023a04501ee \ - --hash=sha256:257d59e40a1b367cb544122e7451243d65b33c3f34d822a347f4eea6fdf97fdf \ - --hash=sha256:25a043cbb59d4f75e9dd87fdf5c009dd8830105a2c57ace49b72167dd9808111 \ - --hash=sha256:270b99885ec14abfd56c1d7f28ada81740a9220b4bae960c3de1c6fe84af9e4d \ - --hash=sha256:285b52a481e7ba93e29ad4ec5841ef2c4479ef0a6c633c4e2629e0508453577b \ - --hash=sha256:2b6a2814278b6660261aa9a9aae524616de9f1ec364e3716d219b6ed8f91801f \ - --hash=sha256:2d54c2f1826e790d1eb2d2512bfd0ee443f0206b423d6f27095057c7f18a0687 \ - --hash=sha256:2d87d993eaefe6611a9c241a8bd364a5f1ffed5771c74840363a6c4ed8d868f6 \ - --hash=sha256:2fe6926110e3d425c4b684a4379b7796fdc26ad7d16922ea1696c8e6ea7e920f \ - --hash=sha256:303d2c0415cf15a28f8d7f17379ea3c34c2b466119118a34edd9965983a1a8a6 \ - --hash=sha256:313e8472197bde54e3ec54a62df184c414582979da8f3916981b6a7954910a1b \ - --hash=sha256:35803201d980efcf964b75a0a2aee97fe5e9ecc5f3ad676b38fafdfe98e0620d \ - --hash=sha256:39d36f1d88468a87c437a1bc27c502e71b6ca44c385a9117a9f9ba03a75cc9c6 \ - --hash=sha256:3b0b7b2d5976fbed8a05e2bbdce5816a59e6902e9e7c7e07dc723637ed539787 \ - --hash=sha256:3b30abd0cddfe959d1daedf92edcd1b445521ebf7ddefc20860ed01486b33c90 \ - --hash=sha256:3c1b1d510c7aba71504ece87bf393ea82638df56303e371e5e2cf09d18977dd7 \ - --hash=sha256:3cfd1ca35eacc635f0eaa894e5c5ed83ffebd0f95cac298fd430014fa7323631 \ - --hash=sha256:3f6a520af7f2717c5ecba112041f2c8af1ca6480b97bf957aba81ed9642e654c \ - --hash=sha256:413e1f6ac248f7d1b755199a06635e70c3515493d3b41ba46063dec33aa2ebb7 \ - --hash=sha256:4177b456c6b0c722d82429127b5beebdaf07149d265748e97e0a34ff0b3694c8 \ - --hash=sha256:42c3710cec7e5c764c7999697516370bee39067de0aa089b7e2cfb97ac8c6b20 \ - --hash=sha256:44e230fbbe120de564b64f63ef3a8e6ff02840fa02849d9c443d56252a1646d4 \ - --hash=sha256:48901bd540f8a3c1791314bc5c8a170927bf7f6acddb75bf0a263d081a3637d4 \ - --hash=sha256:53134226053e56bd56e73f7db42596e7908ed79f3c9a1016e4c1dade593ac8e5 \ - --hash=sha256:573b6023a55a2f28085fc0a84e196c779b6cbef4d9e73acea14c8094fee7686f \ - --hash=sha256:5d95ea4d8b32233910734a904123bdd3979c137c461b905a5ed32511defc075f \ - --hash=sha256:5f25297148ec665f0deb8bd67e9564634d8d6841041ab5393ccfe203379ea88b \ - --hash=sha256:645b3f1138fce6e818e79d4128afa28f0657430764cc045419c1d069ff93f732 \ - --hash=sha256:660ca1d8bff6c7bc7c6b30b9b32df74ef3ab668f5df42cefd7588f0d40feadcb \ - --hash=sha256:6ace9e66a40f98a216ad661245782483cf79cf56eb2b112650bb904b0baa9db5 \ - --hash=sha256:6fd80f7794554091836d4d613d33a7d006e2b8d6ba014d06f97cebdfda744f64 \ - 
--hash=sha256:780dc21eb3fd32ddd0e8c904bdb0290f2454f4ac21ae71e94f9ce72db1900a5a \ - --hash=sha256:791edc10a3c359a2f5f52d5cddab0df8a45107d91027d86c3d44e57162e5d934 \ - --hash=sha256:7a8f6f679d97ea0135c7935c202feefbd042c149aa70ee759855e890c01c7814 \ - --hash=sha256:7ef010e9afcb4057fb6be3d0a0cfa07aac04fe97ef73fe4a23138d8522ba7c17 \ - --hash=sha256:7ff8a36e0396776d3ed9a106fc9a9d7c55d4439ca9a056a24bf66d343041d3e6 \ - --hash=sha256:82571d20288c975c1b30b08deb9b1c3550f36b31191e1e81fae87669a92217d0 \ - --hash=sha256:82cbb8f4d022f2e94498cbefca900698b8ded3d4f85212f47da614001ff06652 \ - --hash=sha256:83c2cfbe8c6304f0891e7bb378d56f66d2148972eeb5f747cd8a2246886f0d8c \ - --hash=sha256:845be0aafabf2a60c2d482d4e93023fecffe5e5443d801d7a7741bae9de41233 \ - --hash=sha256:88b4653059c903015284a9722f9a46838c654257173b279c8f6f46dbe80b612d \ - --hash=sha256:8b58ba13a9e82b4bc3fc221914f6ef237fe6c2adb13cede3ace64d1aacf49610 \ - --hash=sha256:8f69903ff49cb30b9227fb5d029bea276ea20d04b06803877a420c5b1b74c689 \ - --hash=sha256:8ff8e41c8037db17de0ea2a23bc684f43eaf623be7d34906fe1ac10985b8365e \ - --hash=sha256:911b48dc09e321a194def3a7431662ff4f03646832f3a8915bbf0f449b8a5fcb \ - --hash=sha256:998fbac99ca956c3a09fe07cc0b35fac26a521fa8865a690686d889f0ff4e4a6 \ - --hash=sha256:9a82bc2bd7a9a0e08239bdd6d7774d64121f136add93dfa344a2f1a6d7ef35fa \ - --hash=sha256:9d16b3b2fcc6fca012c74bd01b5619c655194d3e3c13e4d4d0e446eefa39a463 \ - --hash=sha256:a257de175c254d39ccd6a21341cd62eb7373b05c1e618a78096a56a857e0c316 \ - --hash=sha256:a79e16222106b2f5edbca1b8185661477d8971b659a3c814cc6f15181a9b34c8 \ - --hash=sha256:ae2d5a31cfb8a973a318f7c4d2cffe0bd1fe753cdf7bb41a1939d45a0a06f964 \ - --hash=sha256:ae2f67f04ed0bb2e56fd380a8bd3eef03f609df53f88b6f5c7e89c08e52aae00 \ - --hash=sha256:ae5497adc68669db2fed7cb2dad81e6a6106e79c9a132da3efdb6af1db1014fa \ - --hash=sha256:b287304f2b2220d51ccb51fd857761e78bcffbeabe7b0238f8dc36f2edfd9542 \ - --hash=sha256:b2f8877990a72ff595507b80f4b69036a9a1986a641f8681adf3425d97d3d2a5 \ - --hash=sha256:bb4cb3e37a9b961ddd68e873b29fe9ab4a081e3703412e34aedd2b7a8e9cafd9 \ - --hash=sha256:bbc2ce1f5ae5143a7fb72b71fa71db6a42b4f6cf912aa3acdc6b914084778e68 \ - --hash=sha256:bda3d98e2bcece388232cfc551ebf063b55bdb98f65ab54df397da30efc7dcc5 \ - --hash=sha256:bdc0d039e44a727824639824090bd8869535f729878fa248addd3dc01db30eae \ - --hash=sha256:bfa2e20342b81921b98edba52f8deb68843fa9c95250739a56b52ceda5ea5c61 \ - --hash=sha256:c3807ac3b16288dffebb3474b555b56fe466baa677dfc16290dcd02dca1ab228 \ - --hash=sha256:c3c9f0ca510e0de95abd6424789dcbc879942a3a4e29b0dfa99d9427bf1da75c \ - --hash=sha256:c8ed5d2c04cdc1afc6b3c28d59580448ac07732c50d94c15e14670f9c473a2ce \ - --hash=sha256:cba01c6b76e01ec453933b3b3c0157c59b52881c83eaa0f7666244e71aa75fd1 \ - --hash=sha256:ce7a28bc8af685a69d7e869d09d3e180a275e3281e29cf5f1c7319e231932cc7 \ - --hash=sha256:d10a9f7bae608ccfdc009351f01dc3d8535ff57f9488a58a4c38e45bf954fe93 \ - --hash=sha256:d3ac139377cfe71ba04573f1cda66e663b7c3e95be850e9e6c2dd4b5984bd513 \ - --hash=sha256:d5b3defa74f3723a388bfde2f5d488742bc4879682bd93267c09a3bcdf8f869b \ - --hash=sha256:d784938534e255473155e4d9f276ee69eb85455b6af1292172c731409bf9adee \ - --hash=sha256:d784a98c492c751f228a4a894c3b8a092ff08b24e73b5568938c28b8c0e8f8df \ - --hash=sha256:d8a85e3e47e0d4eebfaf9a58b40aa94f977a56050cb5598ad5396a9ee7c087c6 \ - --hash=sha256:d93321eea0dd7e81b283e47a1d20dee6069165cc158286316d0d06d340de8fe8 \ - --hash=sha256:da52ee0dc8ba03348ffceab767bd8230842fdf78f8a996e2a16445747143a778 \ - 
--hash=sha256:dab431699b5d45e0ca043bc580651ce9583dda594e62e245b7497adb32e99809 \ - --hash=sha256:dac4bb42f8679aadc59bd91a4c5a1784a758ad49d0912995945cd674089f628e \ - --hash=sha256:e056fb9e22d118cc546107f97dc28b449d88274207dd28872bd668c86216e4f6 \ - --hash=sha256:e09000b2599e1836314430f81a3884c66a5cbabdff5d9f175b5d560d4de38d78 \ - --hash=sha256:e0ccb5aa0f3be2727117e5631200fbb3a5b3a2b3757545a92647d6dd8be6658f \ - --hash=sha256:e57a5ab08b0ec7a7caf30d2ac79bb30c89168431aca4f8854464bb9461686925 \ - --hash=sha256:e9a7ffb1e551c6df51d267f5a751f042b183df22690f6feceac8d27364fd51d7 \ - --hash=sha256:e9c80ce0001efa16066358d2dd77993adc25f5a6c61850e4ad096a2232930bce \ - --hash=sha256:eb2c1da1cc39509d1a55620a5f4d14f8911c47a89c926a96e6f4876e864375a3 \ - --hash=sha256:edcf3121890b5f0616aa5a54683b1a5d2332037b970e507d6bb7841a3a596556 \ - --hash=sha256:f603bdd8deac6726d39f41688ed353c532dd53935234405d79e9eb53f152fbfb \ - --hash=sha256:f8934bdfd202ebd0697040a3dff40dd77bc4c5bbf3527ede0532f5e7fb4d970f \ - --hash=sha256:fdb4adb76e21bad318210310590de61c9f4adcef77ee49b4a234f9dc48867869 \ - --hash=sha256:fdb58dee173ef33634c3016c459d671ca12d11e6acf9db008261cbe58107e579 +safetensors==0.4.3 \ + --hash=sha256:018b691383026a2436a22b648873ed11444a364324e7088b99cd2503dd828400 \ + --hash=sha256:01e4b22e3284cd866edeabe4f4d896229495da457229408d2e1e4810c5187121 \ + --hash=sha256:01feb3089e5932d7e662eda77c3ecc389f97c0883c4a12b5cfdc32b589a811c3 \ + --hash=sha256:02318f01e332cc23ffb4f6716e05a492c5f18b1d13e343c49265149396284a44 \ + --hash=sha256:02ef3a24face643456020536591fbd3c717c5abaa2737ec428ccbbc86dffa7a4 \ + --hash=sha256:03a4447c784917c9bf01d8f2ac5080bc15c41692202cd5f406afba16629e84d6 \ + --hash=sha256:084fc436e317f83f7071fc6a62ca1c513b2103db325cd09952914b50f51cf78f \ + --hash=sha256:0bf4f9d6323d9f86eef5567eabd88f070691cf031d4c0df27a40d3b4aaee755b \ + --hash=sha256:0d52c958dc210265157573f81d34adf54e255bc2b59ded6218500c9b15a750eb \ + --hash=sha256:0d5ffc6a80f715c30af253e0e288ad1cd97a3d0086c9c87995e5093ebc075e50 \ + --hash=sha256:0d9cd8e1560dfc514b6d7859247dc6a86ad2f83151a62c577428d5102d872721 \ + --hash=sha256:0dd37306546b58d3043eb044c8103a02792cc024b51d1dd16bd3dd1f334cb3ed \ + --hash=sha256:1139eb436fd201c133d03c81209d39ac57e129f5e74e34bb9ab60f8d9b726270 \ + --hash=sha256:19bbdf95de2cf64f25cd614c5236c8b06eb2cfa47cbf64311f4b5d80224623a3 \ + --hash=sha256:1ab6527a20586d94291c96e00a668fa03f86189b8a9defa2cdd34a1a01acc7d5 \ + --hash=sha256:1b89381517891a7bb7d1405d828b2bf5d75528299f8231e9346b8eba092227f9 \ + --hash=sha256:1f598b713cc1a4eb31d3b3203557ac308acf21c8f41104cdd74bf640c6e538e3 \ + --hash=sha256:22d21760dc6ebae42e9c058d75aa9907d9f35e38f896e3c69ba0e7b213033856 \ + --hash=sha256:22f3b5d65e440cec0de8edaa672efa888030802e11c09b3d6203bff60ebff05a \ + --hash=sha256:2a0deb16a1d3ea90c244ceb42d2c6c276059616be21a19ac7101aa97da448faf \ + --hash=sha256:2a1f4430cc0c9d6afa01214a4b3919d0a029637df8e09675ceef1ca3f0dfa0df \ + --hash=sha256:2d603846a8585b9432a0fd415db1d4c57c0f860eb4aea21f92559ff9902bae4d \ + --hash=sha256:2f85fc50c4e07a21e95c24e07460fe6f7e2859d0ce88092838352b798ce711c2 \ + --hash=sha256:309b10dbcab63269ecbf0e2ca10ce59223bb756ca5d431ce9c9eeabd446569da \ + --hash=sha256:3615a96dd2dcc30eb66d82bc76cda2565f4f7bfa89fcb0e31ba3cea8a1a9ecbb \ + --hash=sha256:38e2a8666178224a51cca61d3cb4c88704f696eac8f72a49a598a93bbd8a4af9 \ + --hash=sha256:393e6e391467d1b2b829c77e47d726f3b9b93630e6a045b1d1fca67dc78bf632 \ + --hash=sha256:3f9cdca09052f585e62328c1c2923c70f46814715c795be65f0b93f57ec98a02 \ + 
--hash=sha256:41a727a7f5e6ad9f1db6951adee21bbdadc632363d79dc434876369a17de6ad6 \ + --hash=sha256:420a98f593ff9930f5822560d14c395ccbc57342ddff3b463bc0b3d6b1951550 \ + --hash=sha256:446e9fe52c051aeab12aac63d1017e0f68a02a92a027b901c4f8e931b24e5397 \ + --hash=sha256:455d538aa1aae4a8b279344a08136d3f16334247907b18a5c3c7fa88ef0d3c46 \ + --hash=sha256:4f9bac020faba7f5dc481e881b14b6425265feabb5bfc552551d21189c0eddc3 \ + --hash=sha256:53c4879b9c6bd7cd25d114ee0ef95420e2812e676314300624594940a8d6a91f \ + --hash=sha256:5757e4688f20df083e233b47de43845d1adb7e17b6cf7da5f8444416fc53828d \ + --hash=sha256:585c9ae13a205807b63bef8a37994f30c917ff800ab8a1ca9c9b5d73024f97ee \ + --hash=sha256:5d07cbca5b99babb692d76d8151bec46f461f8ad8daafbfd96b2fca40cadae65 \ + --hash=sha256:5fc6775529fb9f0ce2266edd3e5d3f10aab068e49f765e11f6f2a63b5367021d \ + --hash=sha256:622afd28968ef3e9786562d352659a37de4481a4070f4ebac883f98c5836563e \ + --hash=sha256:6f9568f380f513a60139971169c4a358b8731509cc19112369902eddb33faa4d \ + --hash=sha256:70a5319ef409e7f88686a46607cbc3c428271069d8b770076feaf913664a07ac \ + --hash=sha256:74707624b81f1b7f2b93f5619d4a9f00934d5948005a03f2c1845ffbfff42212 \ + --hash=sha256:7c4fa560ebd4522adddb71dcd25d09bf211b5634003f015a4b815b7647d62ebe \ + --hash=sha256:7de32d0d34b6623bb56ca278f90db081f85fb9c5d327e3c18fd23ac64f465768 \ + --hash=sha256:840b7ac0eff5633e1d053cc9db12fdf56b566e9403b4950b2dc85393d9b88d67 \ + --hash=sha256:840caf38d86aa7014fe37ade5d0d84e23dcfbc798b8078015831996ecbc206a3 \ + --hash=sha256:8651c7299cbd8b4161a36cd6a322fa07d39cd23535b144d02f1c1972d0c62f3c \ + --hash=sha256:868ad1b6fc41209ab6bd12f63923e8baeb1a086814cb2e81a65ed3d497e0cf8f \ + --hash=sha256:88887f69f7a00cf02b954cdc3034ffb383b2303bc0ab481d4716e2da51ddc10e \ + --hash=sha256:89f9f17b0dacb913ed87d57afbc8aad85ea42c1085bd5de2f20d83d13e9fc4b2 \ + --hash=sha256:8c496c5401c1b9c46d41a7688e8ff5b0310a3b9bae31ce0f0ae870e1ea2b8caf \ + --hash=sha256:8cf18888606dad030455d18f6c381720e57fc6a4170ee1966adb7ebc98d4d6a3 \ + --hash=sha256:8d22c1a10dff3f64d0d68abb8298a3fd88ccff79f408a3e15b3e7f637ef5c980 \ + --hash=sha256:90964917f5b0fa0fa07e9a051fbef100250c04d150b7026ccbf87a34a54012e0 \ + --hash=sha256:9bfb92f82574d9e58401d79c70c716985dc049b635fef6eecbb024c79b2c46ad \ + --hash=sha256:9c6ad011c1b4e3acff058d6b090f1da8e55a332fbf84695cf3100c649cc452d1 \ + --hash=sha256:a11c374eb63a9c16c5ed146457241182f310902bd2a9c18255781bb832b6748b \ + --hash=sha256:a7cef55929dcbef24af3eb40bedec35d82c3c2fa46338bb13ecf3c5720af8a61 \ + --hash=sha256:a844cdb5d7cbc22f5f16c7e2a0271170750763c4db08381b7f696dbd2c78a361 \ + --hash=sha256:ae7613a119a71a497d012ccc83775c308b9c1dab454806291427f84397d852fd \ + --hash=sha256:b1648568667f820b8c48317c7006221dc40aced1869908c187f493838a1362bc \ + --hash=sha256:b1e31be7945f66be23f4ec1682bb47faa3df34cb89fc68527de6554d3c4258a4 \ + --hash=sha256:b277482120df46e27a58082df06a15aebda4481e30a1c21eefd0921ae7e03f65 \ + --hash=sha256:b7ffba80aa49bd09195145a7fd233a7781173b422eeb995096f2b30591639517 \ + --hash=sha256:b852e47eb08475c2c1bd8131207b405793bfc20d6f45aff893d3baaad449ed14 \ + --hash=sha256:bb4f8c5d0358a31e9a08daeebb68f5e161cdd4018855426d3f0c23bb51087055 \ + --hash=sha256:bbae3b4b9d997971431c346edbfe6e41e98424a097860ee872721e176040a893 \ + --hash=sha256:befdf0167ad626f22f6aac6163477fcefa342224a22f11fdd05abb3995c1783c \ + --hash=sha256:c0acbe31340ab150423347e5b9cc595867d814244ac14218932a5cf1dd38eb39 \ + --hash=sha256:c41e1893d1206aa7054029681778d9a58b3529d4c807002c156d58426c225173 \ + 
--hash=sha256:c59d51f182c729f47e841510b70b967b0752039f79f1de23bcdd86462a9b09ee \ + --hash=sha256:cd6fff9e56df398abc5866b19a32124815b656613c1c5ec0f9350906fd798aac \ + --hash=sha256:cdd0a3b5da66e7f377474599814dbf5cbf135ff059cc73694de129b58a5e8a2c \ + --hash=sha256:cf476bca34e1340ee3294ef13e2c625833f83d096cfdf69a5342475602004f95 \ + --hash=sha256:d0dd4a1db09db2dba0f94d15addc7e7cd3a7b0d393aa4c7518c39ae7374623c3 \ + --hash=sha256:d1456f814655b224d4bf6e7915c51ce74e389b413be791203092b7ff78c936dd \ + --hash=sha256:d14d30c25897b2bf19b6fb5ff7e26cc40006ad53fd4a88244fdf26517d852dd7 \ + --hash=sha256:d244bcafeb1bc06d47cfee71727e775bca88a8efda77a13e7306aae3813fa7e4 \ + --hash=sha256:d8815b5e1dac85fc534a97fd339e12404db557878c090f90442247e87c8aeaea \ + --hash=sha256:d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376 \ + --hash=sha256:d8c5093206ef4b198600ae484230402af6713dab1bd5b8e231905d754022bec7 \ + --hash=sha256:d9c289f140a9ae4853fc2236a2ffc9a9f2d5eae0cb673167e0f1b8c18c0961ac \ + --hash=sha256:dcf5705cab159ce0130cd56057f5f3425023c407e170bca60b4868048bae64fd \ + --hash=sha256:e011cc162503c19f4b1fd63dfcddf73739c7a243a17dac09b78e57a00983ab35 \ + --hash=sha256:e066e8861eef6387b7c772344d1fe1f9a72800e04ee9a54239d460c400c72aab \ + --hash=sha256:e0b2104df1579d6ba9052c0ae0e3137c9698b2d85b0645507e6fd1813b70931a \ + --hash=sha256:e375d975159ac534c7161269de24ddcd490df2157b55c1a6eeace6cbb56903f0 \ + --hash=sha256:e4119532cd10dba04b423e0f86aecb96cfa5a602238c0aa012f70c3a40c44b50 \ + --hash=sha256:e7dbbde64b6c534548696808a0e01276d28ea5773bc9a2dfb97a88cd3dffe3df \ + --hash=sha256:e9afd5358719f1b2cf425fad638fc3c887997d6782da317096877e5b15b2ce93 \ + --hash=sha256:ec4b52ce9a396260eb9731eb6aea41a7320de22ed73a1042c2230af0212758ce \ + --hash=sha256:edb5698a7bc282089f64c96c477846950358a46ede85a1c040e0230344fdde10 \ + --hash=sha256:ee463219d9ec6c2be1d331ab13a8e0cd50d2f32240a81d498266d77d07b7e71e \ + --hash=sha256:efcc860be094b8d19ac61b452ec635c7acb9afa77beb218b1d7784c6d41fe8ad \ + --hash=sha256:f5e6883af9a68c0028f70a4c19d5a6ab6238a379be36ad300a22318316c00cb0 \ + --hash=sha256:f9650713b2cfa9537a2baf7dd9fee458b24a0aaaa6cafcea8bdd5fb2b8efdc34 \ + --hash=sha256:faefeb3b81bdfb4e5a55b9bbdf3d8d8753f65506e1d67d03f5c851a6c87150e9 \ + --hash=sha256:fb9c65bd82f9ef3ce4970dc19ee86be5f6f93d032159acf35e663c6bea02b237 \ + --hash=sha256:fe746d03ed8d193674a26105e4f0fe6c726f5bb602ffc695b409eaf02f04763d \ + --hash=sha256:fef5d70683643618244a4f5221053567ca3e77c2531e42ad48ae05fae909f542 # via # -c release/ray_release/byod/requirements_compiled.txt # accelerate @@ -3346,9 +3363,9 @@ sentencepiece==0.1.96 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in -sentry-sdk==1.37.1 \ - --hash=sha256:7cd324dd2877fdc861f75cba4242bce23a58272a6fea581fcb218bb718bd9cc5 \ - --hash=sha256:a249c7364827ee89daaa078bb8b56ece0b3d52d9130961bef2302b79bdf7fe70 +sentry-sdk==2.10.0 \ + --hash=sha256:545fcc6e36c335faa6d6cda84669b6e17025f31efbf3b2211ec14efe008b75d1 \ + --hash=sha256:87b3d413c87d8e7f816cc9334bff255a83d8b577db2b22042651c30c19c09190 # via # -c release/ray_release/byod/requirements_compiled.txt # wandb @@ -3444,6 +3461,12 @@ setproctitle==1.3.3 \ # via # -c release/ray_release/byod/requirements_compiled.txt # wandb +shellingham==1.5.4 \ + --hash=sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686 \ + --hash=sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de + # via + # -c release/ray_release/byod/requirements_compiled.txt + # typer 
six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 @@ -3470,50 +3493,15 @@ smmap==5.0.1 \ # via # -c release/ray_release/byod/requirements_compiled.txt # gitdb -sniffio==1.3.0 \ - --hash=sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101 \ - --hash=sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384 +sniffio==1.3.1 \ + --hash=sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2 \ + --hash=sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc # via # -c release/ray_release/byod/requirements_compiled.txt # anyio -sqlalchemy==1.4.17 \ - --hash=sha256:196fb6bb2733834e506c925d7532f8eabad9d2304deef738a40846e54c31e236 \ - --hash=sha256:1dd77acbc19bee9c0ba858ff5e4e5d5c60895495c83b4df9bcdf4ad5e9b74f21 \ - --hash=sha256:216ff28fe803885ceb5b131dcee6507d28d255808dd5bcffcb3b5fa75be2e102 \ - --hash=sha256:461a4ea803ce0834822f372617a68ac97f9fa1281f2a984624554c651d7c3ae1 \ - --hash=sha256:4b09191ed22af149c07a880f309b7740f3f782ff13325bae5c6168a6aa57e715 \ - --hash=sha256:4c5e20666b33b03bf7f14953f0deb93007bf8c1342e985bd7c7cf25f46fac579 \ - --hash=sha256:4d93b62e98248e3e1ac1e91c2e6ee1e7316f704be1f734338b350b6951e6c175 \ - --hash=sha256:5732858e56d32fa7e02468f4fd2d8f01ddf709e5b93d035c637762890f8ed8b6 \ - --hash=sha256:58c02d1771bb0e61bc9ced8f3b36b5714d9ece8fd4bdbe2a44a892574c3bbc3c \ - --hash=sha256:651cdb3adcee13624ba22d5ff3e96f91e16a115d2ca489ddc16a8e4c217e8509 \ - --hash=sha256:6fe1c8dc26bc0005439cb78ebc78772a22cccc773f5a0e67cb3002d791f53f0f \ - --hash=sha256:7222f3236c280fab3a2d76f903b493171f0ffc29667538cc388a5d5dd0216a88 \ - --hash=sha256:7dc3d3285fb682316d580d84e6e0840fdd8ffdc05cb696db74b9dd746c729908 \ - --hash=sha256:7e45043fe11d503e1c3f9dcf5b42f92d122a814237cd9af68a11dae46ecfcae1 \ - --hash=sha256:7eb55d5583076c03aaf1510473fad2a61288490809049cb31028af56af7068ee \ - --hash=sha256:82922a320d38d7d6aa3a8130523ec7e8c70fa95f7ca7d0fd6ec114b626e4b10b \ - --hash=sha256:8e133e2551fa99c75849848a4ac08efb79930561eb629dd7d2dc9b7ee05256e6 \ - --hash=sha256:949ac299903d2ed8419086f81847381184e2264f3431a33af4679546dcc87f01 \ - --hash=sha256:a2d225c8863a76d15468896dc5af36f1e196b403eb9c7e0151e77ffab9e7df57 \ - --hash=sha256:a5f00a2be7d777119e15ccfb5ba0b2a92e8a193959281089d79821a001095f80 \ - --hash=sha256:b0ad951a6e590bbcfbfeadc5748ef5ec8ede505a8119a71b235f7481cc08371c \ - --hash=sha256:b59b2c0a3b1d93027f6b6b8379a50c354483fe1ebe796c6740e157bb2e06d39a \ - --hash=sha256:bc89e37c359dcd4d75b744e5e81af128ba678aa2ecea4be957e80e6e958a1612 \ - --hash=sha256:bde055c019e6e449ebc4ec61abd3e08690abeb028c7ada2a3b95d8e352b7b514 \ - --hash=sha256:c367ed95d41df584f412a9419b5ece85b0d6c2a08a51ae13ae47ef74ff9a9349 \ - --hash=sha256:dde05ae0987e43ec84e64d6722ce66305eda2a5e2b7d6fda004b37aabdfbb909 \ - --hash=sha256:ee6e7ca09ff274c55d19a1e15ee6f884fa0230c0d9b8d22a456e249d08dee5bf \ - --hash=sha256:f1c68f7bd4a57ffdb85eab489362828dddf6cd565a4c18eda4c446c1d5d3059d \ - --hash=sha256:f63e1f531a8bf52184e2afb53648511f3f8534decb7575b483a583d3cd8d13ed \ - --hash=sha256:fdad4a33140b77df61d456922b7974c1f1bb2c35238f6809f078003a620c4734 - # via - # -c release/ray_release/byod/requirements_compiled.txt - # alembic - # dataset -sqlglot==20.4.0 \ - --hash=sha256:401a2933298cf66901704cf2029272d8243ee72ac47b9fd8784254401b43ee43 \ - --hash=sha256:9a42135d0530de8150a2c5106e0c52abd3396d92501ebe97df7b371d20de5dc9 +sqlglot==25.6.1 \ + 
--hash=sha256:c1fcbaa00429979f16fb8cea20279a8b3f5312e76d97abb8f8c6a9b21be450d7 \ + --hash=sha256:ea40f3bf8452e2c1a696fe120163190bd67e49b346336e7db6d34400b57b7601 # via # -c release/ray_release/byod/requirements_compiled.txt # fugue @@ -3571,9 +3559,9 @@ statsmodels==0.14.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # statsforecast -sympy==1.12 \ - --hash=sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5 \ - --hash=sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8 +sympy==1.13.1 \ + --hash=sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f \ + --hash=sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8 # via # -c release/ray_release/byod/requirements_compiled.txt # torch @@ -3764,17 +3752,12 @@ tokenizers==0.15.2 \ # via # -c release/ray_release/byod/requirements_compiled.txt # transformers -toml==0.10.2 \ - --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ - --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f - # via - # -c release/ray_release/byod/requirements_compiled.txt - # jupytext tomli==2.0.1 \ --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f # via # -c release/ray_release/byod/requirements_compiled.txt + # jupytext # pytest torch==2.3.0 \ --hash=sha256:09c81c5859a5b819956c6925a405ef1cdda393c9d8a01ce3851453f699d3358c \ @@ -3881,9 +3864,9 @@ tqdm-multiprocess==0.0.11 \ --hash=sha256:3ebdf03e7a675150fa0bbceaa9c3c64b8cb556e9ffafa4fe6c078e51820524aa \ --hash=sha256:a74002a1222ea9cbe8cdc9bd460108c6009be359621fbee9b92d0515d4d180f7 # via lm-eval -traitlets==5.14.0 \ - --hash=sha256:f14949d23829023013c47df20b4a76ccd1a85effb786dc060f34de7948361b33 \ - --hash=sha256:fcdaa8ac49c04dfa0ed3ee3384ef6dfdb5d6f3741502be247279407679296772 +traitlets==5.14.3 \ + --hash=sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7 \ + --hash=sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f # via # -c release/ray_release/byod/requirements_compiled.txt # comm @@ -3900,9 +3883,9 @@ transformers==4.36.2 \ # -r release/ray_release/byod/requirements_ml_byod_3.9.in # lm-eval # peft -triad==0.9.3 \ - --hash=sha256:1862b5a78deb9d475c7747b605f2b32457e96c6719f8cbc4e7e95147f34f6f64 \ - --hash=sha256:e4dff41ffbb98bad4d9741c9dd632890cdfe0b873f23d76d2b5f9ca41d4440a7 +triad==0.9.8 \ + --hash=sha256:2c0ba7d83977c6d4e7b59e3cc70727f858014ef7676c62d184aa8e63f7bef5de \ + --hash=sha256:5b67673124891981daf8afbab44b2e6358932ca35ef3ff38a25bc3e0f6f03f17 # via # -c release/ray_release/byod/requirements_compiled.txt # adagio @@ -3930,9 +3913,9 @@ typepy[datetime]==1.3.2 \ # dataproperty # pytablewriter # tabledata -typer==0.9.0 \ - --hash=sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2 \ - --hash=sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee +typer==0.12.3 \ + --hash=sha256:070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914 \ + --hash=sha256:49e73131481d804288ef62598d97a1ceef3058905aa536a1134f90891ba35482 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -3942,7 +3925,6 @@ typing-extensions==4.8.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in - # alembic # fastapi # huggingface-hub # ipython @@ -3954,9 +3936,9 @@ 
typing-extensions==4.8.0 \ # torch # typer # wandb -urllib3==1.26.18 \ - --hash=sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07 \ - --hash=sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0 +urllib3==1.26.19 \ + --hash=sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3 \ + --hash=sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -3965,9 +3947,9 @@ urllib3==1.26.18 \ # requests # responses # sentry-sdk -utilsforecast==0.0.23 \ - --hash=sha256:188daa121c528965e26a3a38f409b66a15f9eef2b44684cc9426f3ddb1146841 \ - --hash=sha256:290882da47ebc7887663c05c46c67e19bc63898220be444ca6173d0a5fdeee4a +utilsforecast==0.2.0 \ + --hash=sha256:3db4245da4e361f26c8eaeef216c2d1206b20defbb033bf11d3e66ce2b1d6ef8 \ + --hash=sha256:a4825bf8da547e3dc552f9b9a7a8159341a118c3a5d122191f09bc3683cba433 # via # -c release/ray_release/byod/requirements_compiled.txt # statsforecast @@ -3992,9 +3974,9 @@ wandb==0.17.0 \ # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in -wcwidth==0.2.12 \ - --hash=sha256:f01c104efdf57971bcb756f054dd58ddec5204dd15fa31d6503ea57947d97c02 \ - --hash=sha256:f26ec43d96c8cbfed76a5075dac87680124fa84e0855195a6184da9c187f133c +wcwidth==0.2.13 \ + --hash=sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 \ + --hash=sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5 # via # -c release/ray_release/byod/requirements_compiled.txt # prompt-toolkit @@ -4011,13 +3993,15 @@ widgetsnbextension==4.0.11 \ # via # -c release/ray_release/byod/requirements_compiled.txt # ipywidgets -xgboost==1.7.6 \ - --hash=sha256:127cf1f5e2ec25cd41429394c6719b87af1456ce583e89f0bffd35d02ad18bcb \ - --hash=sha256:1c527554a400445e0c38186039ba1a00425dcdb4e40b37eed0e74cb39a159c47 \ - --hash=sha256:281c3c6f4fbed2d36bf95cd02a641afa95e72e9abde70064056da5e76233e8df \ - --hash=sha256:4c34675b4d2678c624ddde5d45361e7e16046923e362e4e609b88353e6b87124 \ - --hash=sha256:59b4b366d2cafc7f645e87d897983a5b59be02876194b1d213bd8d8b811d8ce8 \ - --hash=sha256:b1d5db49b199152d62bd9217c98760207d3de86d2b9d243260c573ffe638f80a +xgboost==2.1.0 \ + --hash=sha256:19d145eb847b070c32342b1bf2d7331c102783e07a484f8b13b7d759d707c6b0 \ + --hash=sha256:43b16205689249d7509daf7a6ab00ad0e6c570b3a9c263cb32b26e39d9477bb3 \ + --hash=sha256:7144980923e76ce741c7b03a14d3bd7514db6de5c7cabe96ba95b229d274f5ca \ + --hash=sha256:73673c9bb85927db7fe2e3aed6df6d35dba708cfd6767cc63d4ea11dda2dede5 \ + --hash=sha256:74904b91c42524a6c32147fe5718569e78fb65911ff4499b053f81d0964514d4 \ + --hash=sha256:840a0c6e2119d8c8f260a5dace996ea064a267f62b301a25d7d452488a7ac860 \ + --hash=sha256:b2a456eb0f3d3e8fd8ab37e44ac288292bf8ea8744c294be9fd88713d27af810 \ + --hash=sha256:cedc2e386e686795735448fd4597533acacc5ba6fb47dd910c204c468b80bb96 # via # -c release/ray_release/byod/requirements_compiled.txt # -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -4134,103 +4118,103 @@ xxhash==3.4.1 \ # -c release/ray_release/byod/requirements_compiled.txt # datasets # evaluate -yarl==1.9.3 \ - --hash=sha256:09c19e5f4404574fcfb736efecf75844ffe8610606f3fccc35a1515b8b6712c4 \ - --hash=sha256:0ab5baaea8450f4a3e241ef17e3d129b2143e38a685036b075976b9c415ea3eb \ - --hash=sha256:0d155a092bf0ebf4a9f6f3b7a650dc5d9a5bbb585ef83a52ed36ba46f55cc39d \ - 
--hash=sha256:126638ab961633f0940a06e1c9d59919003ef212a15869708dcb7305f91a6732 \ - --hash=sha256:1a0a4f3aaa18580038cfa52a7183c8ffbbe7d727fe581300817efc1e96d1b0e9 \ - --hash=sha256:1d93461e2cf76c4796355494f15ffcb50a3c198cc2d601ad8d6a96219a10c363 \ - --hash=sha256:26a1a8443091c7fbc17b84a0d9f38de34b8423b459fb853e6c8cdfab0eacf613 \ - --hash=sha256:271d63396460b6607b588555ea27a1a02b717ca2e3f2cf53bdde4013d7790929 \ - --hash=sha256:28a108cb92ce6cf867690a962372996ca332d8cda0210c5ad487fe996e76b8bb \ - --hash=sha256:29beac86f33d6c7ab1d79bd0213aa7aed2d2f555386856bb3056d5fdd9dab279 \ - --hash=sha256:2c757f64afe53a422e45e3e399e1e3cf82b7a2f244796ce80d8ca53e16a49b9f \ - --hash=sha256:2dad8166d41ebd1f76ce107cf6a31e39801aee3844a54a90af23278b072f1ccf \ - --hash=sha256:2dc72e891672343b99db6d497024bf8b985537ad6c393359dc5227ef653b2f17 \ - --hash=sha256:2f3c8822bc8fb4a347a192dd6a28a25d7f0ea3262e826d7d4ef9cc99cd06d07e \ - --hash=sha256:32435d134414e01d937cd9d6cc56e8413a8d4741dea36af5840c7750f04d16ab \ - --hash=sha256:3cfa4dbe17b2e6fca1414e9c3bcc216f6930cb18ea7646e7d0d52792ac196808 \ - --hash=sha256:3d5434b34100b504aabae75f0622ebb85defffe7b64ad8f52b8b30ec6ef6e4b9 \ - --hash=sha256:4003f380dac50328c85e85416aca6985536812c082387255c35292cb4b41707e \ - --hash=sha256:44e91a669c43f03964f672c5a234ae0d7a4d49c9b85d1baa93dec28afa28ffbd \ - --hash=sha256:4a14907b597ec55740f63e52d7fee0e9ee09d5b9d57a4f399a7423268e457b57 \ - --hash=sha256:4ce77d289f8d40905c054b63f29851ecbfd026ef4ba5c371a158cfe6f623663e \ - --hash=sha256:4d6d74a97e898c1c2df80339aa423234ad9ea2052f66366cef1e80448798c13d \ - --hash=sha256:51382c72dd5377861b573bd55dcf680df54cea84147c8648b15ac507fbef984d \ - --hash=sha256:525cd69eff44833b01f8ef39aa33a9cc53a99ff7f9d76a6ef6a9fb758f54d0ff \ - --hash=sha256:53ec65f7eee8655bebb1f6f1607760d123c3c115a324b443df4f916383482a67 \ - --hash=sha256:5f74b015c99a5eac5ae589de27a1201418a5d9d460e89ccb3366015c6153e60a \ - --hash=sha256:6280353940f7e5e2efaaabd686193e61351e966cc02f401761c4d87f48c89ea4 \ - --hash=sha256:632c7aeb99df718765adf58eacb9acb9cbc555e075da849c1378ef4d18bf536a \ - --hash=sha256:6465d36381af057d0fab4e0f24ef0e80ba61f03fe43e6eeccbe0056e74aadc70 \ - --hash=sha256:66a6dbf6ca7d2db03cc61cafe1ee6be838ce0fbc97781881a22a58a7c5efef42 \ - --hash=sha256:6d350388ba1129bc867c6af1cd17da2b197dff0d2801036d2d7d83c2d771a682 \ - --hash=sha256:7217234b10c64b52cc39a8d82550342ae2e45be34f5bff02b890b8c452eb48d7 \ - --hash=sha256:721ee3fc292f0d069a04016ef2c3a25595d48c5b8ddc6029be46f6158d129c92 \ - --hash=sha256:72a57b41a0920b9a220125081c1e191b88a4cdec13bf9d0649e382a822705c65 \ - --hash=sha256:73cc83f918b69110813a7d95024266072d987b903a623ecae673d1e71579d566 \ - --hash=sha256:778df71c8d0c8c9f1b378624b26431ca80041660d7be7c3f724b2c7a6e65d0d6 \ - --hash=sha256:79e1df60f7c2b148722fb6cafebffe1acd95fd8b5fd77795f56247edaf326752 \ - --hash=sha256:7c86d0d0919952d05df880a1889a4f0aeb6868e98961c090e335671dea5c0361 \ - --hash=sha256:7eaf13af79950142ab2bbb8362f8d8d935be9aaf8df1df89c86c3231e4ff238a \ - --hash=sha256:828235a2a169160ee73a2fcfb8a000709edf09d7511fccf203465c3d5acc59e4 \ - --hash=sha256:8535e111a064f3bdd94c0ed443105934d6f005adad68dd13ce50a488a0ad1bf3 \ - --hash=sha256:88d2c3cc4b2f46d1ba73d81c51ec0e486f59cc51165ea4f789677f91a303a9a7 \ - --hash=sha256:8a2538806be846ea25e90c28786136932ec385c7ff3bc1148e45125984783dc6 \ - --hash=sha256:8dab30b21bd6fb17c3f4684868c7e6a9e8468078db00f599fb1c14e324b10fca \ - --hash=sha256:8f18a7832ff85dfcd77871fe677b169b1bc60c021978c90c3bb14f727596e0ae \ - 
--hash=sha256:946db4511b2d815979d733ac6a961f47e20a29c297be0d55b6d4b77ee4b298f6 \ - --hash=sha256:96758e56dceb8a70f8a5cff1e452daaeff07d1cc9f11e9b0c951330f0a2396a7 \ - --hash=sha256:9a172c3d5447b7da1680a1a2d6ecdf6f87a319d21d52729f45ec938a7006d5d8 \ - --hash=sha256:9a5211de242754b5e612557bca701f39f8b1a9408dff73c6db623f22d20f470e \ - --hash=sha256:9df9a0d4c5624790a0dea2e02e3b1b3c69aed14bcb8650e19606d9df3719e87d \ - --hash=sha256:aa4643635f26052401750bd54db911b6342eb1a9ac3e74f0f8b58a25d61dfe41 \ - --hash=sha256:aed37db837ecb5962469fad448aaae0f0ee94ffce2062cf2eb9aed13328b5196 \ - --hash=sha256:af52725c7c39b0ee655befbbab5b9a1b209e01bb39128dce0db226a10014aacc \ - --hash=sha256:b0b8c06afcf2bac5a50b37f64efbde978b7f9dc88842ce9729c020dc71fae4ce \ - --hash=sha256:b61e64b06c3640feab73fa4ff9cb64bd8182de52e5dc13038e01cfe674ebc321 \ - --hash=sha256:b7831566595fe88ba17ea80e4b61c0eb599f84c85acaa14bf04dd90319a45b90 \ - --hash=sha256:b8bc5b87a65a4e64bc83385c05145ea901b613d0d3a434d434b55511b6ab0067 \ - --hash=sha256:b8d51817cf4b8d545963ec65ff06c1b92e5765aa98831678d0e2240b6e9fd281 \ - --hash=sha256:b9f9cafaf031c34d95c1528c16b2fa07b710e6056b3c4e2e34e9317072da5d1a \ - --hash=sha256:bb72d2a94481e7dc7a0c522673db288f31849800d6ce2435317376a345728225 \ - --hash=sha256:c25ec06e4241e162f5d1f57c370f4078797ade95c9208bd0c60f484834f09c96 \ - --hash=sha256:c405d482c320a88ab53dcbd98d6d6f32ada074f2d965d6e9bf2d823158fa97de \ - --hash=sha256:c4472fe53ebf541113e533971bd8c32728debc4c6d8cc177f2bff31d011ec17e \ - --hash=sha256:c4b1efb11a8acd13246ffb0bee888dd0e8eb057f8bf30112e3e21e421eb82d4a \ - --hash=sha256:c5f3faeb8100a43adf3e7925d556801d14b5816a0ac9e75e22948e787feec642 \ - --hash=sha256:c6f034386e5550b5dc8ded90b5e2ff7db21f0f5c7de37b6efc5dac046eb19c10 \ - --hash=sha256:c99ddaddb2fbe04953b84d1651149a0d85214780e4d0ee824e610ab549d98d92 \ - --hash=sha256:ca6b66f69e30f6e180d52f14d91ac854b8119553b524e0e28d5291a724f0f423 \ - --hash=sha256:cccdc02e46d2bd7cb5f38f8cc3d9db0d24951abd082b2f242c9e9f59c0ab2af3 \ - --hash=sha256:cd49a908cb6d387fc26acee8b7d9fcc9bbf8e1aca890c0b2fdfd706057546080 \ - --hash=sha256:cf7a4e8de7f1092829caef66fd90eaf3710bc5efd322a816d5677b7664893c93 \ - --hash=sha256:cfd77e8e5cafba3fb584e0f4b935a59216f352b73d4987be3af51f43a862c403 \ - --hash=sha256:d34c4f80956227f2686ddea5b3585e109c2733e2d4ef12eb1b8b4e84f09a2ab6 \ - --hash=sha256:d61a0ca95503867d4d627517bcfdc28a8468c3f1b0b06c626f30dd759d3999fd \ - --hash=sha256:d81657b23e0edb84b37167e98aefb04ae16cbc5352770057893bd222cdc6e45f \ - --hash=sha256:d92d897cb4b4bf915fbeb5e604c7911021a8456f0964f3b8ebbe7f9188b9eabb \ - --hash=sha256:dd318e6b75ca80bff0b22b302f83a8ee41c62b8ac662ddb49f67ec97e799885d \ - --hash=sha256:dd952b9c64f3b21aedd09b8fe958e4931864dba69926d8a90c90d36ac4e28c9a \ - --hash=sha256:e0e7e83f31e23c5d00ff618045ddc5e916f9e613d33c5a5823bc0b0a0feb522f \ - --hash=sha256:e0f17d1df951336a02afc8270c03c0c6e60d1f9996fcbd43a4ce6be81de0bd9d \ - --hash=sha256:e2a16ef5fa2382af83bef4a18c1b3bcb4284c4732906aa69422cf09df9c59f1f \ - --hash=sha256:e36021db54b8a0475805acc1d6c4bca5d9f52c3825ad29ae2d398a9d530ddb88 \ - --hash=sha256:e73db54c967eb75037c178a54445c5a4e7461b5203b27c45ef656a81787c0c1b \ - --hash=sha256:e741bd48e6a417bdfbae02e088f60018286d6c141639359fb8df017a3b69415a \ - --hash=sha256:f7271d6bd8838c49ba8ae647fc06469137e1c161a7ef97d778b72904d9b68696 \ - --hash=sha256:fc391e3941045fd0987c77484b2799adffd08e4b6735c4ee5f054366a2e1551d \ - --hash=sha256:fc94441bcf9cb8c59f51f23193316afefbf3ff858460cb47b5758bf66a14d130 \ - 
--hash=sha256:fe34befb8c765b8ce562f0200afda3578f8abb159c76de3ab354c80b72244c41 \ - --hash=sha256:fe8080b4f25dfc44a86bedd14bc4f9d469dfc6456e6f3c5d9077e81a5fedfba7 \ - --hash=sha256:ff34cb09a332832d1cf38acd0f604c068665192c6107a439a92abfd8acf90fe2 +yarl==1.9.4 \ + --hash=sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51 \ + --hash=sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce \ + --hash=sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559 \ + --hash=sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0 \ + --hash=sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81 \ + --hash=sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc \ + --hash=sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4 \ + --hash=sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c \ + --hash=sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130 \ + --hash=sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136 \ + --hash=sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e \ + --hash=sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec \ + --hash=sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7 \ + --hash=sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1 \ + --hash=sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455 \ + --hash=sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099 \ + --hash=sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129 \ + --hash=sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10 \ + --hash=sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142 \ + --hash=sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98 \ + --hash=sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa \ + --hash=sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7 \ + --hash=sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525 \ + --hash=sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c \ + --hash=sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9 \ + --hash=sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c \ + --hash=sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8 \ + --hash=sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b \ + --hash=sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf \ + --hash=sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23 \ + --hash=sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd \ + --hash=sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27 \ + --hash=sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f \ + --hash=sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece \ + --hash=sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434 \ + --hash=sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec \ + --hash=sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff \ + --hash=sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78 \ + --hash=sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d \ + 
--hash=sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863 \ + --hash=sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53 \ + --hash=sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31 \ + --hash=sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15 \ + --hash=sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5 \ + --hash=sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b \ + --hash=sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57 \ + --hash=sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3 \ + --hash=sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1 \ + --hash=sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f \ + --hash=sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad \ + --hash=sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c \ + --hash=sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7 \ + --hash=sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2 \ + --hash=sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b \ + --hash=sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2 \ + --hash=sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b \ + --hash=sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9 \ + --hash=sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be \ + --hash=sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e \ + --hash=sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984 \ + --hash=sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4 \ + --hash=sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074 \ + --hash=sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2 \ + --hash=sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392 \ + --hash=sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91 \ + --hash=sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541 \ + --hash=sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf \ + --hash=sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572 \ + --hash=sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66 \ + --hash=sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575 \ + --hash=sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14 \ + --hash=sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5 \ + --hash=sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1 \ + --hash=sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e \ + --hash=sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551 \ + --hash=sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17 \ + --hash=sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead \ + --hash=sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0 \ + --hash=sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe \ + --hash=sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234 \ + --hash=sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0 \ + 
--hash=sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7 \ + --hash=sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34 \ + --hash=sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42 \ + --hash=sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385 \ + --hash=sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78 \ + --hash=sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be \ + --hash=sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958 \ + --hash=sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749 \ + --hash=sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec # via # -c release/ray_release/byod/requirements_compiled.txt # aiohttp -zipp==3.17.0 \ - --hash=sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31 \ - --hash=sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0 +zipp==3.19.2 \ + --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ + --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c # via # -c release/ray_release/byod/requirements_compiled.txt # importlib-metadata diff --git a/release/ray_release/cluster_manager/cluster_manager.py b/release/ray_release/cluster_manager/cluster_manager.py index fac34cc00eb6..3f42da467f91 100644 --- a/release/ray_release/cluster_manager/cluster_manager.py +++ b/release/ray_release/cluster_manager/cluster_manager.py @@ -108,8 +108,13 @@ def _annotate_cluster_compute( return cluster_compute cluster_compute = cluster_compute.copy() - aws = cluster_compute.get("aws", {}) - cluster_compute["aws"] = add_tags_to_aws_config( + if "aws" in cluster_compute: + raise ValueError( + "aws field is invalid in compute config, " + "use advanced_configurations_json instead" + ) + aws = cluster_compute.get("advanced_configurations_json", {}) + cluster_compute["advanced_configurations_json"] = add_tags_to_aws_config( aws, extra_tags, RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING ) return cluster_compute diff --git a/release/ray_release/cluster_manager/minimal.py b/release/ray_release/cluster_manager/minimal.py index 8b202c247753..1cfe14c1e2f2 100644 --- a/release/ray_release/cluster_manager/minimal.py +++ b/release/ray_release/cluster_manager/minimal.py @@ -9,7 +9,7 @@ from ray_release.logger import logger from ray_release.cluster_manager.cluster_manager import ClusterManager from ray_release.util import format_link, anyscale_cluster_env_build_url -from retry import retry +from ray_release.retry import retry REPORT_S = 30.0 @@ -20,7 +20,12 @@ class MinimalClusterManager(ClusterManager): Builds app config and compute template but does not start or stop session. 
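+
+    Cluster env creation is retried on ClusterEnvCreateError (at most two
+    attempts, with a delay between them) via ray_release.retry.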
""" - @retry((ClusterEnvCreateError), delay=10, jitter=5, tries=2) + @retry( + init_delay_sec=10, + jitter_sec=5, + max_retry_count=2, + exceptions=(ClusterEnvCreateError,), + ) def create_cluster_env(self): assert self.cluster_env_id is None diff --git a/release/ray_release/config.py b/release/ray_release/config.py index 93b69e56601c..c38fa4b2f7e4 100644 --- a/release/ray_release/config.py +++ b/release/ray_release/config.py @@ -38,6 +38,11 @@ RELEASE_TEST_SCHEMA_FILE = bazel_runfile("release/ray_release/schema.json") +RELEASE_TEST_CONFIG_FILES = [ + "release/release_tests.yaml", + "release/release_data_tests.yaml", +] + def read_and_validate_release_test_collection( config_files: List[str], @@ -76,11 +81,22 @@ def _test_definition_invariant( def parse_test_definition(test_definitions: List[TestDefinition]) -> List[Test]: + default_definition = {} tests = [] for test_definition in test_definitions: + if test_definition["name"] == "DEFAULTS": + default_definition = copy.deepcopy(test_definition) + continue + + # Add default values to the test definition. + test_definition = deep_update( + copy.deepcopy(default_definition), test_definition + ) + if "variations" not in test_definition: tests.append(Test(test_definition)) continue + variations = test_definition.pop("variations") _test_definition_invariant( test_definition, diff --git a/release/ray_release/retry.py b/release/ray_release/retry.py new file mode 100644 index 000000000000..dec0bd9be925 --- /dev/null +++ b/release/ray_release/retry.py @@ -0,0 +1,42 @@ +"""Utils on retry.""" + +import time +from functools import wraps +from typing import Tuple + +# Default configuration for retry. +_DEFAULT_MAX_RETRY_COUNT: int = 10 +_DEFAULT_INIT_DELAY_SEC: int = 1 +_DEFAULT_MAX_DELAY_SEC: int = 30 +_DEFAULT_BACKOFF: int = 2 +_DEFAULT_JITTER_SEC: int = 1 +_DEFAULT_EXCEPTIONS: Tuple[Exception] = (Exception,) + + +def retry( + max_retry_count: int = _DEFAULT_MAX_RETRY_COUNT, + init_delay_sec: int = _DEFAULT_INIT_DELAY_SEC, + max_delay_sec: int = _DEFAULT_MAX_DELAY_SEC, + backoff: int = _DEFAULT_BACKOFF, + jitter_sec: int = _DEFAULT_JITTER_SEC, + exceptions: Tuple[Exception] = _DEFAULT_EXCEPTIONS, +): + def wrapper(fn): + @wraps(fn) + def wrapped(*args, **kwargs): + for cur_retry_count in range(max_retry_count): + try: + return fn(*args, **kwargs) + except exceptions: + if cur_retry_count + 1 == max_retry_count: + raise + + sleep_sec = min( + init_delay_sec * (backoff**cur_retry_count) + jitter_sec, + max_delay_sec, + ) + time.sleep(sleep_sec) + + return wrapped + + return wrapper diff --git a/release/ray_release/scripts/build_pipeline.py b/release/ray_release/scripts/build_pipeline.py index 735ddb2e8cdf..29e448d8f4fd 100644 --- a/release/ray_release/scripts/build_pipeline.py +++ b/release/ray_release/scripts/build_pipeline.py @@ -14,7 +14,10 @@ build_anyscale_base_byod_images, build_anyscale_custom_byod_image, ) -from ray_release.config import read_and_validate_release_test_collection +from ray_release.config import ( + read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, +) from ray_release.configs.global_config import init_global_config from ray_release.exception import ReleaseTestCLIError, ReleaseTestConfigError from ray_release.logger import logger @@ -92,7 +95,7 @@ def main( try: test_collection = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"] + test_collection_file or RELEASE_TEST_CONFIG_FILES ) except ReleaseTestConfigError as e: raise ReleaseTestConfigError( diff --git 
a/release/ray_release/scripts/get_test_summary.py b/release/ray_release/scripts/get_test_summary.py index b4793a860d06..ccb5066713b2 100644 --- a/release/ray_release/scripts/get_test_summary.py +++ b/release/ray_release/scripts/get_test_summary.py @@ -5,7 +5,10 @@ import click from ray_release.buildkite.concurrency import get_test_resources -from ray_release.config import read_and_validate_release_test_collection +from ray_release.config import ( + read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, +) @click.command() @@ -26,7 +29,7 @@ def main(test_collection_file: Optional[str] = None, output: Optional[str] = Non output = output or os.path.join(os.path.dirname(__file__), "test_summary.csv") tests = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"] + test_collection_file or RELEASE_TEST_CONFIG_FILES ) with open(output, "w") as f: diff --git a/release/ray_release/scripts/ray_bisect.py b/release/ray_release/scripts/ray_bisect.py index 46d3c521bb93..15ebd4c9d396 100644 --- a/release/ray_release/scripts/ray_bisect.py +++ b/release/ray_release/scripts/ray_bisect.py @@ -14,7 +14,10 @@ build_anyscale_base_byod_images, build_anyscale_custom_byod_image, ) -from ray_release.config import read_and_validate_release_test_collection +from ray_release.config import ( + read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, +) from ray_release.configs.global_config import init_global_config from ray_release.test import Test from ray_release.test_automation.release_state_machine import ReleaseTestStateMachine @@ -241,7 +244,7 @@ def _obtain_test_result( def _get_test(test_name: str, test_collection_file: Tuple[str]) -> Test: test_collection = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"], + test_collection_file or RELEASE_TEST_CONFIG_FILES, ) return [test for test in test_collection if test["name"] == test_name][0] diff --git a/release/ray_release/scripts/run_release_test.py b/release/ray_release/scripts/run_release_test.py index 13830139554a..e03912b9f681 100644 --- a/release/ray_release/scripts/run_release_test.py +++ b/release/ray_release/scripts/run_release_test.py @@ -9,6 +9,7 @@ as_smoke_test, find_test, read_and_validate_release_test_collection, + RELEASE_TEST_CONFIG_FILES, ) from ray_release.configs.global_config import init_global_config from ray_release.env import DEFAULT_ENVIRONMENT, load_environment, populate_os_env @@ -114,7 +115,7 @@ def main( ) init_global_config(global_config_file) test_collection = read_and_validate_release_test_collection( - test_collection_file or ["release/release_tests.yaml"], + test_collection_file or RELEASE_TEST_CONFIG_FILES, test_definition_root, ) test = find_test(test_collection, test_name) diff --git a/release/ray_release/tests/test_cluster_manager.py b/release/ray_release/tests/test_cluster_manager.py index 170d9b71c1cb..1b652754a96f 100644 --- a/release/ray_release/tests/test_cluster_manager.py +++ b/release/ray_release/tests/test_cluster_manager.py @@ -275,19 +275,20 @@ def testClusterComputeExtraTags(self): # All ResourceTypes as in # ray_release.aws.RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING target_cluster_compute = TEST_CLUSTER_COMPUTE.copy() - target_cluster_compute["aws"] = { + target_cluster_compute["advanced_configurations_json"] = { "TagSpecifications": [ {"ResourceType": "instance", "Tags": [{"Key": "foo", "Value": "bar"}]}, {"ResourceType": "volume", "Tags": [{"Key": "foo", "Value": "bar"}]}, ] } 
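+        # The billing tags must now appear under "advanced_configurations_json";
+        # a top-level "aws" key is rejected by _annotate_cluster_compute.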
        self.assertEqual(
-            self.cluster_manager.cluster_compute["aws"], target_cluster_compute["aws"]
+            self.cluster_manager.cluster_compute["advanced_configurations_json"],
+            target_cluster_compute["advanced_configurations_json"],
         )

         # Test merging with already existing tags
         cluster_compute_with_tags = TEST_CLUSTER_COMPUTE.copy()
-        cluster_compute_with_tags["aws"] = {
+        cluster_compute_with_tags["advanced_configurations_json"] = {
             "TagSpecifications": [
                 {"ResourceType": "fake", "Tags": []},
                 {"ResourceType": "instance", "Tags": [{"Key": "key", "Value": "val"}]},
@@ -299,7 +300,7 @@ def testClusterComputeExtraTags(self):

         # All ResourceTypes as in RELEASE_AWS_RESOURCE_TYPES_TO_TRACK_FOR_BILLING
         target_cluster_compute = TEST_CLUSTER_COMPUTE.copy()
-        target_cluster_compute["aws"] = {
+        target_cluster_compute["advanced_configurations_json"] = {
             "TagSpecifications": [
                 {"ResourceType": "fake", "Tags": []},
                 {
@@ -313,7 +314,8 @@ def testClusterComputeExtraTags(self):
             ]
         }
         self.assertEqual(
-            self.cluster_manager.cluster_compute["aws"], target_cluster_compute["aws"]
+            self.cluster_manager.cluster_compute["advanced_configurations_json"],
+            target_cluster_compute["advanced_configurations_json"],
         )

     @patch("time.sleep", lambda *a, **kw: None)
diff --git a/release/ray_release/tests/test_config.py b/release/ray_release/tests/test_config.py
index f922d6512d99..c7884b116880 100644
--- a/release/ray_release/tests/test_config.py
+++ b/release/ray_release/tests/test_config.py
@@ -14,6 +14,7 @@
 _TEST_COLLECTION_FILES = [
     "release/release_tests.yaml",
+    "release/release_data_tests.yaml",
     "release/ray_release/tests/test_collection_data.yaml",
 ]
@@ -92,6 +93,42 @@ def test_parse_test_definition():
         parse_test_definition([invalid_test_definition])


+def test_parse_test_definition_with_defaults():
+    test_definitions = yaml.safe_load(
+        """
+        - name: DEFAULTS
+          working_dir: default_working_dir
+        - name: sample_test_with_default_working_dir
+          frequency: nightly
+          team: sample
+          cluster:
+            byod:
+              type: gpu
+            cluster_compute: compute.yaml
+          run:
+            timeout: 100
+            script: python script.py
+        - name: sample_test_with_overridden_working_dir
+          working_dir: overridden_working_dir
+          frequency: nightly
+          team: sample
+          cluster:
+            byod:
+              type: gpu
+            cluster_compute: compute.yaml
+          run:
+            timeout: 100
+            script: python script.py
+        """
+    )
+    test_with_default, test_with_override = parse_test_definition(test_definitions)
+    schema = load_schema_file()
+    assert not validate_test(test_with_default, schema)
+    assert not validate_test(test_with_override, schema)
+    assert test_with_default["working_dir"] == "default_working_dir"
+    assert test_with_override["working_dir"] == "overridden_working_dir"
+
+
 def test_schema_validation():
     test = VALID_TEST.copy()
diff --git a/release/ray_release/tests/test_retry.py b/release/ray_release/tests/test_retry.py
new file mode 100644
index 000000000000..b630e19f2dd0
--- /dev/null
+++ b/release/ray_release/tests/test_retry.py
@@ -0,0 +1,75 @@
+from ray_release import retry
+
+import sys
+import pytest
+
+
+def test_retry_with_no_error():
+    invocation_count = 0
+
+    # Function doesn't raise an exception; use a dummy value to check invocation.
+    @retry.retry()
+    def no_error_func() -> int:
+        nonlocal invocation_count
+        invocation_count += 1
+        return 1
+
+    assert no_error_func() == 1
+    assert invocation_count == 1
+
+
+# Test scenario: exception count is less than retry count.
+def test_retry_with_limited_error():
+    invocation_count = 0
+
+    # Function raises once and then succeeds; the retry should absorb the failure.
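+    # retry.retry sleeps min(init_delay_sec * backoff**attempt + jitter_sec,
+    # max_delay_sec) between attempts, so the single failure here costs
+    # roughly a 2 second sleep.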
+    @retry.retry(init_delay_sec=1, jitter_sec=1)
+    def limited_error() -> int:
+        nonlocal invocation_count
+
+        invocation_count += 1
+
+        if invocation_count == 1:
+            raise Exception("Manual exception")
+        return 1
+
+    assert limited_error() == 1
+    assert invocation_count == 2
+
+
+# Test scenario: exception count exceeds retry count.
+def test_retry_with_unlimited_error():
+    invocation_count = 0
+
+    @retry.retry(init_delay_sec=1, jitter_sec=1, backoff=1, max_retry_count=3)
+    def unlimited_error() -> int:
+        nonlocal invocation_count
+
+        invocation_count += 1
+        raise Exception("Manual exception")
+
+    with pytest.raises(Exception, match="Manual exception"):
+        unlimited_error()
+    assert invocation_count == 3
+
+
+def test_retry_on_certain_errors():
+    invocation_count = 0
+
+    # Function raises KeyError once and then succeeds; only KeyError is retried.
+    @retry.retry(init_delay_sec=1, jitter_sec=1, exceptions=(KeyError,))
+    def limited_error() -> int:
+        nonlocal invocation_count
+
+        invocation_count += 1
+
+        if invocation_count == 1:
+            raise KeyError("Manual exception")
+        return 1
+
+    assert limited_error() == 1
+    assert invocation_count == 2
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-sv", __file__]))
diff --git a/release/release_data_tests.yaml b/release/release_data_tests.yaml
new file mode 100644
index 000000000000..9ea55e27af8e
--- /dev/null
+++ b/release/release_data_tests.yaml
@@ -0,0 +1,721 @@
+- name: DEFAULTS
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: multi_node_autoscaling_compute.yaml
+
+###############
+# Reading tests
+###############
+
+- name: read_parquet
+  run:
+    timeout: 3600
+    script: python read_and_consume_benchmark.py s3://ray-benchmark-data/parquet/10TiB --format parquet --iterate
+
+- name: read_images
+  run:
+    timeout: 3600
+    script: python read_and_consume_benchmark.py s3://air-example-data-2/300G-image-data-synthetic-raw --format image --iterate
+
+###############
+# Dataset tests
+###############
+
+- name: count_parquet
+  run:
+    timeout: 600
+    script: python read_and_consume_benchmark.py s3://ray-benchmark-data/parquet/10TiB --format parquet --count
+
+- name: stable_diffusion_benchmark
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_stable_diffusion.sh
+    cluster_compute: stable_diffusion_benchmark_compute.yaml
+
+  run:
+    timeout: 1800
+    script: python stable_diffusion_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: stable_diffusion_benchmark_compute_gce.yaml
+
+- name: streaming_data_ingest_benchmark_1tb
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: data_ingest_benchmark_compute.yaml
+
+  run:
+    timeout: 300
+    script: python data_ingest_benchmark.py --dataset-size-gb=1000 --num-workers=20 --streaming
+    wait_for_nodes:
+      num_nodes: 20
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: data_ingest_benchmark_compute_gce.yaml
+
+- name: streaming_data_ingest_benchmark_100gb_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: data_ingest_benchmark_compute_gpu.yaml
+
+  run:
+    timeout: 300
+    script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu
+    wait_for_nodes:
+      num_nodes: 3
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
+
+# This test case stops the data ingestion iteration early on the GPU actors.
+# This is a common usage pattern in PyTorch Lightning
+# (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches).
+# There was a bug in Ray Data that caused a GPU memory leak (see #3.919).
+# We add this test case to cover this scenario.
+- name: streaming_data_ingest_benchmark_100gb_gpu_early_stop
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: data_ingest_benchmark_compute_gpu.yaml
+
+  run:
+    timeout: 300
+    script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu --early-stop
+    wait_for_nodes:
+      num_nodes: 3
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml
+
+- name: read_images_comparison_microbenchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: single_worker_node_0_head_node_benchmark_compute.yaml
+
+  run:
+    timeout: 1800
+    script: bash run_image_loader_microbenchmark.sh
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: read_images_train_4_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 18000
+    script: python multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 2
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_images_train_4_gpu_worker_chaos
+  group: data-tests
+  working_dir: nightly_tests
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: dataset/multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 18000
+    prepare: python setup_chaos.py --kill-workers --kill-interval 100 --max-to-kill 3 --task-names "ReadImage->Map(wnid_to_index)->Map(crop_and_flip_image)"
+    script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_images_train_4_gpu_node_chaos
+  group: data-tests
+  working_dir: nightly_tests
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: dataset/multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 18000
+    prepare: python setup_chaos.py --kill-interval 200 --max-to-kill 1 --task-names "_RayTrainWorker__execute.get_next"
+    script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1
+
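+  # The gce variation below swaps in a GCE compute config and is only run
+  # manually; the aws variation runs on the nightly schedule.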
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_images_train_16_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_16_workers.yaml
+
+  run:
+    timeout: 18000
+    script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --use-gpu --num-epochs 2
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
+
+- name: read_images_train_16_gpu_preserve_order
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_16_workers.yaml
+
+  run:
+    timeout: 18000
+    script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --preserve-order --use-gpu --num-epochs 2
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
+
+- name: read_parquet_train_4_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_4_workers.yaml
+
+  run:
+    timeout: 3600
+    script: python multi_node_train_benchmark.py --num-workers 4 --file-type parquet --target-worker-gb 50 --use-gpu
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
+
+- name: read_parquet_train_16_gpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_16_workers.yaml
+
+  run:
+    timeout: 3600
+    script: python multi_node_train_benchmark.py --num-workers 16 --file-type parquet --target-worker-gb 50 --use-gpu
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
+
+- name: read_images_train_1_gpu_5_cpu
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: multi_node_train_1g5c.yaml
+
+  run:
+    timeout: 2400
+    script: python multi_node_train_benchmark.py --num-workers 1 --file-type image --use-gpu --num-epochs 2 --skip-train-model --prefetch-batches 16 --batch-size -1 --disable-locality-with-output
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_1g5c_gce.yaml
+
+- name: read_tfrecords_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+      post_build_script: byod_install_mosaicml.sh
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 30 minutes.
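+    # Timeouts are in seconds: 2700 s is 45 minutes, leaving headroom over
+    # the expected ~30 minute runtime.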
+    timeout: 2700
+    script: python read_tfrecords_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: map_batches_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 30 minutes.
+    timeout: 2400
+    script: python map_batches_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: iter_tensor_batches_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 30 minutes.
+    timeout: 2400
+    script: python iter_tensor_batches_benchmark.py
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: single_node_benchmark_compute_gce.yaml
+
+- name: iter_tensor_batches_benchmark_multi_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: multi_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish within 90 minutes.
+    timeout: 5400
+    script: python iter_tensor_batches_benchmark.py --data-size-gb=10
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: multi_node_benchmark_compute_gce.yaml
+
+- name: iter_batches_benchmark_single_node
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    # Expect the benchmark to finish in around 12 minutes.
+ timeout: 1080 + script: python iter_batches_benchmark.py + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: single_node_benchmark_compute_gce.yaml + +- name: dataset_shuffle_random_shuffle_1tb + group: data-tests + working_dir: nightly_tests + + frequency: nightly + team: data + + cluster: + byod: + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + pip: + - ray[default] + cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml + + run: + timeout: 7200 + script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle + wait_for_nodes: + num_nodes: 20 + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml + +- name: dataset_shuffle_sort_1tb + group: data-tests + working_dir: nightly_tests + + frequency: nightly + team: data + stable: False + + cluster: + byod: + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + pip: + - ray[default] + cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml + + run: + timeout: 7200 + script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 + wait_for_nodes: + num_nodes: 20 + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml + + +############################ +# Batch Inference Benchmarks +############################ + +# 10 GB image classification raw images with 1 GPU. +# 1 g4dn.4xlarge +- name: torch_batch_inference_1_gpu_10gb_raw + group: data-tests + working_dir: nightly_tests/dataset + + frequency: nightly + team: data + cluster: + byod: + type: gpu + cluster_compute: compute_gpu_1_cpu_16_aws.yaml + + run: + timeout: 500 + script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw + + alert: default + + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + frequency: manual + cluster: + cluster_compute: compute_gpu_1_cpu_16_gce.yaml + +# 10 GB image classification parquet with 1 GPU. 
+# 1 g4dn.4xlarge
+- name: torch_batch_inference_1_gpu_10gb_parquet
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_gpu_1_cpu_16_aws.yaml
+
+  run:
+    timeout: 500
+    script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_1_cpu_16_gce.yaml
+
+
+# 300 GB image classification raw images with 16 GPUs
+# 4 g4dn.12xlarge
+- name: torch_batch_inference_16_gpu_300gb_raw
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_gpu_4x4_aws.yaml
+
+  run:
+    timeout: 1000
+    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
+
+    wait_for_nodes:
+      num_nodes: 4
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_4x4_gce.yaml
+
+
+- name: chaos_torch_batch_inference_16_gpu_300gb_raw
+  group: data-tests
+  working_dir: nightly_tests
+  stable: false
+
+  frequency: nightly
+  team: data
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: dataset/compute_gpu_4x4_aws.yaml
+
+  run:
+    timeout: 1000
+    prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
+    script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw
+
+    wait_for_nodes:
+      num_nodes: 4
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: dataset/compute_gpu_4x4_gce.yaml
+
+
+# 300 GB image classification parquet data with 16 GPUs
+# 4 g4dn.12xlarge
+- name: torch_batch_inference_16_gpu_300gb_parquet
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: nightly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_gpu_4x4_aws.yaml
+
+  run:
+    timeout: 1000
+    script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
+
+    wait_for_nodes:
+      num_nodes: 4
+
+  alert: default
+
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_compute: compute_gpu_4x4_gce.yaml
+
+# 10 TB image classification parquet data with heterogeneous cluster
+# 10 g4dn.12xlarge, 10 m5.16xlarge
+- name: torch_batch_inference_hetero_10tb_parquet
+  group: data-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: weekly
+  team: data
+
+  cluster:
+    byod:
+      type: gpu
+    cluster_compute: compute_hetero_10x10_aws.yaml
+
+  run:
+    timeout: 2000
+    script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
+
+    wait_for_nodes:
+      num_nodes: 20
+
+  alert: default
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 33dc2486eae6..1c150aedf601 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -106,181 +106,6 @@

   alert: default

-############################
-# Batch Inference Benchmarks
-############################
-
-# 10 GB image classification raw images with 1 GPU.
-# 1 g4dn.4xlarge -- name: torch_batch_inference_1_gpu_10gb_raw - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_1_cpu_16_aws.yaml - - run: - timeout: 500 - script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_1_cpu_16_gce.yaml - -# 10 GB image classification parquet with 1 GPU. -# 1 g4dn.4xlarge -- name: torch_batch_inference_1_gpu_10gb_parquet - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_1_cpu_16_aws.yaml - - run: - timeout: 500 - script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_1_cpu_16_gce.yaml - - -# 300 GB image classification raw images with 16 GPUs -# 4 g4dn.12xlarge -- name: torch_batch_inference_16_gpu_300gb_raw - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_4x4_aws.yaml - - run: - timeout: 1000 - script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw - - wait_for_nodes: - num_nodes: 4 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_4x4_gce.yaml - - -- name: chaos_torch_batch_inference_16_gpu_300gb_raw - group: data-tests - working_dir: nightly_tests - stable: false - - frequency: nightly - team: data - cluster: - byod: - type: gpu - cluster_compute: dataset/compute_gpu_4x4_aws.yaml - - run: - timeout: 1000 - prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30 - script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw - - wait_for_nodes: - num_nodes: 4 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: dataset/compute_gpu_4x4_gce.yaml - - -# 300 GB image classification parquet data with 16 GPUs -# 4 g4dn.12xlarge -- name: torch_batch_inference_16_gpu_300gb_parquet - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: compute_gpu_4x4_aws.yaml - - run: - timeout: 1000 - script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet - - wait_for_nodes: - num_nodes: 4 - - alert: default - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_4x4_gce.yaml - -# 10 TB image classification parquet data with heterogenous cluster -# 10 g4dn.12xlarge, 10 m5.16xlarge -- name: torch_batch_inference_hetero_10tb_parquet - group: data-tests - working_dir: nightly_tests/dataset - - frequency: weekly - team: data - - cluster: - byod: - type: gpu - cluster_compute: compute_hetero_10x10_aws.yaml - - run: - timeout: 2000 - script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet - - wait_for_nodes: - num_nodes: 20 - - 
alert: default - ######################### # AIR release tests @@ -1469,7 +1294,7 @@ group: Golden Notebook tests working_dir: golden_notebook_tests - frequency: nightly-3x + frequency: manual team: ml cluster: @@ -3876,26 +3701,27 @@ # stable: false -############### -# Dataset tests -############### -- name: parquet_metadata_resolution - group: data-tests - working_dir: nightly_tests/dataset +################## +# Core Chaos tests +################## - frequency: nightly - team: data +- name: chaos_many_tasks_kill_raylet + group: core-nightly-test + working_dir: nightly_tests + frequency: nightly + team: core cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - # Expect the test to finish around 40 seconds. - timeout: 100 - script: python parquet_metadata_resolution.py --num-files 915 --cloud aws + timeout: 3600 + wait_for_nodes: + num_nodes: 10 + prepare: python setup_chaos.py --no-start + script: python chaos_test/test_chaos_basic.py --workload=tasks variations: - __suffix__: aws @@ -3903,52 +3729,44 @@ env: gce frequency: manual cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - run: - script: python parquet_metadata_resolution.py --num-files 915 --cloud gcp + cluster_compute: chaos_test/compute_template_gce.yaml -- name: stable_diffusion_benchmark - group: data-tests - working_dir: nightly_tests/dataset +- name: chaos_many_tasks_terminate_instance + group: core-nightly-test + working_dir: nightly_tests frequency: nightly - team: data - + team: core cluster: - byod: - type: gpu - post_build_script: byod_stable_diffusion.sh - cluster_compute: stable_diffusion_benchmark_compute.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - timeout: 1800 - script: python stable_diffusion_benchmark.py + timeout: 3600 + wait_for_nodes: + num_nodes: 10 + prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance + script: python chaos_test/test_chaos_basic.py --workload=tasks variations: - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: stable_diffusion_benchmark_compute_gce.yaml -- name: streaming_data_ingest_benchmark_1tb - group: data-tests - working_dir: nightly_tests/dataset +- name: chaos_many_actors_kill_raylet + group: core-nightly-test + working_dir: nightly_tests frequency: nightly - team: data - + team: core cluster: - byod: - type: gpu - cluster_compute: data_ingest_benchmark_compute.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - timeout: 300 - script: python data_ingest_benchmark.py --dataset-size-gb=1000 --num-workers=20 --streaming + timeout: 4200 wait_for_nodes: - num_nodes: 20 + num_nodes: 10 + prepare: python setup_chaos.py --no-start + script: python chaos_test/test_chaos_basic.py --workload=actors variations: - __suffix__: aws @@ -3956,56 +3774,48 @@ env: gce frequency: manual cluster: - cluster_compute: data_ingest_benchmark_compute_gce.yaml + cluster_compute: chaos_test/compute_template_gce.yaml -- name: streaming_data_ingest_benchmark_100gb_gpu - group: data-tests - working_dir: nightly_tests/dataset +- name: chaos_many_actors_terminate_instance + group: core-nightly-test + working_dir: nightly_tests frequency: nightly - team: data - + team: core cluster: - byod: - type: gpu - cluster_compute: data_ingest_benchmark_compute_gpu.yaml + byod: {} + cluster_compute: chaos_test/compute_template.yaml run: - timeout: 300 - script: python data_ingest_benchmark.py 
--dataset-size-gb=100 --num-workers=4 --streaming --use-gpu + timeout: 4200 wait_for_nodes: - num_nodes: 3 + num_nodes: 10 + prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance + script: python chaos_test/test_chaos_basic.py --workload=actors variations: - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml - -# This test case will early stop the data ingestion iteration on the GPU actors. -# This is a common usage in PyTorch Lightning -# (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches). -# There was a bug in Ray Data that caused GPU memoy leak (see #3.919). -# We add this test case to cover this scenario. -- name: streaming_data_ingest_benchmark_100gb_gpu_early_stop + +- name: chaos_dask_on_ray_large_scale_test_no_spilling group: data-tests - working_dir: nightly_tests/dataset + working_dir: nightly_tests frequency: nightly team: data cluster: byod: - type: gpu - cluster_compute: data_ingest_benchmark_compute_gpu.yaml + runtime_env: + - RAY_lineage_pinning_enabled=1 + cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml run: - timeout: 300 - script: python data_ingest_benchmark.py --dataset-size-gb=100 --num-workers=4 --streaming --use-gpu --early-stop + timeout: 7200 wait_for_nodes: - num_nodes: 3 + num_nodes: 21 + prepare: python setup_chaos.py --kill-interval 100 + script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb + 20 --error_rate 0 --data_save_path /tmp/ray variations: - __suffix__: aws @@ -4013,23 +3823,28 @@ env: gce frequency: manual cluster: - cluster_compute: data_ingest_benchmark_compute_gpu_gce.yaml + cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml -- name: aggregate_benchmark +- name: chaos_dask_on_ray_large_scale_test_spilling group: data-tests - working_dir: nightly_tests/dataset + working_dir: nightly_tests frequency: nightly team: data cluster: byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml + runtime_env: + - RAY_lineage_pinning_enabled=1 + cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml run: - timeout: 1800 - script: python aggregate_benchmark.py + timeout: 7200 + wait_for_nodes: + num_nodes: 21 + prepare: python setup_chaos.py --kill-interval 100 + script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb + 70 --error_rate 0 --data_save_path /tmp/ray variations: - __suffix__: aws @@ -4037,749 +3852,24 @@ env: gce frequency: manual cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml + cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml -- name: read_parquet_benchmark_single_node +- name: chaos_dataset_shuffle_push_based_sort_1tb group: data-tests - working_dir: nightly_tests/dataset + working_dir: nightly_tests + + stable: false frequency: nightly team: data cluster: byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish in 400 seconds. 
- timeout: 400 - script: python read_parquet_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: read_images_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_node_benchmark_compute.yaml - - run: - timeout: 1800 - script: python read_images_benchmark.py --single-node - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -# TODO: Re-enable this test once we fix https://github.com/ray-project/ray/issues/40686. -# - name: read_images_benchmark_multi_node -# group: data-tests -# working_dir: nightly_tests/dataset - -# frequency: nightly-3x -# team: data - -# cluster: -# byod: -# type: gpu -# cluster_compute: multi_node_read_images_benchmark_compute.yaml - -# run: -# timeout: 28800 -# script: python read_images_benchmark.py --multi-node - -# variations: -# - __suffix__: aws -# - __suffix__: gce -# env: gce -# frequency: manual -# cluster: -# cluster_compute: multi_node_read_images_benchmark_compute_gce.yaml - -- name: read_images_comparison_microbenchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_worker_node_0_head_node_benchmark_compute.yaml - - run: - timeout: 1800 - script: bash run_image_loader_microbenchmark.sh - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: read_images_train_4_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_4_workers.yaml - - run: - timeout: 18000 - script: python multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 2 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_images_train_4_gpu_worker_chaos - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: dataset/multi_node_train_4_workers.yaml - - run: - timeout: 18000 - prepare: python setup_chaos.py --kill-workers --kill-interval 100 --max-to-kill 3 --task-names "ReadImage->Map(wnid_to_index)->Map(crop_and_flip_image)" - script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_images_train_4_gpu_node_chaos - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: dataset/multi_node_train_4_workers.yaml - - run: - timeout: 18000 - prepare: python setup_chaos.py --kill-interval 200 --max-to-kill 1 --task-names 
"_RayTrainWorker__execute.get_next" - script: python dataset/multi_node_train_benchmark.py --num-workers 4 --file-type image --use-gpu --num-epochs 1 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_images_train_16_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_16_workers.yaml - - run: - timeout: 18000 - script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --use-gpu --num-epochs 2 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml - -- name: read_images_train_16_gpu_preserve_order - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_16_workers.yaml - - run: - timeout: 18000 - script: python multi_node_train_benchmark.py --num-workers 16 --file-type image --preserve-order --use-gpu --num-epochs 2 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml - -- name: read_parquet_train_4_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_4_workers.yaml - - run: - timeout: 3600 - script: python multi_node_train_benchmark.py --num-workers 4 --file-type parquet --target-worker-gb 50 --use-gpu - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml - -- name: read_parquet_train_16_gpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_16_workers.yaml - - run: - timeout: 3600 - script: python multi_node_train_benchmark.py --num-workers 16 --file-type parquet --target-worker-gb 50 --use-gpu - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: ../../air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml - -- name: read_images_train_1_gpu_5_cpu - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: multi_node_train_1g5c.yaml - - run: - timeout: 2400 - script: python multi_node_train_benchmark.py --num-workers 1 --file-type image --use-gpu --num-epochs 2 --skip-train-model --prefetch-batches 16 --batch-size -1 --disable-locality-with-output - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: compute_gpu_1g5c_gce.yaml - -- name: read_tfrecords_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - post_build_script: byod_install_mosaicml.sh - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to 
finish around 30 minutes. - timeout: 2700 - script: python read_tfrecords_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: map_batches_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish around 30 minutes. - timeout: 2400 - script: python map_batches_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: iter_tensor_batches_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish around 30 minutes. - timeout: 2400 - script: python iter_tensor_batches_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: iter_tensor_batches_benchmark_multi_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: multi_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish within 90 minutes. - timeout: 5400 - script: python iter_tensor_batches_benchmark.py --data-size-gb=10 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: multi_node_benchmark_compute_gce.yaml - -- name: iter_batches_benchmark_single_node - group: data-tests - working_dir: nightly_tests/dataset - - frequency: nightly - team: data - - cluster: - byod: - type: gpu - cluster_compute: single_node_benchmark_compute.yaml - - run: - # Expect the benchmark to finish around 12 minutes. 
- timeout: 1080 - script: python iter_batches_benchmark.py - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: single_node_benchmark_compute_gce.yaml - -- name: dataset_shuffle_random_shuffle_1tb - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_sort_1tb - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - stable: False - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: python dataset/sort.py --num-partitions=1000 --partition-size=1e9 - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_push_based_random_shuffle_1tb - group: data-tests - working_dir: nightly_tests - - stable: false - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_push_based_sort_1tb - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - stable: False - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml - - run: - timeout: 7200 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=1000 --partition-size=1e9 - wait_for_nodes: - num_nodes: 20 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml - -- name: dataset_shuffle_push_based_random_shuffle_100tb - group: data-tests - working_dir: nightly_tests - stable: false - - frequency: weekly - team: data - cluster: - byod: - runtime_env: - - RAY_object_spilling_config={"type":"filesystem","params":{"directory_path":["/tmp/data0","/tmp/data1"]}} - post_build_script: byod_dataset_shuffle.sh - cluster_compute: shuffle/100tb_shuffle_compute.yaml - - run: - timeout: 28800 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=100000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 100 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: 
shuffle/100tb_shuffle_compute_gce.yaml - run: - timeout: 28800 - script: RAY_DATA_PUSH_BASED_SHUFFLE=1 python dataset/sort.py --num-partitions=40000 --partition-size=1e9 --shuffle - wait_for_nodes: - num_nodes: 100 - -################## -# Core Chaos tests -################## - -- name: chaos_many_tasks_kill_raylet - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 3600 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start - script: python chaos_test/test_chaos_basic.py --workload=tasks - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: chaos_test/compute_template_gce.yaml - -- name: chaos_many_tasks_terminate_instance - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 3600 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance - script: python chaos_test/test_chaos_basic.py --workload=tasks - - variations: - - __suffix__: aws - -- name: chaos_many_actors_kill_raylet - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 4200 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start - script: python chaos_test/test_chaos_basic.py --workload=actors - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: chaos_test/compute_template_gce.yaml - -- name: chaos_many_actors_terminate_instance - group: core-nightly-test - working_dir: nightly_tests - - frequency: nightly - team: core - cluster: - byod: {} - cluster_compute: chaos_test/compute_template.yaml - - run: - timeout: 4200 - wait_for_nodes: - num_nodes: 10 - prepare: python setup_chaos.py --no-start --chaos TerminateEC2Instance - script: python chaos_test/test_chaos_basic.py --workload=actors - - variations: - - __suffix__: aws - -- name: chaos_dask_on_ray_large_scale_test_no_spilling - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_lineage_pinning_enabled=1 - cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml - - run: - timeout: 7200 - wait_for_nodes: - num_nodes: 21 - prepare: python setup_chaos.py --kill-interval 100 - script: python dask_on_ray/large_scale_test.py --num_workers 20 --worker_obj_store_size_in_gb - 20 --error_rate 0 --data_save_path /tmp/ray - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml - -- name: chaos_dask_on_ray_large_scale_test_spilling - group: data-tests - working_dir: nightly_tests - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_lineage_pinning_enabled=1 - cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml - - run: - timeout: 7200 - wait_for_nodes: - num_nodes: 21 - prepare: python setup_chaos.py --kill-interval 100 - script: python dask_on_ray/large_scale_test.py --num_workers 150 --worker_obj_store_size_in_gb - 70 --error_rate 0 --data_save_path /tmp/ray - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - 
cluster: - cluster_compute: dask_on_ray/dask_on_ray_stress_compute_gce.yaml - -- name: chaos_dataset_shuffle_push_based_sort_1tb - group: data-tests - working_dir: nightly_tests - - stable: false - - frequency: nightly - team: data - - cluster: - byod: - runtime_env: - - RAY_worker_killing_policy=retriable_lifo - pip: - - ray[default] - cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + pip: + - ray[default] + cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml run: timeout: 7200 @@ -4956,7 +4046,7 @@ group: cluster-launcher-test working_dir: ../python/ray/autoscaler/ - frequency: nightly + frequency: manual team: clusters cluster: byod: {} @@ -4971,7 +4061,7 @@ group: cluster-launcher-test working_dir: ../python/ray/autoscaler/ - frequency: nightly + frequency: manual team: clusters cluster: byod: {} @@ -5041,7 +4131,7 @@ run: timeout: 1200 - script: python launch_and_verify_cluster.py gcp/example-minimal.yaml + script: python launch_and_verify_cluster.py gcp/example-minimal-pinned.yaml - name: gcp_cluster_launcher_full group: cluster-launcher-test @@ -5067,7 +4157,7 @@ stable: true env: gce - frequency: nightly + frequency: manual team: clusters cluster: byod: {} @@ -5084,7 +4174,7 @@ stable: true env: gce - frequency: nightly + frequency: manual team: clusters cluster: byod: {} diff --git a/release/requirements_buildkite.in b/release/requirements_buildkite.in index 96d05e27bb96..0c20af4d9088 100644 --- a/release/requirements_buildkite.in +++ b/release/requirements_buildkite.in @@ -14,7 +14,6 @@ pyyaml pybuildkite PyGithub requests -retry twine == 5.0.0 docker >= 7.1.0 diff --git a/release/requirements_buildkite.txt b/release/requirements_buildkite.txt index 9bfaada37880..659db4c5e8db 100644 --- a/release/requirements_buildkite.txt +++ b/release/requirements_buildkite.txt @@ -424,9 +424,7 @@ debugpy==1.8.2 \ decorator==5.1.1 \ --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 - # via - # ipython - # retry + # via ipython deprecated==1.2.14 \ --hash=sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c \ --hash=sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3 @@ -1174,10 +1172,6 @@ pure-eval==0.2.3 \ --hash=sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 \ --hash=sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42 # via stack-data -py==1.11.0 \ - --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ - --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 - # via retry pyasn1==0.6.0 \ --hash=sha256:3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c \ --hash=sha256:cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473 @@ -1539,10 +1533,6 @@ requests-toolbelt==1.0.0 \ --hash=sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6 \ --hash=sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 # via twine -retry==0.9.2 \ - --hash=sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606 \ - --hash=sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4 - # via -r release/requirements_buildkite.in rfc3986==2.0.0 \ --hash=sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd \ 
--hash=sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c diff --git a/release/rllib_tests/1gpu_16cpus.yaml b/release/rllib_tests/1gpu_16cpus.yaml index 2a0cdea1c0b3..1b11511cdcb8 100644 --- a/release/rllib_tests/1gpu_16cpus.yaml +++ b/release/rllib_tests/1gpu_16cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_16cpus_gce.yaml b/release/rllib_tests/1gpu_16cpus_gce.yaml index f0ad9d505d4a..26b262ab8bed 100644 --- a/release/rllib_tests/1gpu_16cpus_gce.yaml +++ b/release/rllib_tests/1gpu_16cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/1gpu_24cpus.yaml b/release/rllib_tests/1gpu_24cpus.yaml index af4def71489d..d8d8cb4b866a 100644 --- a/release/rllib_tests/1gpu_24cpus.yaml +++ b/release/rllib_tests/1gpu_24cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_24cpus_gce.yaml b/release/rllib_tests/1gpu_24cpus_gce.yaml index ec79552e4984..11e5dc4283d6 100644 --- a/release/rllib_tests/1gpu_24cpus_gce.yaml +++ b/release/rllib_tests/1gpu_24cpus_gce.yaml @@ -24,7 +24,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/1gpu_32cpus.yaml b/release/rllib_tests/1gpu_32cpus.yaml index 660791a6cc2f..d1b1349f284f 100644 --- a/release/rllib_tests/1gpu_32cpus.yaml +++ b/release/rllib_tests/1gpu_32cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_4cpus.yaml b/release/rllib_tests/1gpu_4cpus.yaml index fa1b042a9260..dcc8baf66e9b 100644 --- a/release/rllib_tests/1gpu_4cpus.yaml +++ b/release/rllib_tests/1gpu_4cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/1gpu_4cpus_gce.yaml b/release/rllib_tests/1gpu_4cpus_gce.yaml index b239d699f91c..7613f5062cfa 100644 --- a/release/rllib_tests/1gpu_4cpus_gce.yaml +++ b/release/rllib_tests/1gpu_4cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/2gpus_32cpus.yaml b/release/rllib_tests/2gpus_32cpus.yaml index 1c74596c4c8a..02065ef9dc8f 100644 --- a/release/rllib_tests/2gpus_32cpus.yaml +++ b/release/rllib_tests/2gpus_32cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/2gpus_32cpus_gce.yaml b/release/rllib_tests/2gpus_32cpus_gce.yaml index 7086e8d82fbf..fe56a4b11161 100644 --- a/release/rllib_tests/2gpus_32cpus_gce.yaml +++ b/release/rllib_tests/2gpus_32cpus_gce.yaml @@ -24,7 +24,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git 
a/release/rllib_tests/2gpus_64cpus.yaml b/release/rllib_tests/2gpus_64cpus.yaml index 67392db81700..bd7f534c1fdf 100644 --- a/release/rllib_tests/2gpus_64cpus.yaml +++ b/release/rllib_tests/2gpus_64cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/2gpus_64cpus_gce.yaml b/release/rllib_tests/2gpus_64cpus_gce.yaml index 825124fabbad..484cbf999bd8 100644 --- a/release/rllib_tests/2gpus_64cpus_gce.yaml +++ b/release/rllib_tests/2gpus_64cpus_gce.yaml @@ -28,7 +28,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/32cpus.yaml b/release/rllib_tests/32cpus.yaml index f1e092047f78..d67b810f66e1 100644 --- a/release/rllib_tests/32cpus.yaml +++ b/release/rllib_tests/32cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/32cpus_gce.yaml b/release/rllib_tests/32cpus_gce.yaml index c6d1a6729fa0..466d7fe8602c 100644 --- a/release/rllib_tests/32cpus_gce.yaml +++ b/release/rllib_tests/32cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/4gpus_512_cpus_gce.yaml b/release/rllib_tests/4gpus_512_cpus_gce.yaml index a42e4805795f..92ddfdde89e7 100644 --- a/release/rllib_tests/4gpus_512_cpus_gce.yaml +++ b/release/rllib_tests/4gpus_512_cpus_gce.yaml @@ -24,7 +24,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/4gpus_544_cpus.yaml b/release/rllib_tests/4gpus_544_cpus.yaml index dd30af32c19a..c4559737cc17 100644 --- a/release/rllib_tests/4gpus_544_cpus.yaml +++ b/release/rllib_tests/4gpus_544_cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 5 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/4gpus_64cpus.yaml b/release/rllib_tests/4gpus_64cpus.yaml index c0f4c76ee300..60d03f122263 100644 --- a/release/rllib_tests/4gpus_64cpus.yaml +++ b/release/rllib_tests/4gpus_64cpus.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/4gpus_64cpus_gce.yaml b/release/rllib_tests/4gpus_64cpus_gce.yaml index 82b95a8b4fdc..a4453843a482 100644 --- a/release/rllib_tests/4gpus_64cpus_gce.yaml +++ b/release/rllib_tests/4gpus_64cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/4gpus_96cpus.yaml b/release/rllib_tests/4gpus_96cpus.yaml index 80916596a054..e699e2588b8b 100644 --- a/release/rllib_tests/4gpus_96cpus.yaml +++ b/release/rllib_tests/4gpus_96cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/4gpus_96cpus_gce.yaml 
b/release/rllib_tests/4gpus_96cpus_gce.yaml index 5d1e5b00182c..df83eba5f0f7 100644 --- a/release/rllib_tests/4gpus_96cpus_gce.yaml +++ b/release/rllib_tests/4gpus_96cpus_gce.yaml @@ -19,7 +19,7 @@ gcp_advanced_configurations_json: initialize_params: disk_size_gb: 500 -#aws: +#advanced_configurations_json: # BlockDeviceMappings: # - DeviceName: /dev/sda1 # Ebs: diff --git a/release/rllib_tests/8gpus_96cpus.yaml b/release/rllib_tests/8gpus_96cpus.yaml index d9509c85dbda..614944b6d20c 100644 --- a/release/rllib_tests/8gpus_96cpus.yaml +++ b/release/rllib_tests/8gpus_96cpus.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/rllib_tests/multi_node_checkpointing_compute_config.yaml b/release/rllib_tests/multi_node_checkpointing_compute_config.yaml index 60784554811e..36f37a8738c7 100644 --- a/release/rllib_tests/multi_node_checkpointing_compute_config.yaml +++ b/release/rllib_tests/multi_node_checkpointing_compute_config.yaml @@ -14,7 +14,7 @@ worker_node_types: max_workers: 2 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/serve_tests/compute_tpl_32_cpu.yaml b/release/serve_tests/compute_tpl_32_cpu.yaml index adcb0ee49569..442ba4222534 100644 --- a/release/serve_tests/compute_tpl_32_cpu.yaml +++ b/release/serve_tests/compute_tpl_32_cpu.yaml @@ -20,7 +20,7 @@ worker_node_types: custom_resources: worker: 1 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml b/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml index 001ff48fef96..c65682bf76b8 100644 --- a/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml +++ b/release/serve_tests/compute_tpl_32_cpu_autoscaling.yaml @@ -20,7 +20,7 @@ worker_node_types: max_workers: 35 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml b/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml index ab5b59006a93..851620488b20 100644 --- a/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml +++ b/release/serve_tests/compute_tpl_8_cpu_autoscaling.yaml @@ -26,7 +26,7 @@ worker_node_types: custom_resources: proxy: 1 -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_gpu_node.yaml b/release/serve_tests/compute_tpl_gpu_node.yaml index c1450feb62d4..96f7dc057a45 100644 --- a/release/serve_tests/compute_tpl_gpu_node.yaml +++ b/release/serve_tests/compute_tpl_gpu_node.yaml @@ -13,7 +13,7 @@ worker_node_types: max_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_single_node.yaml b/release/serve_tests/compute_tpl_single_node.yaml index c9f7f84179a0..5f46c956ab7d 100644 --- a/release/serve_tests/compute_tpl_single_node.yaml +++ b/release/serve_tests/compute_tpl_single_node.yaml @@ -10,7 +10,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_single_node_32_cpu.yaml b/release/serve_tests/compute_tpl_single_node_32_cpu.yaml index c9e1ec1d0f7f..d40e806ea617 100644 --- a/release/serve_tests/compute_tpl_single_node_32_cpu.yaml +++ 
b/release/serve_tests/compute_tpl_single_node_32_cpu.yaml @@ -10,7 +10,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/serve_tests/compute_tpl_single_node_k8s.yaml b/release/serve_tests/compute_tpl_single_node_k8s.yaml index bbccdfb95a07..7dd79bb0adf1 100644 --- a/release/serve_tests/compute_tpl_single_node_k8s.yaml +++ b/release/serve_tests/compute_tpl_single_node_k8s.yaml @@ -12,7 +12,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/colocate_trainer/compute_aws.yaml b/release/train_tests/colocate_trainer/compute_aws.yaml index e2542caf5000..abe241ebddfd 100644 --- a/release/train_tests/colocate_trainer/compute_aws.yaml +++ b/release/train_tests/colocate_trainer/compute_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/horovod/compute_tpl_aws.yaml b/release/train_tests/horovod/compute_tpl_aws.yaml index 8424f13fbf61..b7065014230c 100644 --- a/release/train_tests/horovod/compute_tpl_aws.yaml +++ b/release/train_tests/horovod/compute_tpl_aws.yaml @@ -15,7 +15,7 @@ worker_node_types: min_workers: 1 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/horovod/compute_tpl_gce.yaml b/release/train_tests/horovod/compute_tpl_gce.yaml index 31730aac6e79..59e43741f983 100644 --- a/release/train_tests/horovod/compute_tpl_gce.yaml +++ b/release/train_tests/horovod/compute_tpl_gce.yaml @@ -17,7 +17,7 @@ worker_node_types: min_workers: 1 use_spot: false -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/train_tests/multinode_persistence/compute_aws.yaml b/release/train_tests/multinode_persistence/compute_aws.yaml index ad578f3feba0..a0e4116acafb 100644 --- a/release/train_tests/multinode_persistence/compute_aws.yaml +++ b/release/train_tests/multinode_persistence/compute_aws.yaml @@ -14,7 +14,7 @@ worker_node_types: min_workers: 3 use_spot: false -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml b/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml index 7888ba7cbec3..55fa05e163f0 100644 --- a/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml +++ b/release/train_tests/xgboost_lightgbm/compute_aws_10workers.yaml @@ -17,7 +17,7 @@ worker_node_types: min_workers: 10 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml b/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml index 28d8b4a66016..8796876a7de5 100644 --- a/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml +++ b/release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml @@ -17,7 +17,7 @@ worker_node_types: min_workers: 1 use_spot: false -aws: +advanced_configurations_json: BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: diff --git a/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml b/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml index 1672eb48ba78..0307acc970a5 100644 --- a/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml +++ 
b/release/tune_tests/cloud_tests/tpl_aws_1x4.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml b/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml index abe801c950f1..302b975fa5a4 100644 --- a/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml +++ b/release/tune_tests/fault_tolerance_tests/tpl_aws_16x1.yaml @@ -18,7 +18,7 @@ worker_node_types: use_spot: true # Required to allow nodes to terminate themselves. -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml b/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml index dbccfa496b2d..d99976a529e4 100644 --- a/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml +++ b/release/tune_tests/fault_tolerance_tests/tpl_gce_16x1.yaml @@ -20,7 +20,7 @@ worker_node_types: use_spot: true ## Required to allow nodes to terminate themselves. -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml b/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml index 8fed22723702..de26a8b43484 100644 --- a/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml +++ b/release/tune_tests/scalability_tests/tpl_1x32_hd.yaml @@ -9,7 +9,7 @@ head_node_type: worker_node_types: [] -aws: +advanced_configurations_json: TagSpecifications: - ResourceType: "instance" Tags: diff --git a/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml b/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml index 387f47a83437..60ade7f48b9c 100644 --- a/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml +++ b/release/tune_tests/scalability_tests/tpl_gce_1x32_hd.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: [] -#aws: +#advanced_configurations_json: # TagSpecifications: # - ResourceType: "instance" # Tags: diff --git a/rllib/BUILD b/rllib/BUILD index e2ec7386ae0a..dc3fd6830245 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -164,23 +164,24 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete", "torch_only"], size = "large", srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=1"] -) -py_test( - name = "learning_tests_cartpole_appo_gpu", - main = "tuned_examples/appo/cartpole_appo.py", - tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], - size = "large", - srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=0", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-learners=1", "--num-cpus=8", "--num-env-runners=6"] ) +# TODO (sven): For some weird reason, this test runs extremely slow on the CI (not on cluster, not locally) -> taking this out for now ... 
+# py_test( +# name = "learning_tests_cartpole_appo_gpu", +# main = "tuned_examples/appo/cartpole_appo.py", +# tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], +# size = "large", +# srcs = ["tuned_examples/appo/cartpole_appo.py"], +# args = ["--as-test", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] +# ) py_test( name = "learning_tests_cartpole_appo_multi_cpu", main = "tuned_examples/appo/cartpole_appo.py", tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2"] + args = ["--as-test", "--num-learners=2", "--num-cpus=9", "--num-env-runners=6"] ) py_test( name = "learning_tests_cartpole_appo_multi_gpu", @@ -188,7 +189,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], size = "large", srcs = ["tuned_examples/appo/cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) # MultiAgentCartPole py_test( @@ -197,7 +198,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete", "torch_only"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=1"] + args = ["--as-test", "--num-agents=2", "--num-learners=1", "--num-cpus=8", "--num-env-runners=6"] ) py_test( name = "learning_tests_multi_agent_cartpole_appo_gpu", @@ -205,7 +206,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=6"] + args = ["--as-test", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) py_test( name = "learning_tests_multi_agent_cartpole_appo_multi_cpu", @@ -213,7 +214,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=2", "--num-cpus=7"] + args = ["--as-test", "--num-agents=2", "--num-learners=2", "--num-cpus=9", "--num-env-runners=6"] ) py_test( name = "learning_tests_multi_agent_cartpole_appo_multi_gpu", @@ -221,7 +222,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], size = "large", srcs = ["tuned_examples/appo/multi_agent_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7"] + args = ["--as-test", "--num-agents=2", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) # StatelessCartPole py_test( @@ -230,7 +231,7 @@ py_test( 
tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=1"] + args = ["--as-test", "--num-learners=1", "--num-cpus=8", "--num-env-runners=6"] ) py_test( name = "learning_tests_stateless_cartpole_appo_gpu", @@ -238,7 +239,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "gpu"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-agents=2", "--num-learners=0", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) py_test( name = "learning_tests_stateless_cartpole_appo_multi_cpu", @@ -246,7 +247,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2"] + args = ["--as-test", "--num-learners=2", "--num-cpus=9", "--num-env-runners=6"] ) py_test( name = "learning_tests_stateless_cartpole_appo_multi_gpu", @@ -254,7 +255,7 @@ py_test( tags = ["team:rllib", "exclusive", "learning_tests", "torch_only", "learning_tests_discrete", "learning_tests_pytorch_use_all_core", "multi_gpu"], size = "large", srcs = ["tuned_examples/appo/stateless_cartpole_appo.py"], - args = ["--as-test", "--enable-new-api-stack", "--num-learners=2", "--num-gpus-per-learner=1"] + args = ["--as-test", "--num-learners=2", "--num-gpus-per-learner=1", "--num-cpus=7", "--num-env-runners=6"] ) # MultiAgentStatelessCartPole # py_test( @@ -2928,6 +2929,17 @@ py_test( # subdirectory: offline_rl/ # .................................... +# Does run into scheduling problems in CI tests. Works on local +# and GCP cloud. +# py_test( +# name = "examples/offline_rl/cartpole_recording", +# main = "examples/offline_rl/cartpole_recording.py", +# tags = ["team:rllib", "examples", "exclusive"], +# size = "large", +# srcs = ["examples/offline_rl/cartpole_recording.py"], +# args = ["--enable-new-api-stack", "--as-test", "--framework=torch", "--num-cpus=12"], +# ) + py_test( name = "examples/offline_rl/train_w_bc_finetune_w_ppo", main = "examples/offline_rl/train_w_bc_finetune_w_ppo.py", diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py index bf01aff4be89..7ed33f749713 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -129,6 +129,7 @@ NUM_ENV_STEPS_TRAINED_LIFETIME, NUM_EPISODES, NUM_EPISODES_LIFETIME, + NUM_TRAINING_STEP_CALLS_PER_ITERATION, RESTORE_WORKERS_TIMER, RESTORE_EVAL_WORKERS_TIMER, SYNCH_ENV_CONNECTOR_STATES_TIMER, @@ -3215,7 +3216,17 @@ def _run_one_training_iteration(self) -> Tuple[ResultDict, "TrainIterCtx"]: "one single result dict per training iteration." ) - # Only here, reduce the results into a single result dict. + # TODO (sven): Resolve this metric through log_time's future + # ability to compute throughput. + self.metrics.log_value( + NUM_TRAINING_STEP_CALLS_PER_ITERATION, + 1, + reduce="sum", + clear_on_reduce=True, + ) + + # Only here (at the end of the iteration), reduce the results into a single + # result dict. 
return self.metrics.reduce(), train_iter_ctx def _run_one_evaluation( @@ -3527,13 +3538,16 @@ def _compile_iteration_results_new_api_stack(self, *, train_results, eval_result # Fault tolerance stats. results[FAULT_TOLERANCE_STATS] = { "num_healthy_workers": self.env_runner_group.num_healthy_remote_workers(), - "num_in_flight_async_reqs": ( - self.env_runner_group.num_in_flight_async_reqs() - ), "num_remote_worker_restarts": ( self.env_runner_group.num_remote_worker_restarts() ), } + results["env_runner_group"] = { + "actor_manager_num_outstanding_async_reqs": ( + self.env_runner_group.num_in_flight_async_reqs() + ), + } + # Resolve all `Stats` leaves by peeking (get their reduced values). return tree.map_structure( lambda s: s.peek() if isinstance(s, Stats) else s, diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 542240a00dac..a9c3b5598ef5 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -357,6 +357,11 @@ def __init__(self, algo_class: Optional[type] = None): self.num_gpus_per_learner = 0 self.num_cpus_per_learner = 1 self.local_gpu_idx = 0 + # TODO (sven): This probably works even without any restriction + # (allowing for any arbitrary number of requests in-flight). Test with + # 3 first, then with unlimited, and if both show the same behavior on + # an async algo, remove this restriction entirely. + self.max_requests_in_flight_per_learner = 3 # `self.training()` self.gamma = 0.99 @@ -463,6 +468,7 @@ def __init__(self, algo_class: Optional[type] = None): self.output_compress_columns = [Columns.OBS, Columns.NEXT_OBS] self.output_max_file_size = 64 * 1024 * 1024 self.output_max_rows_per_file = None + self.output_write_remaining_data = False self.output_write_method = "write_parquet" self.output_write_method_kwargs = {} self.output_filesystem = None @@ -1798,15 +1804,16 @@ def env_runners( synchronously in turn with their update step (e.g., PPO or DQN). Not relevant for any algos that sample asynchronously, such as APPO or IMPALA. - max_requests_in_flight_per_env_runner: Max number of inflight requests - to each EnvRunner worker. See the FaultTolerantActorManager class for - more details. + max_requests_in_flight_per_env_runner: Max number of in-flight requests + to each EnvRunner (actor). See the + `ray.rllib.utils.actor_manager.FaultTolerantActorManager` class for more + details. Tuning these values is important when running experiments with large sample batches, where there is the risk that the object store may fill up, causing spilling of objects to disk. This can cause any asynchronous requests to become very slow, making your experiment run slowly as well. You can inspect the object store during your experiment via a call to `ray memory` on your head node, and by using the Ray dashboard. If you're seeing that the object store is filling up, turn down the number of remote requests in flight or enable compression. sample_collector: For the old API stack only. The SampleCollector class to @@ -2110,6 +2117,7 @@ def learners( num_cpus_per_learner: Optional[Union[float, int]] = NotProvided, num_gpus_per_learner: Optional[Union[float, int]] = NotProvided, local_gpu_idx: Optional[int] = NotProvided, + max_requests_in_flight_per_learner: Optional[int] = NotProvided, ): """Sets LearnerGroup and Learner worker related configurations. @@ -2135,6 +2143,10 @@ def learners( an index into the available CUDA devices.
For example if `os.environ["CUDA_VISIBLE_DEVICES"] = "1"` and `local_gpu_idx=0`, RLlib uses the GPU with ID=1 on the node. + max_requests_in_flight_per_learner: Max number of in-flight requests + to each Learner (actor). See the + `ray.rllib.utils.actor_manager.FaultTolerantActorManager` class for more + details. Returns: This updated AlgorithmConfig object. @@ -2147,6 +2159,8 @@ self.num_gpus_per_learner = num_gpus_per_learner if local_gpu_idx is not NotProvided: self.local_gpu_idx = local_gpu_idx + if max_requests_in_flight_per_learner is not NotProvided: + self.max_requests_in_flight_per_learner = max_requests_in_flight_per_learner return self @@ -2566,6 +2580,7 @@ def offline_data( output_compress_columns: Optional[List[str]] = NotProvided, output_max_file_size: Optional[float] = NotProvided, output_max_rows_per_file: Optional[int] = NotProvided, + output_write_remaining_data: Optional[bool] = NotProvided, output_write_method: Optional[str] = NotProvided, output_write_method_kwargs: Optional[Dict] = NotProvided, output_filesystem: Optional[str] = NotProvided, @@ -2735,6 +2750,15 @@ to a new file. output_max_rows_per_file: Max output row numbers before rolling over to a new file. + output_write_remaining_data: Determines whether any remaining data in the + recording buffers should be stored to disk. It is only applicable if + `output_max_rows_per_file` is defined. When sampling data, it is + buffered until the threshold specified by `output_max_rows_per_file` + is reached. Only complete multiples of `output_max_rows_per_file` are + written to disk, while any leftover data remains in the buffers. If a + recording session is stopped, residual data may still reside in these + buffers. Setting `output_write_remaining_data` to `True` ensures this + data is flushed to disk. By default, this attribute is set to `False`. output_write_method: Write method for the `ray.data.Dataset` to write the offline data to `output`. The default is `read_parquet` for Parquet files. See https://docs.ray.io/en/latest/data/api/input_output.html for @@ -2842,6 +2866,8 @@ self.output_max_file_size = output_max_file_size if output_max_rows_per_file is not NotProvided: self.output_max_rows_per_file = output_max_rows_per_file + if output_write_remaining_data is not NotProvided: + self.output_write_remaining_data = output_write_remaining_data if output_write_method is not NotProvided: self.output_write_method = output_write_method if output_write_method_kwargs is not NotProvided: diff --git a/rllib/algorithms/appo/appo.py b/rllib/algorithms/appo/appo.py index 37b8fd863c66..5302f7540248 100644 --- a/rllib/algorithms/appo/appo.py +++ b/rllib/algorithms/appo/appo.py @@ -1,13 +1,13 @@ -""" -Asynchronous Proximal Policy Optimization (APPO) -================================================ +"""Asynchronous Proximal Policy Optimization (APPO) -This file defines the distributed Algorithm class for the asynchronous version -of proximal policy optimization (APPO). -See `appo_[tf|torch]_policy.py` for the definition of the policy loss. +The algorithm is described in [1] (under the name of "IMPACT"): Detailed documentation: https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al.
2020 +https://arxiv.org/pdf/1912.00167 """ from typing import Optional, Type @@ -32,8 +32,7 @@ LEARNER_RESULTS_KL_KEY = "mean_kl_loss" LEARNER_RESULTS_CURR_KL_COEFF_KEY = "curr_kl_coeff" -OLD_ACTION_DIST_KEY = "old_action_dist" -OLD_ACTION_DIST_LOGITS_KEY = "old_action_dist_logits" +TARGET_ACTION_DIST_LOGITS_KEY = "target_action_dist_logits" class APPOConfig(IMPALAConfig): @@ -101,25 +100,25 @@ def __init__(self, algo_class=None): # __sphinx_doc_begin__ # APPO specific settings: self.vtrace = True - self.use_critic = True self.use_gae = True self.lambda_ = 1.0 self.clip_param = 0.4 self.use_kl_loss = False self.kl_coeff = 1.0 self.kl_target = 0.01 - # TODO (sven): Activate once v-trace sequences in non-RNN batch are solved. - # If we switch this on right now, the shuffling would destroy the rollout - # sequences (non-zero-padded!) needed in the batch for v-trace. - # self.shuffle_batch_per_epoch = True + self.target_worker_clipping = 2.0 + + # Circular replay buffer settings. + # Used in [1] for discrete action tasks: + # `circular_buffer_num_batches=4` and `circular_buffer_iterations_per_batch=2` + # For cont. action tasks: + # `circular_buffer_num_batches=16` and `circular_buffer_iterations_per_batch=20` + self.circular_buffer_num_batches = 4 + self.circular_buffer_iterations_per_batch = 2 # Override some of IMPALAConfig's default values with APPO-specific values. self.num_env_runners = 2 - self.min_time_s_per_iteration = 10 - self.target_network_update_freq = 1 - self.learner_queue_size = 16 - self.learner_queue_timeout = 300 - self.max_sample_requests_in_flight_per_worker = 2 + self.target_network_update_freq = 2 self.broadcast_interval = 1 self.grad_clip = 40.0 # Note: Only when using enable_rl_module_and_learner=True can the clipping mode @@ -145,26 +144,32 @@ def __init__(self, algo_class=None): self.minibatch_buffer_size = 1 # @OldAPIStack self.replay_proportion = 0.0 # @OldAPIStack self.replay_buffer_num_slots = 100 # @OldAPIStack + self.learner_queue_size = 16 # @OldAPIStack + self.learner_queue_timeout = 300 # @OldAPIStack # Deprecated keys. self.target_update_frequency = DEPRECATED_VALUE + self.use_critic = DEPRECATED_VALUE @override(IMPALAConfig) def training( self, *, vtrace: Optional[bool] = NotProvided, - use_critic: Optional[bool] = NotProvided, use_gae: Optional[bool] = NotProvided, lambda_: Optional[float] = NotProvided, clip_param: Optional[float] = NotProvided, use_kl_loss: Optional[bool] = NotProvided, kl_coeff: Optional[float] = NotProvided, kl_target: Optional[float] = NotProvided, - tau: Optional[float] = NotProvided, target_network_update_freq: Optional[int] = NotProvided, + tau: Optional[float] = NotProvided, + target_worker_clipping: Optional[float] = NotProvided, + circular_buffer_num_batches: Optional[int] = NotProvided, + circular_buffer_iterations_per_batch: Optional[int] = NotProvided, # Deprecated keys. target_update_frequency=DEPRECATED_VALUE, + use_critic=DEPRECATED_VALUE, **kwargs, ) -> "APPOConfig": """Sets the training related configuration. @@ -172,8 +177,6 @@ def training( Args: vtrace: Whether to use V-trace weighted advantages. If false, PPO GAE advantages will be used instead. - use_critic: Should use a critic as a baseline (otherwise don't use value - baseline; required for using GAE). Only applies if vtrace=False. use_gae: If true, use the Generalized Advantage Estimator (GAE) with a value function, see https://arxiv.org/pdf/1506.02438.pdf. Only applies if vtrace=False. 
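A minimal usage sketch (not part of this diff) of the config surface this PR touches, assuming the new API stack. Only the option names (`max_requests_in_flight_per_learner`, `circular_buffer_num_batches`, `circular_buffer_iterations_per_batch`, `target_network_update_freq`) and their defaults come from the additions above; the environment and learner counts are illustrative only:

from ray.rllib.algorithms.appo import APPOConfig

# Hypothetical wiring of the new options; values mirror the PR's defaults.
config = (
    APPOConfig()
    .environment("CartPole-v1")  # illustrative env only
    .learners(
        num_learners=2,
        # New in this PR: cap on in-flight async requests per Learner actor
        # (default 3, see the TODO above about possibly lifting the cap).
        max_requests_in_flight_per_learner=3,
    )
    .training(
        train_batch_size_per_learner=500,
        # Circular buffer: N batches, each trained on at most K times.
        circular_buffer_num_batches=4,  # N
        circular_buffer_iterations_per_batch=2,  # K
        # Per the docstring added further below in this diff, target nets then
        # sync every freq * N * K * batch size = 2 * 4 * 2 * 500 = 8000
        # trained env steps.
        target_network_update_freq=2,
    )
)
algo = config.build()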
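Likewise, a hypothetical sketch of the new `output_write_remaining_data` recording flag documented in the `offline_data()` changes above; only the option names come from the diff, the algorithm, path, and row counts are made up:

from ray.rllib.algorithms.ppo import PPOConfig

config = (
    PPOConfig()
    .environment("CartPole-v1")  # illustrative env only
    .offline_data(
        output="/tmp/cartpole-records",  # hypothetical output path
        output_max_rows_per_file=1000,
        # While recording, only complete multiples of 1000 rows get written:
        # e.g., 2500 buffered rows -> two 1000-row files, 500 rows held back.
        # This flag flushes those remaining 500 rows when recording stops.
        output_write_remaining_data=True,
    )
)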
@@ -183,9 +186,18 @@ kl_coeff: Coefficient for weighting the KL-loss term. kl_target: Target term for the KL-term to reach (via adjusting the `kl_coeff` automatically). - tau: The factor by which to update the target policy network towards - the current policy network. Can range between 0 and 1. - e.g. updated_param = tau * current_param + (1 - tau) * target_param + target_network_update_freq: NOTE: This parameter is only applicable on + the new API stack. The frequency with which to update the target + policy network from the main trained policy network. The metric + used is `NUM_ENV_STEPS_TRAINED_LIFETIME` and the unit is `n` (see [1] + 4.1.1), where: `n = [circular_buffer_num_batches (N)] * + [circular_buffer_iterations_per_batch (K)] * [train batch size]` + For example, if you set `target_network_update_freq=2`, and N=4, K=2, + and `train_batch_size_per_learner=500`, then the target net is updated + every 2*4*2*500=8000 trained env steps (every 16 batch updates on each + learner). + The authors in [1] suggest that this setting is robust to a range of + choices (try values between 0.125 and 4). target_network_update_freq: The frequency to update the target policy and tune the kl loss coefficients that are used during training. After setting this parameter, the algorithm waits for at least @@ -193,6 +205,20 @@ on before updating the target networks and tune the kl loss coefficients. NOTE: This parameter is only applicable when using the Learner API (enable_rl_module_and_learner=True). + tau: The factor by which to update the target policy network towards + the current policy network. Can range between 0 and 1. + e.g. updated_param = tau * current_param + (1 - tau) * target_param + target_worker_clipping: The maximum value for the target-worker-clipping + used for computing the IS ratio, described in [1] + IS = min(π(i) / π(target), ρ) * (π / π(i)) + circular_buffer_num_batches: The number of train batches that fit + into the circular buffer. Each such train batch can be sampled for + training max. `circular_buffer_iterations_per_batch` times. + circular_buffer_iterations_per_batch: The number of times any train + batch in the circular buffer can be sampled for training. A batch gets + evicted from the buffer either if it's the oldest batch in the buffer + and a new batch is added OR if the batch reaches this max. number of + times it can be sampled. Returns: This updated AlgorithmConfig object. @@ -203,14 +229,19 @@ new="target_network_update_freq", error=True, ) + if use_critic != DEPRECATED_VALUE: + deprecation_warning( + old="use_critic", + help="`use_critic` no longer supported! APPO always uses a value " + "function (critic).", + error=True, + ) # Pass kwargs onto super's `training()` method.
super().training(**kwargs) if vtrace is not NotProvided: self.vtrace = vtrace - if use_critic is not NotProvided: - self.use_critic = use_critic if use_gae is not NotProvided: self.use_gae = use_gae if lambda_ is not NotProvided: @@ -223,13 +254,56 @@ self.kl_coeff = kl_coeff if kl_target is not NotProvided: self.kl_target = kl_target - if tau is not NotProvided: - self.tau = tau if target_network_update_freq is not NotProvided: self.target_network_update_freq = target_network_update_freq + if tau is not NotProvided: + self.tau = tau + if target_worker_clipping is not NotProvided: + self.target_worker_clipping = target_worker_clipping + if circular_buffer_num_batches is not NotProvided: + self.circular_buffer_num_batches = circular_buffer_num_batches + if circular_buffer_iterations_per_batch is not NotProvided: + self.circular_buffer_iterations_per_batch = ( + circular_buffer_iterations_per_batch + ) return self + @override(IMPALAConfig) + def validate(self) -> None: + super().validate() + + # On new API stack, circular buffer should be used, not `minibatch_buffer_size`. + if self.enable_rl_module_and_learner: + if self.minibatch_buffer_size != 1 or self.replay_proportion != 0.0: + raise ValueError( + "`minibatch_buffer_size/replay_proportion` not valid on new API " + "stack with APPO! " + "Use `circular_buffer_num_batches` for the number of train batches " + "in the circular buffer. To change the maximum number of times " + "any batch may be sampled, set " + "`circular_buffer_iterations_per_batch`." + ) + if self.num_multi_gpu_tower_stacks != 1: + raise ValueError( + "`num_multi_gpu_tower_stacks` not supported on new API stack with " + "APPO! In order to train on multi-GPU, use " + "`config.learners(num_learners=[number of GPUs], " + "num_gpus_per_learner=1)`. To scale the throughput of batch-to-GPU-" + "pre-loading on each of your `Learners`, set " + "`num_gpu_loader_threads` to a higher number (recommended values: " + "1-8)." + ) + if self.learner_queue_size != 16: + raise ValueError( + "`learner_queue_size` not supported on new API stack with " + "APPO! In order to set the size of the circular buffer (which acts " + "as a 'learner queue'), use " + "`config.training(circular_buffer_num_batches=..)`. To change the " + "maximum number of times any batch may be sampled, set " + "`config.training(circular_buffer_iterations_per_batch=..)`."
+ ) + @override(IMPALAConfig) def get_default_learner_class(self): if self.framework_str == "torch": diff --git a/rllib/algorithms/appo/appo_learner.py b/rllib/algorithms/appo/appo_learner.py index 7b4cf2b14d8f..431449893264 100644 --- a/rllib/algorithms/appo/appo_learner.py +++ b/rllib/algorithms/appo/appo_learner.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Optional from ray.rllib.algorithms.appo.appo import APPOConfig +from ray.rllib.algorithms.appo.utils import CircularBuffer from ray.rllib.algorithms.impala.impala_learner import IMPALALearner from ray.rllib.core.learner.learner import Learner from ray.rllib.core.learner.utils import update_target_network @@ -11,8 +12,9 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.lambda_defaultdict import LambdaDefaultDict from ray.rllib.utils.metrics import ( + ALL_MODULES, LAST_TARGET_UPDATE_TS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, + NUM_ENV_STEPS_TRAINED_LIFETIME, NUM_MODULE_STEPS_TRAINED, NUM_TARGET_UPDATES, ) @@ -28,6 +30,11 @@ class APPOLearner(IMPALALearner): @override(IMPALALearner) def build(self): + self._learner_thread_in_queue = CircularBuffer( + num_batches=self.config.circular_buffer_num_batches, + iterations_per_batch=self.config.circular_buffer_iterations_per_batch, + ) + super().build() # Make target networks. @@ -80,30 +87,22 @@ def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: """Updates the target Q Networks.""" super().after_gradient_based_update(timesteps=timesteps) - timestep = timesteps.get(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0) - # TODO (sven): Maybe we should have a `after_gradient_based_update` # method per module? + curr_timestep = self.metrics.peek((ALL_MODULES, NUM_ENV_STEPS_TRAINED_LIFETIME)) for module_id, module in self.module._rl_modules.items(): config = self.config.get_config_for_module(module_id) - # TODO (avnish) Using steps trained here instead of sampled ... I'm not sure - # why the other implementation uses sampled. - # The difference in steps sampled/trained is pretty - # much always going to be larger than self.config.num_epochs * - # self.config.minibatch_buffer_size unless the number of steps collected - # is really small. The thing is that the default rollout fragment length - # is 50, so the minibatch buffer size * num_epochs is going to be - # have to be 50 to even meet the threshold of having delayed target - # updates. - # We should instead have the target / kl threshold update be based off - # of the train_batch_size * some target update frequency * num_epochs. - last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS) - if timestep - self.metrics.peek( - last_update_ts_key, default=0 - ) >= config.target_network_update_freq and isinstance( - module.unwrapped(), TargetNetworkAPI + if isinstance(module.unwrapped(), TargetNetworkAPI) and ( + curr_timestep - self.metrics.peek(last_update_ts_key, default=0) + >= ( + config.target_network_update_freq + * config.circular_buffer_num_batches + * config.circular_buffer_iterations_per_batch + * config.total_train_batch_size + / (config.num_learners or 1) + ) ): for ( main_net, @@ -117,7 +116,7 @@ def after_gradient_based_update(self, *, timesteps: Dict[str, Any]) -> None: # Increase lifetime target network update counter by one. self.metrics.log_value((module_id, NUM_TARGET_UPDATES), 1, reduce="sum") # Update the (single-value -> window=1) last updated timestep metric. 
- self.metrics.log_value(last_update_ts_key, timestep, window=1) + self.metrics.log_value(last_update_ts_key, curr_timestep, window=1) if ( config.use_kl_loss diff --git a/rllib/algorithms/appo/appo_rl_module.py b/rllib/algorithms/appo/appo_rl_module.py index a3a34bb37735..178f3d0951fb 100644 --- a/rllib/algorithms/appo/appo_rl_module.py +++ b/rllib/algorithms/appo/appo_rl_module.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List, Tuple from ray.rllib.algorithms.ppo.ppo_rl_module import PPORLModule -from ray.rllib.algorithms.appo.appo import OLD_ACTION_DIST_LOGITS_KEY +from ray.rllib.algorithms.appo.appo import TARGET_ACTION_DIST_LOGITS_KEY from ray.rllib.core.learner.utils import make_target_network from ray.rllib.core.models.base import ACTOR from ray.rllib.core.models.tf.encoder import ENCODER_OUT @@ -32,7 +32,7 @@ def get_target_network_pairs(self) -> List[Tuple[NetworkType, NetworkType]]: def forward_target(self, batch: Dict[str, Any]) -> Dict[str, Any]: old_pi_inputs_encoded = self._old_encoder(batch)[ENCODER_OUT][ACTOR] old_action_dist_logits = self._old_pi(old_pi_inputs_encoded) - return {OLD_ACTION_DIST_LOGITS_KEY: old_action_dist_logits} + return {TARGET_ACTION_DIST_LOGITS_KEY: old_action_dist_logits} @OverrideToImplementCustomLogic_CallToSuperRecommended @override(PPORLModule) diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index d53815989e09..67d585424343 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -1,10 +1,21 @@ +"""Asynchronous Proximal Policy Optimization (APPO) + +The algorithm is described in [1] (under the name of "IMPACT"): + +Detailed documentation: +https://docs.ray.io/en/master/rllib-algorithms.html#appo + +[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks. +Luo et al. 2020 +https://arxiv.org/pdf/1912.00167 +""" from typing import Dict from ray.rllib.algorithms.appo.appo import ( APPOConfig, LEARNER_RESULTS_CURR_KL_COEFF_KEY, LEARNER_RESULTS_KL_KEY, - OLD_ACTION_DIST_LOGITS_KEY, + TARGET_ACTION_DIST_LOGITS_KEY, ) from ray.rllib.algorithms.appo.appo_learner import APPOLearner from ray.rllib.algorithms.impala.torch.impala_torch_learner import IMPALATorchLearner @@ -60,45 +71,49 @@ def compute_loss_for_module( ) action_dist_cls_train = module.get_train_action_dist_cls() - target_policy_dist = action_dist_cls_train.from_logits( - fwd_out[Columns.ACTION_DIST_INPUTS] - ) - old_target_policy_dist = action_dist_cls_train.from_logits( - module.forward_target(batch)[OLD_ACTION_DIST_LOGITS_KEY] - ) - old_target_policy_actions_logp = old_target_policy_dist.logp( - batch[Columns.ACTIONS] + # Policy being trained (current). + current_action_dist = action_dist_cls_train.from_logits( + fwd_out[Columns.ACTION_DIST_INPUTS] ) - behaviour_actions_logp = batch[Columns.ACTION_LOGP] - target_actions_logp = target_policy_dist.logp(batch[Columns.ACTIONS]) - - behaviour_actions_logp_time_major = make_time_major( - behaviour_actions_logp, + current_actions_logp = current_action_dist.logp(batch[Columns.ACTIONS]) + current_actions_logp_time_major = make_time_major( + current_actions_logp, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) + + # Target policy. 
+ target_action_dist = action_dist_cls_train.from_logits( + module.forward_target(batch)[TARGET_ACTION_DIST_LOGITS_KEY] + ) + target_actions_logp = target_action_dist.logp(batch[Columns.ACTIONS]) target_actions_logp_time_major = make_time_major( target_actions_logp, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - old_actions_logp_time_major = make_time_major( - old_target_policy_actions_logp, + + # EnvRunner's policy (behavior). + behavior_actions_logp = batch[Columns.ACTION_LOGP] + behavior_actions_logp_time_major = make_time_major( + behavior_actions_logp, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) + rewards_time_major = make_time_major( batch[Columns.REWARDS], trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) + + assert Columns.VALUES_BOOTSTRAPPED not in batch values_time_major = make_time_major( values, trajectory_len=rollout_frag_or_episode_len, recurrent_seq_len=recurrent_seq_len, ) - assert Columns.VALUES_BOOTSTRAPPED not in batch # Use as bootstrap values the vf-preds in the next "batch row", except # for the very last row (which doesn't have a next row), for which the # bootstrap value does not matter b/c it has a +1ts value at its end @@ -112,61 +127,86 @@ def compute_loss_for_module( dim=0, ) - # The discount factor that is used should be gamma except for timesteps where - # the episode is terminated. In that case, the discount factor should be 0. + # The discount factor that is used should be `gamma * lambda_`, except for + # termination timesteps, in which case the discount factor should be 0. discounts_time_major = ( - 1.0 - - make_time_major( - batch[Columns.TERMINATEDS], - trajectory_len=rollout_frag_or_episode_len, - recurrent_seq_len=recurrent_seq_len, - ).float() - ) * config.gamma + ( + 1.0 + - make_time_major( + batch[Columns.TERMINATEDS], + trajectory_len=rollout_frag_or_episode_len, + recurrent_seq_len=recurrent_seq_len, + ).float() + # See [1] 3.1: Discounts must contain the GAE lambda_ parameter as well. + ) + * config.gamma + * config.lambda_ + ) # Note that vtrace will compute the main loop on the CPU for better performance. vtrace_adjusted_target_values, pg_advantages = vtrace_torch( - target_action_log_probs=old_actions_logp_time_major, - behaviour_action_log_probs=behaviour_actions_logp_time_major, + # See [1] 3.1: For AˆV-GAE, the ratios used are: min(c¯, π(target)/π(i)) + # π(target) + target_action_log_probs=target_actions_logp_time_major, + # π(i) + behaviour_action_log_probs=behavior_actions_logp_time_major, + # See [1] 3.1: Discounts must contain the GAE lambda_ parameter as well. discounts=discounts_time_major, rewards=rewards_time_major, values=values_time_major, bootstrap_values=bootstrap_values, - clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, + # c¯ clip_rho_threshold=config.vtrace_clip_rho_threshold, + # c¯ (but we allow users to distinguish between c¯ used for + # value estimates and c¯ used for the advantages. + clip_pg_rho_threshold=config.vtrace_clip_pg_rho_threshold, ) pg_advantages = pg_advantages * loss_mask_time_major - # The policy gradients loss. - is_ratio = torch.clip( - torch.exp(behaviour_actions_logp_time_major - old_actions_logp_time_major), + # The policy gradient loss. + # As described in [1], use a logp-ratio of: + # min(π(i) / π(target), ρ) * (π / π(i)), where .. 
+        # - π are the action probs from the current (learner) policy
+        # - π(i) are the action probs from the ith EnvRunner
+        # - π(target) are the action probs from the target network
+        # - ρ is the "target-worker clipping" (2.0 in the paper)
+        target_worker_is_ratio = torch.clip(
+            torch.exp(
+                behavior_actions_logp_time_major - target_actions_logp_time_major
+            ),
             0.0,
-            2.0,
+            config.target_worker_clipping,
         )
-        logp_ratio = is_ratio * torch.exp(
-            target_actions_logp_time_major - behaviour_actions_logp_time_major
+        target_worker_logp_ratio = target_worker_is_ratio * torch.exp(
+            current_actions_logp_time_major - behavior_actions_logp_time_major
         )
-
         surrogate_loss = torch.minimum(
-            pg_advantages * logp_ratio,
+            pg_advantages * target_worker_logp_ratio,
             pg_advantages
-            * torch.clip(logp_ratio, 1 - config.clip_param, 1 + config.clip_param),
+            * torch.clip(
+                target_worker_logp_ratio,
+                1 - config.clip_param,
+                1 + config.clip_param,
+            ),
         )
+        mean_pi_loss = -(torch.sum(surrogate_loss) / size_loss_mask)

+        # Compute the KL loss (if required): KL divergence between the current
+        # action dist. and the target action dist.
         if config.use_kl_loss:
-            action_kl = old_target_policy_dist.kl(target_policy_dist) * loss_mask
+            action_kl = target_action_dist.kl(current_action_dist) * loss_mask
             mean_kl_loss = torch.sum(action_kl) / size_loss_mask
         else:
             mean_kl_loss = 0.0

-        mean_pi_loss = -(torch.sum(surrogate_loss) / size_loss_mask)
-
-        # The baseline loss.
+        # Compute the value function loss.
         delta = values_time_major - vtrace_adjusted_target_values
         vf_loss = 0.5 * torch.sum(torch.pow(delta, 2.0) * loss_mask_time_major)
         mean_vf_loss = vf_loss / size_loss_mask

-        # The entropy loss.
+        # Compute the entropy loss.
         mean_entropy_loss = (
-            -torch.sum(target_policy_dist.entropy() * loss_mask) / size_loss_mask
+            -torch.sum(current_action_dist.entropy() * loss_mask) / size_loss_mask
         )

         # The summed weighted loss.
diff --git a/rllib/algorithms/appo/utils.py b/rllib/algorithms/appo/utils.py
index cbd2efe82161..9a4f1e66d0a9 100644
--- a/rllib/algorithms/appo/utils.py
+++ b/rllib/algorithms/appo/utils.py
@@ -1,12 +1,99 @@
+"""
+[1] IMPACT: Importance Weighted Asynchronous Architectures with Clipped Target Networks.
+Luo et al. 2020
+https://arxiv.org/pdf/1912.00167
+"""
+from collections import deque
+import random
+import threading
+import time
+
 from ray.rllib.models.catalog import ModelCatalog
 from ray.rllib.models.modelv2 import ModelV2
+from ray.rllib.utils.annotations import OldAPIStack

 POLICY_SCOPE = "func"
 TARGET_POLICY_SCOPE = "target_func"

-# TODO (sven): Deprecate once APPO and IMPALA fully on RLModules/Learner APIs.
+class CircularBuffer:
+    """A circular batch-wise buffer as described in [1] for APPO.
+
+    The buffer holds at most N batches, which are sampled at random (uniformly).
+    If full and a new batch is added, the oldest batch is discarded. Also, each batch
+    currently in the buffer can be sampled at most K times (after which it is also
+    discarded).
+    """
+
+    def __init__(self, num_batches: int, iterations_per_batch: int):
+        # N from the paper (buffer size).
+        self.num_batches = num_batches
+        # K ("replay coefficient") from the paper.
+        self.iterations_per_batch = iterations_per_batch
+
+        self._buffer = deque(maxlen=self.num_batches)
+        self._lock = threading.Lock()
+
+        # The number of valid (not expired) entries in this buffer.
+        self._num_valid_batches = 0
+
+    def add(self, batch):
+        dropped_entry = None
+        dropped_ts = 0
+
+        # Add the batch (with its k=0 sample count) to the deque.
+        with self._lock:
+            len_ = len(self._buffer)
+            if len_ == self.num_batches:
+                dropped_entry = self._buffer[0]
+            self._buffer.append([batch, 0])
+            self._num_valid_batches += 1
+
+        # A valid entry (w/ a batch whose k has not reached K yet) was dropped.
+        if dropped_entry is not None and dropped_entry[0] is not None:
+            dropped_ts += dropped_entry[0].env_steps() * (
+                self.iterations_per_batch - dropped_entry[1]
+            )
+            self._num_valid_batches -= 1
+
+        return dropped_ts
+
+    def sample(self):
+        k = entry = batch = None
+
+        while True:
+            # Only initially, the buffer may be empty -> Just wait for some time.
+            if len(self) == 0:
+                time.sleep(0.001)
+                continue
+            # Sample a random buffer index.
+            with self._lock:
+                entry = self._buffer[random.randint(0, len(self._buffer) - 1)]
+                batch, k = entry
+            # Ignore batches that have already been invalidated.
+            if batch is not None:
+                break

+        # Increase this batch's sample count (k) by 1.
+        assert k is not None
+        entry[1] += 1
+
+        # This batch has been exhausted (k == K) -> Invalidate it in the buffer.
+        if k == self.iterations_per_batch - 1:
+            entry[0] = None
+            entry[1] = None
+            self._num_valid_batches -= 1
+
+        # Return the sampled batch.
+        return batch
+
+    def __len__(self) -> int:
+        """Returns the number of actually valid (non-expired) batches in the buffer."""
+        return self._num_valid_batches
+
+
+@OldAPIStack
 def make_appo_models(policy) -> ModelV2:
     """Builds model and target model for APPO.

diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py
index 69f140fda775..862c2cf84388 100644
--- a/rllib/algorithms/impala/impala.py
+++ b/rllib/algorithms/impala/impala.py
@@ -80,12 +80,16 @@ class IMPALAConfig(AlgorithmConfig):
     .. testcode::

         from ray.rllib.algorithms.impala import IMPALAConfig
-        config = IMPALAConfig()
-        config = config.training(lr=0.0003, train_batch_size_per_learner=512)
-        config = config.learners(num_learners=1)
-        config = config.env_runners(num_env_runners=1)
+
+        config = (
+            IMPALAConfig()
+            .environment("CartPole-v1")
+            .env_runners(num_env_runners=1)
+            .training(lr=0.0003, train_batch_size_per_learner=512)
+            .learners(num_learners=1)
+        )
         # Build an Algorithm object from the config and run 1 training iteration.
-        algo = config.build(env="CartPole-v1")
+        algo = config.build()
         algo.train()
         del algo
@@ -94,16 +98,14 @@ class IMPALAConfig(AlgorithmConfig):
         from ray.rllib.algorithms.impala import IMPALAConfig
         from ray import air
         from ray import tune
-        config = IMPALAConfig()
-        # Update the config object.
-        config = config.training(
-            lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0
+        config = (
+            IMPALAConfig()
+            .environment("CartPole-v1")
+            .env_runners(num_env_runners=1)
+            .training(lr=tune.grid_search([0.0001, 0.0002]), grad_clip=20.0)
+            .learners(num_learners=1)
         )
-        config = config.learners(num_learners=1)
-        config = config.env_runners(num_env_runners=1)
-        # Set the config object's env.
-        config = config.environment(env="CartPole-v1")
         # Run with tune.
         tune.Tuner(
             "IMPALA",
@@ -146,8 +148,6 @@ def __init__(self, algo_class=None):
         self.broadcast_interval = 1
         self.num_aggregation_workers = 0
         self.num_gpu_loader_threads = 8
-        # IMPALA takes care of its own EnvRunner (weights, connector, metrics) synching.
-        self._dont_auto_sync_env_runner_states = True
         self.grad_clip = 40.0
         # Note: Only when using enable_rl_module_and_learner=True can the clipping mode
@@ -168,6 +168,9 @@ def __init__(self, algo_class=None):
         # __sphinx_doc_end__
         # fmt: on

+        # IMPALA takes care of its own EnvRunner (weights, connector, metrics) synching.
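
A short usage sketch of the `CircularBuffer` defined above. The stub batch class is made up here for self-containment; real callers pass `MultiAgentBatch`es, whose `env_steps()` the buffer uses to count dropped timesteps:

    from ray.rllib.algorithms.appo.utils import CircularBuffer

    class _StubBatch:
        """Hypothetical stand-in for a MultiAgentBatch."""
        def env_steps(self):
            return 50

    buf = CircularBuffer(num_batches=2, iterations_per_batch=2)  # N=2, K=2
    buf.add(_StubBatch())
    buf.add(_StubBatch())
    # Buffer is full -> adding again evicts the oldest batch; its 2 unused
    # iterations x 50 env steps = 100 dropped timesteps are returned.
    dropped = buf.add(_StubBatch())
    print(dropped, len(buf))  # -> 100 2
    batch = buf.sample()  # uniform-random; a batch expires after K samples
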
+ self._dont_auto_sync_env_runner_states = True + self.lr_schedule = None # @OldAPIStack self.entropy_coeff_schedule = None # @OldAPIStack self.num_multi_gpu_tower_stacks = 1 # @OldAPIstack @@ -181,7 +184,6 @@ def __init__(self, algo_class=None): self.epsilon = 0.1 # @OldAPIstack self._separate_vf_optimizer = False # @OldAPIstack self._lr_vf = 0.0005 # @OldAPIstack - self.train_batch_size = 500 # @OldAPIstack self.num_gpus = 1 # @OldAPIstack self._tf_policy_handles_more_than_one_loss = True # @OldAPIstack diff --git a/rllib/algorithms/impala/impala_learner.py b/rllib/algorithms/impala/impala_learner.py index c38315d543b7..1929f9f010d6 100644 --- a/rllib/algorithms/impala/impala_learner.py +++ b/rllib/algorithms/impala/impala_learner.py @@ -3,11 +3,12 @@ import queue import threading import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Union import tree # pip install dm_tree import ray +from ray.rllib.algorithms.appo.utils import CircularBuffer from ray.rllib.algorithms.impala.impala import LEARNER_RESULTS_CURR_ENTROPY_COEFF_KEY from ray.rllib.core.columns import Columns from ray.rllib.core.learner.learner import Learner @@ -71,7 +72,7 @@ def build(self) -> None: ): self._learner_connector.prepend(AddOneTsToEpisodesAndTruncate()) # Leave all batches on the CPU (they'll be moved to the GPU, if applicable, - # by the n GPU loader threads. + # by the n GPU loader threads). numpy_to_tensor_connector = self._learner_connector[NumpyToTensor][0] numpy_to_tensor_connector._device = "cpu" # TODO (sven): Provide API? @@ -80,7 +81,9 @@ def build(self) -> None: # on the "update queue" for the actual RLModule forward pass and loss # computations. self._gpu_loader_in_queue = queue.Queue() - self._learner_thread_in_queue = deque(maxlen=self.config.learner_queue_size) + # Default is to have a learner thread. + if not hasattr(self, "_learner_thread_in_queue"): + self._learner_thread_in_queue = deque(maxlen=self.config.learner_queue_size) self._learner_thread_out_queue = queue.Queue() # Create and start the GPU loader thread(s). @@ -103,9 +106,6 @@ def build(self) -> None: in_queue=self._learner_thread_in_queue, out_queue=self._learner_thread_out_queue, metrics_logger=self.metrics, - num_epochs=self.config.num_epochs, - minibatch_size=self.config.minibatch_size, - shuffle_batch_per_epoch=self.config.shuffle_batch_per_epoch, ) self._learner_thread.start() @@ -115,13 +115,6 @@ def update_from_episodes( episodes: List[EpisodeType], *, timesteps: Dict[str, Any], - # TODO (sven): Deprecate these in favor of config attributes for only those - # algos that actually need (and know how) to do minibatching. - minibatch_size: Optional[int] = None, - num_epochs: int = 1, - shuffle_batch_per_epoch: bool = False, - num_total_minibatches: int = 0, - reduce_fn=None, # Deprecated args. **kwargs, ) -> ResultDict: self.metrics.set_value( @@ -175,15 +168,25 @@ def update_from_episodes( self._gpu_loader_in_queue.qsize(), ) else: - # Enqueue to Learner thread's in-queue. - _LearnerThread.enqueue( - self._learner_thread_in_queue, - MultiAgentBatch( - {mid: SampleBatch(b) for mid, b in batch.items()}, - env_steps=env_steps, - ), - self.metrics, + ma_batch = MultiAgentBatch( + {mid: SampleBatch(b) for mid, b in batch.items()}, + env_steps=env_steps, ) + # Add the batch directly to the circular buffer. 
+ if isinstance(self._learner_thread_in_queue, CircularBuffer): + ts_dropped = self._learner_thread_in_queue.add(ma_batch) + self.metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + ts_dropped, + reduce="sum", + ) + else: + # Enqueue to Learner thread's in-queue. + _LearnerThread.enqueue( + self._learner_thread_in_queue, + ma_batch, + self.metrics, + ) # Return all queued result dicts thus far (after reducing over them). results = {} @@ -263,8 +266,17 @@ def _step(self) -> None: policy_batches={mid: SampleBatch(b) for mid, b in batch_on_gpu.items()}, env_steps=env_steps, ) - # Enqueue to Learner thread's in-queue. - _LearnerThread.enqueue(self._out_queue, ma_batch_on_gpu, self.metrics) + + if isinstance(self._out_queue, CircularBuffer): + ts_dropped = self._out_queue.add(ma_batch_on_gpu) + self.metrics.log_value( + (ALL_MODULES, LEARNER_THREAD_ENV_STEPS_DROPPED), + ts_dropped, + reduce="sum", + ) + else: + # Enqueue to Learner thread's in-queue. + _LearnerThread.enqueue(self._out_queue, ma_batch_on_gpu, self.metrics) class _LearnerThread(threading.Thread): @@ -275,9 +287,6 @@ def __init__( in_queue: deque, out_queue: queue.Queue, metrics_logger, - num_epochs, - minibatch_size, - shuffle_batch_per_epoch, ): super().__init__() self.daemon = True @@ -285,13 +294,9 @@ def __init__( self.stopped = False self._update_method = update_method - self._in_queue: deque = in_queue + self._in_queue: Union[deque, CircularBuffer] = in_queue self._out_queue: queue.Queue = out_queue - self._num_epochs = num_epochs - self._minibatch_size = minibatch_size - self._shuffle_batch_per_epoch = shuffle_batch_per_epoch - def run(self) -> None: while not self.stopped: self.step() @@ -299,14 +304,19 @@ def run(self) -> None: def step(self): # Get a new batch from the GPU-data (deque.pop -> newest item first). with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_IN_QUEUE_WAIT_TIMER)): - if not self._in_queue: - time.sleep(0.001) - return - # Consume from the left (oldest batches first). - # If we consumed from the right, we would run into the danger of learning - # from newer batches (left side) most times, BUT sometimes grabbing a - # really old batches (right area of deque). - ma_batch_on_gpu = self._in_queue.popleft() + # Get a new batch from the GPU-data (learner queue OR circular buffer). + if isinstance(self._in_queue, CircularBuffer): + ma_batch_on_gpu = self._in_queue.sample() + else: + # Queue is empty: Sleep a tiny bit to avoid CPU-thrashing. + if not self._in_queue: + time.sleep(0.001) + return + # Consume from the left (oldest batches first). + # If we consumed from the right, we would run into the danger of + # learning from newer batches (left side) most times, BUT sometimes + # grabbing older batches (right area of deque). + ma_batch_on_gpu = self._in_queue.popleft() # Call the update method on the batch. 
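
For the plain-deque path, a small pure-Python illustration of the two properties the comments above rely on: `maxlen` silently drops the oldest entries on append, and `popleft()` consumes oldest-first:

    from collections import deque

    q = deque(maxlen=3)
    for i in range(5):
        q.append(i)      # entries 0 and 1 get pushed out once the deque is full
    print(list(q))       # -> [2, 3, 4]
    print(q.popleft())   # -> 2 (the oldest remaining batch is consumed first)
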
with self.metrics.log_time((ALL_MODULES, LEARNER_THREAD_UPDATE_TIMER)): @@ -321,9 +331,6 @@ def step(self): (ALL_MODULES, NUM_ENV_STEPS_SAMPLED_LIFETIME), default=0 ) }, - num_epochs=self._num_epochs, - minibatch_size=self._minibatch_size, - shuffle_batch_per_epoch=self._shuffle_batch_per_epoch, ) # We have to deepcopy the results dict, b/c we must avoid having a returned # Stats object sit in the queue and getting a new (possibly even tensor) diff --git a/rllib/algorithms/impala/vtrace_torch.py b/rllib/algorithms/impala/vtrace_torch.py index 35d88822ca89..b63a5181c7ac 100644 --- a/rllib/algorithms/impala/vtrace_torch.py +++ b/rllib/algorithms/impala/vtrace_torch.py @@ -228,6 +228,7 @@ def multi_from_logits( behaviour_action_log_probs, device="cpu" ) behaviour_action_log_probs = force_list(behaviour_action_log_probs) + # log_rhos = target_logp - behavior_logp log_rhos = get_log_rhos(target_action_log_probs, behaviour_action_log_probs) vtrace_returns = from_importance_weights( diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index c26cd6a22a94..5c3aa575ea9e 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -1134,6 +1134,9 @@ def _finalize_fn(batch: Dict[str, numpy.ndarray]) -> Dict[str, Any]: fwd_out, loss_per_module, tensor_metrics = self._update( batch.policy_batches ) + # Convert logged tensor metrics (logged during tensor-mode of MetricsLogger) + # to actual (numpy) values. + self.metrics.tensors_to_numpy(tensor_metrics) self._set_slicing_by_batch_id(batch, value=False) # If `num_iters` is reached break and return. @@ -1143,9 +1146,6 @@ def _finalize_fn(batch: Dict[str, numpy.ndarray]) -> Dict[str, Any]: logger.info( f"===> [Learner {id(self)}] number of iterations run in this epoch: {i}" ) - # Convert logged tensor metrics (logged during tensor-mode of MetricsLogger) - # to actual (numpy) values. - self.metrics.tensors_to_numpy(tensor_metrics) # Log all individual RLModules' loss terms and its registered optimizers' # current learning rates. @@ -1350,15 +1350,6 @@ def _update_from_batch_or_episodes( {next(iter(self.module.keys())): batch}, env_steps=len(batch) ) - # TODO (sven): Remove this leftover hack here for the situation in which we - # did not go through the learner connector. - # Options: - # a) Either also pass given batches through the learner connector (even if - # episodes is None). (preferred solution) - # b) Get rid of the option to pass in a batch altogether. - # if episodes is None: - # batch = self._convert_batch_type(batch) - # Check the MultiAgentBatch, whether our RLModule contains all ModuleIDs # found in this batch. If not, throw an error. unknown_module_ids = set(batch.policy_batches.keys()) - set(self.module.keys()) diff --git a/rllib/core/learner/learner_group.py b/rllib/core/learner/learner_group.py index 31994fa5dcce..a80de4cd2e76 100644 --- a/rllib/core/learner/learner_group.py +++ b/rllib/core/learner/learner_group.py @@ -171,11 +171,9 @@ def __init__( self._worker_manager = FaultTolerantActorManager( self._workers, - # TODO (sven): This probably works even without any restriction - # (allowing for any arbitrary number of requests in-flight). Test with - # 3 first, then with unlimited, and if both show the same behavior on - # an async algo, remove this restriction entirely. - max_remote_requests_in_flight_per_actor=3, + max_remote_requests_in_flight_per_actor=( + self.config.max_requests_in_flight_per_learner + ), ) # Counters for the tags for asynchronous update requests that are # in-flight. 
Used for keeping track of and grouping together the results of
diff --git a/rllib/core/learner/torch/torch_learner.py b/rllib/core/learner/torch/torch_learner.py
index c8cdff3b8ca3..e04a8b491c9a 100644
--- a/rllib/core/learner/torch/torch_learner.py
+++ b/rllib/core/learner/torch/torch_learner.py
@@ -147,23 +147,7 @@ def _uncompiled_update(
         # Activate tensor-mode on our MetricsLogger.
         self.metrics.activate_tensor_mode()

-        # Log off-policy'ness of this update.
-        off_policyness = {
-            (mid, DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY): (
-                (self._weights_seq_no - module_batch[WEIGHTS_SEQ_NO]).float()
-            )
-            for mid, module_batch in batch.items()
-            if WEIGHTS_SEQ_NO in module_batch
-        }
-        for key in off_policyness.keys():
-            mid = key[0]
-            if Columns.LOSS_MASK not in batch[mid]:
-                off_policyness[key] = torch.mean(off_policyness[key])
-            else:
-                mask = batch[mid][Columns.LOSS_MASK]
-                num_valid = torch.sum(mask)
-                off_policyness[key] = torch.sum(off_policyness[key][mask]) / num_valid
-        self.metrics.log_dict(off_policyness, window=1)
+        self._compute_off_policyness(batch)

         fwd_out = self.module.forward_train(batch)
         loss_per_module = self.compute_losses(fwd_out=fwd_out, batch=batch)
@@ -630,6 +614,25 @@ def _map_module_to_device(self, module: MultiRLModule) -> None:
             if isinstance(module[key], torch.nn.Module):
                 module[key].to(self._device)

+    def _compute_off_policyness(self, batch):
+        # Log off-policy'ness of this batch wrt the current weights.
+        off_policyness = {
+            (mid, DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY): (
+                (self._weights_seq_no - module_batch[WEIGHTS_SEQ_NO]).float()
+            )
+            for mid, module_batch in batch.items()
+            if WEIGHTS_SEQ_NO in module_batch
+        }
+        for key in off_policyness.keys():
+            mid = key[0]
+            if Columns.LOSS_MASK not in batch[mid]:
+                off_policyness[key] = torch.mean(off_policyness[key])
+            else:
+                mask = batch[mid][Columns.LOSS_MASK]
+                num_valid = torch.sum(mask)
+                off_policyness[key] = torch.sum(off_policyness[key][mask]) / num_valid
+        self.metrics.log_dict(off_policyness, window=1)
+
     @override(Learner)
     def _get_tensor_variable(
         self, value, dtype=None, trainable=False
diff --git a/rllib/examples/envs/custom_gym_env.py b/rllib/examples/envs/custom_gym_env.py
index 01fa5ecc452f..2612575adb63 100644
--- a/rllib/examples/envs/custom_gym_env.py
+++ b/rllib/examples/envs/custom_gym_env.py
@@ -45,6 +45,8 @@
 | 18.3034          | 28000 | 0.908918 | 12.9676            |
 +------------------+-------+----------+--------------------+
 """
+# These tags allow extracting portions of this script on Anyscale.
+# ws-template-imports-start
 import gymnasium as gym
 from gymnasium.spaces import Discrete, Box
 import numpy as np
@@ -52,6 +54,8 @@
 from typing import Optional

+# ws-template-imports-end
+
 from ray.rllib.utils.test_utils import (
     add_rllib_example_script_args,
     run_rllib_example_script_experiment,
@@ -71,6 +75,8 @@
 )

+# These tags allow extracting portions of this script on Anyscale.
+# ws-template-code-start
 class SimpleCorridor(gym.Env):
     """Example of a custom env in which the agent has to walk down a corridor.
@@ -126,6 +132,8 @@ def step(self, action):
     )

+# ws-template-code-end
+
 if __name__ == "__main__":
     args = parser.parse_args()

diff --git a/rllib/examples/offline_rl/cartpole_recording.py b/rllib/examples/offline_rl/cartpole_recording.py
new file mode 100644
index 000000000000..42258ac46fe0
--- /dev/null
+++ b/rllib/examples/offline_rl/cartpole_recording.py
@@ -0,0 +1,163 @@
+"""Example showing how to record expert data from a trained policy.
+
+This example:
+    - demonstrates how you can train a single-agent expert PPO Policy (RLModule)
+    and checkpoint it.
+    - shows how you can then record expert data from the trained PPO Policy to
+    disk during evaluation.
+
+How to run this script
+----------------------
+`python [script file name].py --checkpoint-at-end`
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+For logging to your WandB account, use:
+`--wandb-key=[your WandB API key] --wandb-project=[some project name]
+--wandb-run-name=[optional: WandB run name (within the defined project)]`
+
+Results to expect
+-----------------
+In the console output, you can see that the episode return of 350.0 is reached
+before the timestep stop criterion is hit. Afterwards, evaluation starts and
+runs for 10 iterations while recording the data. The number of recorded
+experiences may differ from one evaluation run to the next, because evaluation
+`EnvRunner`s sample entire episodes while recording timesteps, and episodes
+usually contain different numbers of timesteps. Note that this is different
+when recording episodes - in that case, each row is one episode.
+
++-----------------------------+------------+----------------------+
+| Trial name                  | status     | loc                  |
+|                             |            |                      |
+|-----------------------------+------------+----------------------+
+| PPO_CartPole-v1_df83f_00000 | TERMINATED | 192.168.0.119:233661 |
++-----------------------------+------------+----------------------+
++--------+------------------+------------------------+------------------------+
+|   iter |   total time (s) |   num_training_step_ca |   num_env_steps_sample |
+|        |                  |      lls_per_iteration |             d_lifetime |
++--------+------------------+------------------------+------------------------|
+|     21 |          25.9162 |                      1 |                  84000 |
++--------+------------------+------------------------+------------------------+
+
+...
+
+Number of experiences recorded: 26644
+"""
+
+import ray
+
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core import COMPONENT_RL_MODULE
+from ray.rllib.core.columns import Columns
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.utils.metrics import (
+    ENV_RUNNER_RESULTS,
+    EPISODE_RETURN_MEAN,
+    EVALUATION_RESULTS,
+    NUM_ENV_STEPS_SAMPLED_LIFETIME,
+)
+from ray.rllib.utils.test_utils import add_rllib_example_script_args
+
+parser = add_rllib_example_script_args(
+    default_timesteps=200000,
+    default_reward=350.0,
+)
+parser.set_defaults(checkpoint_at_end=True, max_concurrent_trials=1)
+# Use `parser` to add your own custom command line options to this script
+# and (if needed) use their values to set up `config` below.
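
One note on consuming such a recording later: the training-side config has to mirror the compression settings used while writing. A minimal sketch, assuming BC as the consumer (the algo choice and path here are illustrative, not part of this example script):

    from ray.rllib.algorithms.bc import BCConfig
    from ray.rllib.core.columns import Columns

    bc_config = (
        BCConfig()
        .environment("CartPole-v1")
        .offline_data(
            input_="local:///tmp/cartpole/",
            # Must mirror `output_compress_columns` used while recording.
            input_compress_columns=[Columns.OBS, Columns.ACTIONS],
        )
    )
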
+args = parser.parse_args() + +config = ( + PPOConfig() + .env_runners( + num_env_runners=5, + ) + .environment("CartPole-v1") + .rl_module( + model_config=DefaultModelConfig( + fcnet_hiddens=[32], + fcnet_activation="linear", + vf_share_layers=True, + ), + ) + .training( + lr=0.0003, + num_epochs=6, + vf_loss_coeff=0.01, + ) + .evaluation( + evaluation_num_env_runners=1, + evaluation_interval=1, + evaluation_parallel_to_training=True, + evaluation_config=PPOConfig.overrides(explore=False), + ) +) + +stop = { + f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": args.stop_timesteps, + f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": ( + args.stop_reward + ), +} + + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + results = run_rllib_example_script_experiment(config, args, stop=stop) + + # Store the best checkpoint for recording. + best_checkpoint = results.get_best_result( + metric=f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}", + mode="max", + ).checkpoint.path + + # Configure the algorithm for offline recording. + config.offline_data( + output="local:///tmp/cartpole/", + # Store columnar (tabular) data. + output_write_episodes=False, + # Each file should hold 1,000 rows. + output_max_rows_per_file=1000, + output_write_remaining_data=True, + # LZ4-compress columns 'obs', 'new_obs', and 'actions' to + # save disk space and increase performance. Note, this means + # that you have to use `input_compress_columns` in the same + # way when using the data for training in `RLlib`. + output_compress_columns=[Columns.OBS, Columns.ACTIONS], + ) + # Change the evaluation settings to sample exactly 50 episodes + # per evaluation iteration and increase the number of evaluation + # env-runners to 5. + config.evaluation( + evaluation_num_env_runners=5, + evaluation_duration=50, + evaluation_duration_unit="episodes", + evaluation_interval=1, + evaluation_parallel_to_training=False, + evaluation_config=PPOConfig.overrides(explore=False), + ) + + # Build the algorithm for evaluation. + algo = config.build() + # Load the checkpoint stored above. + algo.restore_from_path( + best_checkpoint, + component=COMPONENT_RL_MODULE, + ) + + # Evaluate over 10 iterations and record the data. + for i in range(10): + print(f"Iteration: {i + 1}:\n") + res = algo.evaluate() + print(res) + + # Stop the algorithm. + algo.stop() + + # Check the number of rows in the dataset. + ds = ray.data.read_parquet("local:///tmp/cartpole") + print(f"Number of experiences recorded: {ds.count()}") diff --git a/rllib/offline/offline_env_runner.py b/rllib/offline/offline_env_runner.py index 9da38b60bd6a..a0fee273c007 100644 --- a/rllib/offline/offline_env_runner.py +++ b/rllib/offline/offline_env_runner.py @@ -29,6 +29,13 @@ def __init__(self, config: AlgorithmConfig, **kwargs): # Initialize the parent. super().__init__(config, **kwargs) + # Get the data context for this `EnvRunner`. + data_context = ray.data.DataContext.get_current() + # Limit the resources for Ray Data to the CPUs given to this `EnvRunner`. + data_context.execution_options.resource_limits.cpu = ( + config.num_cpus_per_env_runner + ) + # Set the output write method. self.output_write_method = self.config.output_write_method self.output_write_method_kwargs = self.config.output_write_method_kwargs @@ -92,6 +99,10 @@ def __init__(self, config: AlgorithmConfig, **kwargs): else: self.write_data_this_iter = True + # If the remaining data should be stored. 
Note, this is only
+        # relevant if `output_max_rows_per_file` is defined.
+        self.write_remaining_data = self.config.output_write_remaining_data
+
         # Counts how often `sample` is called to define the output path for
         # each file.
         self._sample_counter = 0
@@ -155,15 +166,18 @@ def sample(
             if self.output_max_rows_per_file:
                 # Reset the event.
                 self.write_data_this_iter = False
-
-                # Extract the number of samples to be written to disk this iteration.
-                samples_to_write = self._samples[: self.output_max_rows_per_file]
-                # Reset the buffer to the remaining data. This only makes sense, if
-                # `rollout_fragment_length` is smaller `output_max_rows_per_file` or
-                # a 2 x `output_max_rows_per_file`.
-                # TODO (simon): Find a better way to write these data.
-                self._samples = self._samples[self.output_max_rows_per_file :]
-                samples_ds = ray.data.from_items(samples_to_write)
+                # Ensure that all data ready to be written is released from
+                # the buffer. Note, this is important in case many episodes
+                # were sampled and `output_max_rows_per_file` is relatively
+                # small.
+                while len(self._samples) >= self.output_max_rows_per_file:
+                    # Extract the number of samples to be written to disk this
+                    # iteration.
+                    samples_to_write = self._samples[: self.output_max_rows_per_file]
+                    # Reset the buffer to the remaining data. This only makes sense
+                    # if `rollout_fragment_length` is smaller than
+                    # `output_max_rows_per_file` or at most
+                    # 2 x `output_max_rows_per_file`.
+                    self._samples = self._samples[self.output_max_rows_per_file :]
+                    samples_ds = ray.data.from_items(samples_to_write)
             # Otherwise, write the complete data.
             else:
                 samples_ds = ray.data.from_items(self._samples)
@@ -183,6 +197,11 @@ def sample(
             except Exception as e:
                 logger.error(e)

+        self.metrics.log_value(
+            key="recording_buffer_size",
+            value=len(self._samples),
+        )
+
         # Finally return the samples as usual.
         return samples
@@ -196,11 +215,11 @@ def stop(self) -> None:
         """
         # If there are samples left over, we have to write them to a dataset.
-        if self._samples:
+        if self._samples and self.write_remaining_data:
             # Convert them to a `ray.data.Dataset`.
             samples_ds = ray.data.from_items(self._samples)
             # Increase the sample counter for the folder/file name.
-            self._sample_counter += 1.0
+            self._sample_counter += 1
             # Try to write the dataset to disk/cloud storage.
             try:
                 # Setup the path for writing data. Each run will be written to
diff --git a/rllib/offline/offline_prelearner.py b/rllib/offline/offline_prelearner.py
index b000f2c965fc..f5ffca03e75a 100644
--- a/rllib/offline/offline_prelearner.py
+++ b/rllib/offline/offline_prelearner.py
@@ -1,10 +1,10 @@
 import gymnasium as gym
 import logging
 import numpy as np
-import random
+import uuid
+
 from typing import Any, Dict, List, Optional, Union, Set, Tuple, TYPE_CHECKING

-import ray
 from ray.actor import ActorHandle
 from ray.rllib.core.columns import Columns
 from ray.rllib.core.learner import Learner
@@ -86,8 +86,8 @@ def __init__(
         self,
         config: "AlgorithmConfig",
         learner: Union[Learner, list[ActorHandle]],
+        locality_hints: Optional[List[str]] = None,
         spaces: Optional[Tuple[gym.Space, gym.Space]] = None,
-        locality_hints: Optional[list] = None,
         module_spec: Optional[MultiRLModuleSpec] = None,
         module_state: Optional[Dict[ModuleID, Any]] = None,
     ):
@@ -103,24 +103,6 @@ def __init__(
             self._module = self._learner._module
         # Otherwise we have remote `Learner`s.
         else:
-            # TODO (simon): Check with the data team how to get at
-            # initialization the data block location.
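
The `while` loop in `sample()` above, in miniature (pure Python, toy numbers): every full chunk of `output_max_rows_per_file` rows is flushed, and only the remainder stays buffered:

    samples = list(range(10))  # pretend 10 buffered rows
    max_rows = 4
    files = []
    while len(samples) >= max_rows:
        files.append(samples[:max_rows])  # write one file of max_rows rows
        samples = samples[max_rows:]      # keep the remainder buffered
    print(files)    # -> [[0, 1, 2, 3], [4, 5, 6, 7]]
    print(samples)  # -> [8, 9]  (written on stop(), if so configured)
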
- node_id = ray.get_runtime_context().get_node_id() - # Shuffle indices such that not each data block syncs weights - # with the same learner in case there are multiple learners - # on the same node like the `PreLearner`. - indices = list(range(len(locality_hints))) - random.shuffle(indices) - locality_hints = [locality_hints[i] for i in indices] - learner = [learner[i] for i in indices] - # Choose a learner from the same node. - for i, hint in enumerate(locality_hints): - if hint == node_id: - self._learner = learner[i] - # If no learner has been chosen, there is none on the same node. - if not self._learner: - # Then choose a learner randomly. - self._learner = learner[random.randint(0, len(learner) - 1)] self.learner_is_remote = True # Build the module from spec. Note, this will be a MultiRLModule. self._module = module_spec.build() @@ -525,21 +507,83 @@ def _map_sample_batch_to_episode( # TODO (simon): Add support for multi-agent episodes. NotImplementedError else: - # Unpack observations, if needed. - obs = ( - unpack_if_needed(obs.tolist()) - if schema[Columns.OBS] in input_compress_columns - else obs.tolist() - ) - # Append the last `new_obs` to get the correct length of observations. - obs.append( - unpack_if_needed(batch[schema[Columns.NEXT_OBS]][i][-1]) - if schema[Columns.OBS] in input_compress_columns - else batch[schema[Columns.NEXT_OBS]][i][-1] - ) + # Unpack observations, if needed. Note, observations could + # be either compressed by their entirety (the complete batch + # column) or individually (each column entry). + if isinstance(obs, str): + # Decompress the observations if we have a string, i.e. + # observations are compressed in their entirety. + obs = unpack_if_needed(obs) + # Convert to a list of arrays. This is needed as input by + # the `SingleAgentEpisode`. + obs = [obs[i, ...] for i in range(obs.shape[0])] + # Otherwise observations are only compressed inside of the + # batch column (if at all). + elif isinstance(obs, np.ndarray): + # Unpack observations, if they are compressed otherwise we + # simply convert to a list, which is needed by the + # `SingleAgentEpisode`. + obs = ( + unpack_if_needed(obs.tolist()) + if schema[Columns.OBS] in input_compress_columns + else obs.tolist() + ) + else: + raise TypeError( + f"Unknown observation type: {type(obs)}. When mapping " + "from old recorded `SampleBatches` batched " + "observations should be either of type `np.array` " + "or - if the column is compressed - of `str` type." + ) + + if schema[Columns.NEXT_OBS] in batch: + # Append the last `new_obs` to get the correct length of + # observations. + obs.append( + unpack_if_needed(batch[schema[Columns.NEXT_OBS]][i][-1]) + if schema[Columns.OBS] in input_compress_columns + else batch[schema[Columns.NEXT_OBS]][i][-1] + ) + else: + # Otherwise we duplicate the last observation. + obs.append(obs[-1]) + + # Check, if we have `done`, `truncated`, or `terminated`s in + # the batch. 
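
The str-vs-ndarray observation branching earlier in this hunk handles the two ways a recorded column can arrive. RLlib's compression helpers show the round trip; a small sketch (the array is made up):

    import numpy as np
    from ray.rllib.utils.compression import pack, unpack_if_needed

    obs = np.arange(6, dtype=np.float32).reshape(2, 3)
    blob = pack(obs)                   # LZ4-compressed, base64-encoded string
    restored = unpack_if_needed(blob)  # str/bytes input -> decompressed array
    untouched = unpack_if_needed(obs)  # non-compressed input passes through
    print(np.allclose(restored, obs), np.allclose(untouched, obs))  # -> True True
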
+ if ( + schema[Columns.TRUNCATEDS] in batch + and schema[Columns.TERMINATEDS] in batch + ): + truncated = batch[schema[Columns.TRUNCATEDS]][i][-1] + terminated = batch[schema[Columns.TERMINATEDS]][i][-1] + elif ( + schema[Columns.TRUNCATEDS] in batch + and schema[Columns.TERMINATEDS] not in batch + ): + truncated = batch[schema[Columns.TRUNCATEDS]][i][-1] + terminated = False + elif ( + schema[Columns.TRUNCATEDS] not in batch + and schema[Columns.TERMINATEDS] in batch + ): + terminated = batch[schema[Columns.TERMINATEDS]][i][-1] + truncated = False + elif "done" in batch: + terminated = batch["done"][i][-1] + truncated = False + # Otherwise, if no `terminated`, nor `truncated` nor `done` + # is given, we consider the episode as terminated. + else: + terminated = True + truncated = False + # Create a `SingleAgentEpisode`. episode = SingleAgentEpisode( - id_=str(batch[schema[Columns.EPS_ID]][i][0]), + # If the recorded episode has an ID we use this ID, + # otherwise we generate a new one. + id_=str(batch[schema[Columns.EPS_ID]][i][0]) + if schema[Columns.EPS_ID] in batch + else uuid.uuid4().hex, agent_id=agent_id, observations=obs, infos=( @@ -554,16 +598,8 @@ def _map_sample_batch_to_episode( else batch[schema[Columns.ACTIONS]][i] ), rewards=batch[schema[Columns.REWARDS]][i], - terminated=( - any(batch[schema[Columns.TERMINATEDS]][i]) - if schema[Columns.TERMINATEDS] in batch - else any(batch["dones"][i]) - ), - truncated=( - any(batch[schema[Columns.TRUNCATEDS]][i]) - if schema[Columns.TRUNCATEDS] in batch - else False - ), + terminated=terminated, + truncated=truncated, # TODO (simon): Results in zero-length episodes in connector. # t_started=batch[Columns.T if Columns.T in batch else # "unroll_id"][i][0], diff --git a/rllib/offline/tests/test_offline_data.py b/rllib/offline/tests/test_offline_data.py index fad307cc7745..038e9cef383f 100644 --- a/rllib/offline/tests/test_offline_data.py +++ b/rllib/offline/tests/test_offline_data.py @@ -124,7 +124,7 @@ def test_sample_multiple_learners(self): num_samples=10, return_iterator=2, num_shards=2 ) self.assertIsInstance(batch, list) - # Ensure we have indeed two such `SStreamSplitDataIterator` instances. + # Ensure we have indeed two such `StreamSplitDataIterator` instances. self.assertEqual(len(batch), 2) from ray.data._internal.iterator.stream_split_iterator import ( StreamSplitDataIterator, diff --git a/rllib/tuned_examples/appo/cartpole_appo.py b/rllib/tuned_examples/appo/cartpole_appo.py index 0af651b6c607..a85a9120ba2a 100644 --- a/rllib/tuned_examples/appo/cartpole_appo.py +++ b/rllib/tuned_examples/appo/cartpole_appo.py @@ -16,6 +16,7 @@ APPOConfig() .environment("CartPole-v1") .training( + circular_buffer_iterations_per_batch=2, vf_loss_coeff=0.05, entropy_coeff=0.0, ) diff --git a/rllib/tuned_examples/appo/pong_appo.py b/rllib/tuned_examples/appo/pong_appo.py index d79dbaa13fc7..ca36ca60fb7c 100644 --- a/rllib/tuned_examples/appo/pong_appo.py +++ b/rllib/tuned_examples/appo/pong_appo.py @@ -65,7 +65,7 @@ def _env_creator(cfg): entropy_coeff=[[0, 0.05], [3000000, 0.0]], # <- crucial parameter to finetune # Only update connector states and model weights every n training_step calls. 
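
The tuned examples in this hunk switch from the old learner-queue knobs to the new circular-buffer ones. A minimal config sketch on the new API stack (all values illustrative only):

    from ray.rllib.algorithms.appo import APPOConfig

    config = (
        APPOConfig()
        .environment("CartPole-v1")
        .training(
            circular_buffer_num_batches=4,           # N from the IMPACT paper
            circular_buffer_iterations_per_batch=2,  # K from the IMPACT paper
            target_worker_clipping=2.0,              # rho (the paper's default)
        )
    )
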
broadcast_interval=5, - learner_queue_size=1, + circular_buffer_num_batches=1, ) .rl_module( model_config=DefaultModelConfig( diff --git a/rllib/tuned_examples/bc/cartpole_recording.py b/rllib/tuned_examples/bc/cartpole_recording.py deleted file mode 100644 index a75cb31a9228..000000000000 --- a/rllib/tuned_examples/bc/cartpole_recording.py +++ /dev/null @@ -1,61 +0,0 @@ -from ray.rllib.algorithms.ppo import PPOConfig -from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig -from ray.rllib.utils.metrics import ( - ENV_RUNNER_RESULTS, - EPISODE_RETURN_MEAN, - EVALUATION_RESULTS, - NUM_ENV_STEPS_SAMPLED_LIFETIME, -) -from ray.rllib.utils.test_utils import add_rllib_example_script_args - -parser = add_rllib_example_script_args() -# Use `parser` to add your own custom command line options to this script -# and (if needed) use their values to set up `config` below. -args = parser.parse_args() - -config = ( - PPOConfig() - .env_runners( - rollout_fragment_length=1000, num_env_runners=0, batch_mode="truncate_episodes" - ) - .environment("CartPole-v1") - .rl_module( - model_config=DefaultModelConfig( - fcnet_hiddens=[32], - fcnet_activation="linear", - vf_share_layers=True, - ), - ) - .training( - lr=0.0003, - num_epochs=6, - vf_loss_coeff=0.01, - ) - .evaluation( - evaluation_num_env_runners=1, - evaluation_interval=1, - evaluation_parallel_to_training=True, - evaluation_config=PPOConfig.overrides(exploration=False), - ) - .offline_data( - output="local:///tmp/cartpole/", - output_write_episodes=False, - output_max_rows_per_file=1000, - # LZ4-compress columns 'obs', 'new_obs', and 'actions' to - # save disk space and increase performance. Note, this means - # that you have to use `input_compress_columns` in the same - # way when using the data for training in `RLlib`. - output_compress_columns=["obs", "new_obs", "actions"], - ) -) - -stop = { - f"{NUM_ENV_STEPS_SAMPLED_LIFETIME}": 200000, - f"{EVALUATION_RESULTS}/{ENV_RUNNER_RESULTS}/{EPISODE_RETURN_MEAN}": 350.0, -} - - -if __name__ == "__main__": - from ray.rllib.utils.test_utils import run_rllib_example_script_experiment - - run_rllib_example_script_experiment(config, args, stop=stop) diff --git a/rllib/tuned_examples/ppo/atari_ppo.py b/rllib/tuned_examples/ppo/atari_ppo.py index b4d881574f4e..c58c47898a1a 100644 --- a/rllib/tuned_examples/ppo/atari_ppo.py +++ b/rllib/tuned_examples/ppo/atari_ppo.py @@ -1,3 +1,5 @@ +# These tags allow extracting portions of this script on Anyscale. +# ws-template-imports-start import gymnasium as gym from ray import tune @@ -8,6 +10,7 @@ from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack from ray.rllib.utils.test_utils import add_rllib_example_script_args +# ws-template-imports-end parser = add_rllib_example_script_args( default_reward=float("inf"), @@ -22,7 +25,12 @@ # and (if needed) use their values to set up `config` below. args = parser.parse_args() +NUM_LEARNERS = args.num_learners or 1 +ENV = args.env + +# These tags allow extracting portions of this script on Anyscale. +# ws-template-code-start def _make_env_to_module_connector(env): return FrameStackingEnvToModule(num_frames=4) @@ -35,7 +43,7 @@ def _make_learner_connector(input_observation_space, input_action_space): # We would like our frame stacking connector to do this job. def _env_creator(cfg): return wrap_atari_for_new_api_stack( - gym.make(args.env, **cfg, render_mode="rgb_array"), + gym.make(ENV, **cfg, render_mode="rgb_array"), # Perform frame-stacking through ConnectorV2 API. 
framestack=None, ) @@ -43,7 +51,6 @@ def _env_creator(cfg): tune.register_env("env", _env_creator) - config = ( PPOConfig() .environment( @@ -57,20 +64,19 @@ def _env_creator(cfg): clip_rewards=True, ) .env_runners( - # num_envs_per_env_runner=5, # 5 on old yaml example env_to_module_connector=_make_env_to_module_connector, ) .training( learner_connector=_make_learner_connector, - train_batch_size_per_learner=4000, # 5000 on old yaml example - minibatch_size=128, # 500 on old yaml example + train_batch_size_per_learner=4000, + minibatch_size=128, lambda_=0.95, kl_coeff=0.5, clip_param=0.1, vf_clip_param=10.0, entropy_coeff=0.01, num_epochs=10, - lr=0.00015 * (args.num_learners or 1), + lr=0.00015 * NUM_LEARNERS, grad_clip=100.0, grad_clip_by="global_norm", ) @@ -83,7 +89,7 @@ def _env_creator(cfg): ), ) ) - +# ws-template-code-end if __name__ == "__main__": from ray.rllib.utils.test_utils import run_rllib_example_script_experiment diff --git a/rllib/utils/images.py b/rllib/utils/images.py index 0716ea5c45b4..7b0f1601d574 100644 --- a/rllib/utils/images.py +++ b/rllib/utils/images.py @@ -1,4 +1,5 @@ import logging +import importlib import numpy as np @@ -6,13 +7,29 @@ logger = logging.getLogger(__name__) + +@DeveloperAPI +def is_package_installed(package_name): + try: + importlib.metadata.version(package_name) + return True + except importlib.metadata.PackageNotFoundError: + return False + + try: import cv2 cv2.ocl.setUseOpenCL(False) logger.debug("CV2 found for image processing.") -except ImportError: +except ImportError as e: + if is_package_installed("opencv-python"): + raise ImportError( + f"OpenCV is installed, but we failed to import it. This may be because " + f"you need to install `opencv-python-headless` instead of " + f"`opencv-python`. Error message: {e}", + ) cv2 = None diff --git a/rllib/utils/metrics/__init__.py b/rllib/utils/metrics/__init__.py index d9a67933998c..5e7e3ad071bd 100644 --- a/rllib/utils/metrics/__init__.py +++ b/rllib/utils/metrics/__init__.py @@ -11,6 +11,9 @@ NUM_TRAINABLE_PARAMETERS = "num_trainable_parameters" NUM_NON_TRAINABLE_PARAMETERS = "num_non_trainable_parameters" +# Number of times `training_step()` was called in one iteration. +NUM_TRAINING_STEP_CALLS_PER_ITERATION = "num_training_step_calls_per_iteration" + # Counters for sampling, sampling (on eval workers) and # training steps (env- and agent steps). MEAN_NUM_EPISODE_LISTS_RECEIVED = "mean_num_episode_lists_received" diff --git a/rllib/utils/metrics/metrics_logger.py b/rllib/utils/metrics/metrics_logger.py index f1f6f4cc12e8..276d6891b14a 100644 --- a/rllib/utils/metrics/metrics_logger.py +++ b/rllib/utils/metrics/metrics_logger.py @@ -937,10 +937,10 @@ def deactivate_tensor_mode(self): assert self.tensor_mode self._tensor_mode = False # Return all logged tensors (logged during the tensor-mode phase). - ret = {key: self._get_key(key).peek() for key in self._tensor_keys} + logged_tensors = {key: self._get_key(key).peek() for key in self._tensor_keys} # Clear out logged tensor keys. self._tensor_keys.clear() - return ret + return logged_tensors def tensors_to_numpy(self, tensor_metrics): """Converts all previously logged and returned tensors back to numpy values.""" diff --git a/rllib/utils/metrics/stats.py b/rllib/utils/metrics/stats.py index 1929cec2b063..b13b7edb3b48 100644 --- a/rllib/utils/metrics/stats.py +++ b/rllib/utils/metrics/stats.py @@ -216,7 +216,8 @@ def __init__( # Code to execute when exiting a with-context. 
self._on_exit = on_exit - # On each `.reduce()` call, we store the result of this call in + # On each `.reduce()` call, we store the result of this call in hist[0] and the + # previous `reduce()` result in hist[1]. self._hist = (0, 0) def push(self, value) -> None: @@ -274,6 +275,7 @@ def peek(self, *, previous: bool = False) -> Any: The result of reducing the internal values list (or the previously computed reduced result, if `previous` is True). """ + # Return previously reduced value. if previous: return self._hist[1] return self._reduced_values()[0] @@ -355,10 +357,8 @@ def merge_in_parallel(self, *others: "Stats") -> None: # - Thereby always reducing across the different Stats objects' at the # current index. # - The resulting reduced value (across Stats at current index) is then - # repeated AND - # added to the new merged-values list n times (where n is the number of - # Stats, across - # which we merge). + # repeated AND added to the new merged-values list n times (where n is + # the number of Stats, across which we merge). # - The merged-values list is reversed. # Here: # index -1: [3, 6] -> [4.5, 4.5] @@ -381,13 +381,11 @@ def merge_in_parallel(self, *others: "Stats") -> None: stats.merge_in_parallel(stats1, stats2) # Same here: Fill new merged-values list: # - Start with index -1, moving to the start. - # - Thereby always reducing across the different Stats objects' at the + # - Thereby always reduce across the different Stats objects' at the # current index. # - The resulting reduced value (across Stats at current index) is then - # repeated AND - # added to the new merged-values list n times (where n is the number of - # Stats, across - # which we merge). + # repeated AND added to the new merged-values list n times (where n is the + # number of Stats, across which we merge). # - The merged-values list is reversed. # Here: # index -1: [3, 6] -> [6, 6] @@ -420,7 +418,7 @@ def merge_in_parallel(self, *others: "Stats") -> None: # Parallel-merge two (reduce=sum) stats with no window. # Note that when reduce="sum", we do NOT reduce across the indices of the - # parallel + # parallel values. stats = Stats(reduce="sum") stats1 = Stats(reduce="sum") stats1.push(1) @@ -435,7 +433,6 @@ def merge_in_parallel(self, *others: "Stats") -> None: # index -2: [0, 5] -> [3, 6, 0, 5] # index -3: [2, 4] -> [3, 6, 0, 5, 2, 4] # index -4: [1] -> [3, 6, 0, 5, 2, 4, 1] - # STOP after merged list contains >= 4 items (window size) # reverse: [1, 4, 2, 5, 0, 6, 3] stats.merge_in_parallel(stats1, stats2) check(stats.values, [1, 4, 2, 5, 0, 6, 3]) @@ -443,7 +440,7 @@ def merge_in_parallel(self, *others: "Stats") -> None: # Parallel-merge two "concat" (reduce=None) stats with no window. # Note that when reduce=None, we do NOT reduce across the indices of the - # parallel + # parallel values. stats = Stats(reduce=None, window=float("inf"), clear_on_reduce=True) stats1 = Stats(reduce=None, window=float("inf"), clear_on_reduce=True) stats1.push(1) @@ -586,7 +583,6 @@ def from_state(state: Dict[str, Any]) -> "Stats": def similar_to( other: "Stats", init_value: Optional[Any] = None, - prev_values: Optional[Tuple[Any, Any]] = None, ) -> "Stats": """Returns a new Stats object that's similar to `other`. 
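
A rough usage sketch of the two-slot reduce history documented above (the exact reduced values depend on the Stats settings; the comments only indicate which slot each call returns):

    from ray.rllib.utils.metrics.stats import Stats

    s = Stats(reduce="sum")
    s.push(1)
    s.push(2)
    s.reduce()                    # newest result -> hist[0]
    s.push(4)
    s.reduce()                    # prior result shifts to hist[1]
    print(s.peek(previous=True))  # returns hist[1]: the reduce() before last
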
diff --git a/src/mock/ray/core_worker/core_worker.h b/src/mock/ray/core_worker/core_worker.h index 60817fb7af1c..ff1e1d7ab130 100644 --- a/src/mock/ray/core_worker/core_worker.h +++ b/src/mock/ray/core_worker/core_worker.h @@ -14,28 +14,11 @@ #pragma once #include "gmock/gmock.h" #include "mock/ray/gcs/gcs_client/gcs_client.h" -namespace ray { -namespace core { -class MockCoreWorkerOptions : public CoreWorkerOptions { - public: -}; - -} // namespace core -} // namespace ray - -namespace ray { -namespace core { - -class MockCoreWorkerProcess : public CoreWorkerProcess { - public: -}; - -} // namespace core -} // namespace ray +namespace ray::core { -namespace ray { -namespace core { +class MockCoreWorkerOptions : public CoreWorkerOptions {}; +class MockCoreWorkerProcess : public CoreWorkerProcess {}; class MockCoreWorker : public CoreWorker { public: @@ -179,5 +162,4 @@ class MockCoreWorker : public CoreWorker { (override)); }; -} // namespace core -} // namespace ray +} // namespace ray::core diff --git a/src/mock/ray/core_worker/reference_count.h b/src/mock/ray/core_worker/reference_count.h index c0679dec135f..c9f7a1d0b415 100644 --- a/src/mock/ray/core_worker/reference_count.h +++ b/src/mock/ray/core_worker/reference_count.h @@ -41,7 +41,7 @@ class MockReferenceCounter : public ReferenceCounterInterface { bool add_local_ref, const absl::optional &pinned_at_raylet_id)); - MOCK_METHOD2(AddObjectPrimaryCopyDeleteCallback, + MOCK_METHOD2(AddObjectOutOfScopeOrFreedCallback, bool(const ObjectID &object_id, const std::function callback)); diff --git a/src/mock/ray/gcs/gcs_server/gcs_node_manager.h b/src/mock/ray/gcs/gcs_server/gcs_node_manager.h index 3a8f22949fae..7a3efe197529 100644 --- a/src/mock/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/mock/ray/gcs/gcs_server/gcs_node_manager.h @@ -18,7 +18,11 @@ namespace gcs { class MockGcsNodeManager : public GcsNodeManager { public: - MockGcsNodeManager() : GcsNodeManager(nullptr, nullptr, nullptr, ClusterID::Nil()) {} + MockGcsNodeManager() + : GcsNodeManager(/*gcs_publisher=*/nullptr, + /*gcs_table_storage=*/nullptr, + /*raylet_client_pool=*/nullptr, + /*cluster_id=*/ClusterID::Nil()) {} MOCK_METHOD(void, HandleRegisterNode, (rpc::RegisterNodeRequest request, diff --git a/src/ray/common/BUILD b/src/ray/common/BUILD index 416dd9659983..563c53007ecc 100644 --- a/src/ray/common/BUILD +++ b/src/ray/common/BUILD @@ -181,6 +181,7 @@ ray_cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", ], ) diff --git a/src/ray/common/runtime_env_manager.cc b/src/ray/common/runtime_env_manager.cc index d765dbc4f1de..08187cf29932 100644 --- a/src/ray/common/runtime_env_manager.cc +++ b/src/ray/common/runtime_env_manager.cc @@ -31,12 +31,13 @@ void RuntimeEnvManager::AddURIReference(const std::string &hex_id, if (!uris.working_dir_uri().empty()) { const auto &uri = uris.working_dir_uri(); uri_reference_[uri]++; - id_to_uris_[hex_id].push_back(uri); + id_to_uris_[hex_id].emplace_back(uri); RAY_LOG(DEBUG) << "[working_dir] Added URI Reference " << uri << " for id " << hex_id; } + for (const auto &uri : uris.py_modules_uris()) { uri_reference_[uri]++; - id_to_uris_[hex_id].push_back(uri); + id_to_uris_[hex_id].emplace_back(uri); RAY_LOG(DEBUG) << "[py_modules] Added URI Reference " << uri << " for id " << hex_id; } PrintDebugString(); @@ -51,21 +52,24 @@ const std::vector 
&RuntimeEnvManager::GetReferences( void RuntimeEnvManager::RemoveURIReference(const std::string &hex_id) { RAY_LOG(DEBUG) << "Subtracting 1 from URI Reference for id " << hex_id; - if (!id_to_uris_.count(hex_id)) { + auto iter = id_to_uris_.find(hex_id); + if (iter == id_to_uris_.end()) { return; } - for (const auto &uri : id_to_uris_[hex_id]) { - --uri_reference_[uri]; - auto ref_count = uri_reference_[uri]; - RAY_CHECK(ref_count >= 0); - if (ref_count == 0) { - uri_reference_.erase(uri); + for (const auto &uri : iter->second) { + auto uri_ref_iter = uri_reference_.find(uri); + RAY_CHECK(uri_ref_iter != uri_reference_.end()); + --uri_ref_iter->second; + const auto new_ref_count = uri_ref_iter->second; + RAY_CHECK_GE(new_ref_count, 0); + if (new_ref_count == 0) { + uri_reference_.erase(uri_ref_iter); RAY_LOG(DEBUG) << "Deleting URI Reference " << uri; deleter_(uri, [](bool success) {}); } } - id_to_uris_.erase(hex_id); + id_to_uris_.erase(iter); PrintDebugString(); } diff --git a/src/ray/common/runtime_env_manager.h b/src/ray/common/runtime_env_manager.h index a6b282863307..ab58409d8d91 100644 --- a/src/ray/common/runtime_env_manager.h +++ b/src/ray/common/runtime_env_manager.h @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once + #include +#include #include "absl/container/flat_hash_map.h" #include "ray/common/id.h" @@ -32,9 +34,9 @@ class RuntimeEnvManager { public: using DeleteFunc = std::function)>; - explicit RuntimeEnvManager(DeleteFunc deleter) : deleter_(deleter) {} + explicit RuntimeEnvManager(DeleteFunc deleter) : deleter_(std::move(deleter)) {} - /// Increase the reference of URI by job or actor ID and runtime_env. + /// Increase the reference count of URI by job or actor ID and runtime_env. /// /// \param[in] hex_id The id of the runtime env. It can be an actor or job id. /// \param[in] runtime_env_info The runtime env used by the id. @@ -53,7 +55,7 @@ class RuntimeEnvManager { /// \return The URIs referenced by the id. const std::vector &GetReferences(const std::string &hex_id) const; - /// Decrease the reference of URI by job_id + /// Decrease the reference count of URI by job_id /// \param[in] hex_id The id of the runtime env. void RemoveURIReference(const std::string &hex_id); diff --git a/src/ray/common/task/task.cc b/src/ray/common/task/task.cc index e2ac8571c4e5..812c0598cc35 100644 --- a/src/ray/common/task/task.cc +++ b/src/ray/common/task/task.cc @@ -14,7 +14,7 @@ #include "ray/common/task/task.h" -#include +#include "absl/strings/str_format.h" namespace ray { @@ -42,9 +42,7 @@ const std::string &RayTask::GetPreferredNodeID() const { return preferred_node_i void RayTask::ComputeDependencies() { dependencies_ = task_spec_.GetDependencies(); } std::string RayTask::DebugString() const { - std::ostringstream stream; - stream << "task_spec={" << task_spec_.DebugString() << "}"; - return stream.str(); + return absl::StrFormat("task_spec={%s}", task_spec_.DebugString()); } } // namespace ray diff --git a/src/ray/common/task/task.h b/src/ray/common/task/task.h index 5a4a9e323de5..52165665da2c 100644 --- a/src/ray/common/task/task.h +++ b/src/ray/common/task/task.h @@ -21,8 +21,6 @@ namespace ray { -typedef std::function CancelTaskCallback; - /// \class RayTask /// /// A RayTask represents a Ray task and a specification of its execution (e.g., @@ -33,7 +31,7 @@ class RayTask { public: /// Construct an empty task. 
This should only be used to pass a task /// as an out parameter to a function or method. - RayTask() {} + RayTask() = default; /// Construct a `RayTask` object from a protobuf message. /// @@ -41,7 +39,7 @@ class RayTask { explicit RayTask(const rpc::Task &message); /// Construct a `RayTask` object from a `TaskSpecification`. - RayTask(TaskSpecification task_spec); + explicit RayTask(TaskSpecification task_spec); RayTask(TaskSpecification task_spec, std::string preferred_node_id); diff --git a/src/ray/common/task/task_spec.cc b/src/ray/common/task/task_spec.cc index a1b3c04f80b3..6ffc13f5a8e6 100644 --- a/src/ray/common/task/task_spec.cc +++ b/src/ray/common/task/task_spec.cc @@ -181,15 +181,15 @@ ray::FunctionDescriptor TaskSpecification::FunctionDescriptor() const { return ray::FunctionDescriptorBuilder::FromProto(message_->function_descriptor()); } -rpc::RuntimeEnvInfo TaskSpecification::RuntimeEnvInfo() const { +const rpc::RuntimeEnvInfo &TaskSpecification::RuntimeEnvInfo() const { return message_->runtime_env_info(); } -std::string TaskSpecification::SerializedRuntimeEnv() const { +const std::string &TaskSpecification::SerializedRuntimeEnv() const { return message_->runtime_env_info().serialized_runtime_env(); } -rpc::RuntimeEnvConfig TaskSpecification::RuntimeEnvConfig() const { +const rpc::RuntimeEnvConfig &TaskSpecification::RuntimeEnvConfig() const { return message_->runtime_env_info().runtime_env_config(); } @@ -208,7 +208,7 @@ int TaskSpecification::GetRuntimeEnvHash() const { return runtime_env_hash_; } const SchedulingClass TaskSpecification::GetSchedulingClass() const { if (!IsActorTask()) { // Actor task doesn't have scheduling id, so we don't need to check this. - RAY_CHECK(sched_cls_id_ > 0); + RAY_CHECK_GT(sched_cls_id_, 0); } return sched_cls_id_; } diff --git a/src/ray/common/task/task_spec.h b/src/ray/common/task/task_spec.h index 019835062d31..ac4a38c92cee 100644 --- a/src/ray/common/task/task_spec.h +++ b/src/ray/common/task/task_spec.h @@ -306,11 +306,11 @@ class TaskSpecification : public MessageWrapper { ray::FunctionDescriptor FunctionDescriptor() const; - [[nodiscard]] rpc::RuntimeEnvInfo RuntimeEnvInfo() const; + [[nodiscard]] const rpc::RuntimeEnvInfo &RuntimeEnvInfo() const; - std::string SerializedRuntimeEnv() const; + const std::string &SerializedRuntimeEnv() const; - rpc::RuntimeEnvConfig RuntimeEnvConfig() const; + const rpc::RuntimeEnvConfig &RuntimeEnvConfig() const; bool HasRuntimeEnv() const; diff --git a/src/ray/common/task/task_util.h b/src/ray/common/task/task_util.h index 488c52069aa4..4ecfab358c7b 100644 --- a/src/ray/common/task/task_util.h +++ b/src/ray/common/task/task_util.h @@ -135,7 +135,8 @@ class TaskSpecBuilder { const TaskID &submitter_task_id, const std::shared_ptr runtime_env_info = nullptr, const std::string &concurrency_group_name = "", - bool enable_task_events = true) { + bool enable_task_events = true, + const std::unordered_map &labels = {}) { message_->set_type(TaskType::NORMAL_TASK); message_->set_name(name); message_->set_language(language); @@ -165,6 +166,7 @@ class TaskSpecBuilder { } message_->set_concurrency_group_name(concurrency_group_name); message_->set_enable_task_events(enable_task_events); + message_->mutable_labels()->insert(labels.begin(), labels.end()); return *this; } diff --git a/src/ray/core_worker/actor_handle.cc b/src/ray/core_worker/actor_handle.cc index 4228cd392351..39257bbb7fcc 100644 --- a/src/ray/core_worker/actor_handle.cc +++ b/src/ray/core_worker/actor_handle.cc @@ -33,7 +33,8 @@ rpc::ActorHandle
CreateInnerActorHandle( const std::string &ray_namespace, int32_t max_pending_calls, bool execute_out_of_order, - absl::optional enable_task_events) { + absl::optional enable_task_events, + const std::unordered_map &labels) { rpc::ActorHandle inner; inner.set_actor_id(actor_id.Data(), actor_id.Size()); inner.set_owner_id(owner_id.Binary()); @@ -50,6 +51,7 @@ rpc::ActorHandle CreateInnerActorHandle( inner.set_execute_out_of_order(execute_out_of_order); inner.set_max_pending_calls(max_pending_calls); inner.set_enable_task_events(enable_task_events.value_or(kDefaultTaskEventEnabled)); + inner.mutable_labels()->insert(labels.begin(), labels.end()); return inner; } @@ -82,6 +84,7 @@ rpc::ActorHandle CreateInnerActorHandleFromActorData( inner.set_execute_out_of_order( task_spec.actor_creation_task_spec().execute_out_of_order()); inner.set_max_pending_calls(task_spec.actor_creation_task_spec().max_pending_calls()); + inner.mutable_labels()->insert(task_spec.labels().begin(), task_spec.labels().end()); return inner; } } // namespace @@ -100,7 +103,8 @@ ActorHandle::ActorHandle( const std::string &ray_namespace, int32_t max_pending_calls, bool execute_out_of_order, - absl::optional enable_task_events) + absl::optional enable_task_events, + const std::unordered_map &labels) : ActorHandle(CreateInnerActorHandle(actor_id, owner_id, owner_address, @@ -114,7 +118,8 @@ ActorHandle::ActorHandle( ray_namespace, max_pending_calls, execute_out_of_order, - enable_task_events)) {} + enable_task_events, + labels)) {} ActorHandle::ActorHandle(const std::string &serialized) : ActorHandle(CreateInnerActorHandleFromString(serialized)) {} diff --git a/src/ray/core_worker/actor_handle.h b/src/ray/core_worker/actor_handle.h index 22f00c066dff..98306cb6d6b6 100644 --- a/src/ray/core_worker/actor_handle.h +++ b/src/ray/core_worker/actor_handle.h @@ -45,7 +45,8 @@ class ActorHandle { const std::string &ray_namespace, int32_t max_pending_calls, bool execute_out_of_order = false, - absl::optional enable_task_events = absl::nullopt); + absl::optional enable_task_events = absl::nullopt, + const std::unordered_map &labels = {}); /// Constructs an ActorHandle from a serialized string. explicit ActorHandle(const std::string &serialized); @@ -105,6 +106,10 @@ class ActorHandle { bool ExecuteOutOfOrder() const { return inner_.execute_out_of_order(); } + const ::google::protobuf::Map &GetLabels() const { + return inner_.labels(); + } + private: // Protobuf-defined persistent state of the actor handle. 
const rpc::ActorHandle inner_; diff --git a/src/ray/core_worker/actor_manager.cc b/src/ray/core_worker/actor_manager.cc index 02a89a7c65c9..a31c402fae11 100644 --- a/src/ray/core_worker/actor_manager.cc +++ b/src/ray/core_worker/actor_manager.cc @@ -172,7 +172,7 @@ bool ActorManager::AddActorHandle(std::unique_ptr actor_handle, } if (inserted && owned) { - RAY_CHECK(reference_counter_->AddObjectPrimaryCopyDeleteCallback( + RAY_CHECK(reference_counter_->AddObjectOutOfScopeOrFreedCallback( actor_creation_return_id, [this, actor_id](const ObjectID &object_id) { MarkActorKilledOrOutOfScope(GetActorHandle(actor_id)); })); diff --git a/src/ray/core_worker/actor_manager.h b/src/ray/core_worker/actor_manager.h index dcfe8e11a68d..a42cdcc13d6b 100644 --- a/src/ray/core_worker/actor_manager.h +++ b/src/ray/core_worker/actor_manager.h @@ -20,6 +20,7 @@ #include "ray/core_worker/actor_creator.h" #include "ray/core_worker/actor_handle.h" #include "ray/core_worker/reference_count.h" +#include "ray/core_worker/transport/actor_task_submitter.h" #include "ray/core_worker/transport/task_receiver.h" #include "ray/gcs/gcs_client/gcs_client.h" namespace ray { diff --git a/src/ray/core_worker/common.h b/src/ray/core_worker/common.h index 3a160cd302d8..bc4c18c22bb1 100644 --- a/src/ray/core_worker/common.h +++ b/src/ray/core_worker/common.h @@ -67,14 +67,16 @@ struct TaskOptions { const std::string &concurrency_group_name = "", int64_t generator_backpressure_num_objects = -1, const std::string &serialized_runtime_env_info = "{}", - bool enable_task_events = kDefaultTaskEventEnabled) + bool enable_task_events = kDefaultTaskEventEnabled, + const std::unordered_map &labels = {}) : name(name), num_returns(num_returns), resources(resources), concurrency_group_name(concurrency_group_name), serialized_runtime_env_info(serialized_runtime_env_info), generator_backpressure_num_objects(generator_backpressure_num_objects), - enable_task_events(enable_task_events) {} + enable_task_events(enable_task_events), + labels(labels) {} /// The name of this task. std::string name; @@ -95,6 +97,7 @@ struct TaskOptions { /// True if task events (worker::TaskEvent) from this task should be reported, default /// to true. bool enable_task_events = kDefaultTaskEventEnabled; + std::unordered_map labels; }; /// Options for actor creation tasks. @@ -115,7 +118,8 @@ struct ActorCreationOptions { const std::vector &concurrency_groups = {}, bool execute_out_of_order = false, int32_t max_pending_calls = -1, - bool enable_task_events = kDefaultTaskEventEnabled) + bool enable_task_events = kDefaultTaskEventEnabled, + const std::unordered_map &labels = {}) : max_restarts(max_restarts), max_task_retries(max_task_retries), max_concurrency(max_concurrency), @@ -132,7 +136,8 @@ struct ActorCreationOptions { execute_out_of_order(execute_out_of_order), max_pending_calls(max_pending_calls), scheduling_strategy(scheduling_strategy), - enable_task_events(enable_task_events) { + enable_task_events(enable_task_events), + labels(labels) { // Check that resources is a subset of placement resources. for (auto &resource : resources) { auto it = this->placement_resources.find(resource.first); @@ -187,6 +192,7 @@ struct ActorCreationOptions { /// True if task events (worker::TaskEvent) from this creation task should be reported /// default to true. 
const bool enable_task_events = kDefaultTaskEventEnabled; + const std::unordered_map labels; }; using PlacementStrategy = rpc::PlacementStrategy; @@ -285,11 +291,11 @@ template <> struct hash { size_t operator()(const ray::rpc::LineageReconstructionTask &task) const { size_t hash = std::hash()(task.name()); - for (const auto &resource : task.resources()) { - hash ^= std::hash()(resource.first); - hash ^= std::hash()(resource.second); - } hash ^= std::hash()(task.status()); + for (const auto &label : task.labels()) { + hash ^= std::hash()(label.first); + hash ^= std::hash()(label.second); + } return hash; } }; diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 9a15e8702892..e04ade96a0fe 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -126,8 +126,129 @@ std::optional TryGetLocalObjectLocation( } // namespace -CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_id) - : options_(options), +TaskCounter::TaskCounter() { + counter_.SetOnChangeCallback( + [this](const std::tuple + &key) ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) mutable { + if (std::get<1>(key) != TaskStatusType::kRunning) { + return; + } + const auto &func_name = std::get<0>(key); + const auto is_retry = std::get<2>(key); + const int64_t running_total = counter_.Get(key); + const int64_t num_in_get = running_in_get_counter_.Get({func_name, is_retry}); + const int64_t num_in_wait = running_in_wait_counter_.Get({func_name, is_retry}); + const auto is_retry_label = is_retry ? "1" : "0"; + // RUNNING_IN_RAY_GET/WAIT are sub-states of RUNNING, so we need to subtract + // them out to avoid double-counting. + ray::stats::STATS_tasks.Record( + running_total - num_in_get - num_in_wait, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + // Negate the metrics recorded from the submitter process for these tasks. + ray::stats::STATS_tasks.Record( + -running_total, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::SUBMITTED_TO_WORKER)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + // Record sub-state for get. + ray::stats::STATS_tasks.Record( + num_in_get, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_GET)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + // Record sub-state for wait. 
+ ray::stats::STATS_tasks.Record( + num_in_wait, + {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_WAIT)}, + {"Name", func_name}, + {"IsRetry", is_retry_label}, + {"JobId", job_id_}, + {"Source", "executor"}}); + }); +} + +void TaskCounter::RecordMetrics() { + absl::MutexLock l(&mu_); + counter_.FlushOnChangeCallbacks(); + if (IsActor()) { + float running = 0.0; + float in_get = 0.0; + float in_wait = 0.0; + float idle = 0.0; + if (running_in_wait_counter_.Total() > 0) { + in_wait = 1.0; + } else if (running_in_get_counter_.Total() > 0) { + in_get = 1.0; + } else if (num_tasks_running_ > 0) { + running = 1.0; + } else { + idle = 1.0; + } + ray::stats::STATS_actors.Record(idle, + {{"State", "IDLE"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + ray::stats::STATS_actors.Record(running, + {{"State", "RUNNING_TASK"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + ray::stats::STATS_actors.Record(in_get, + {{"State", "RUNNING_IN_RAY_GET"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + ray::stats::STATS_actors.Record(in_wait, + {{"State", "RUNNING_IN_RAY_WAIT"}, + {"Name", actor_name_}, + {"Source", "executor"}, + {"JobId", job_id_}}); + } +} + +void TaskCounter::SetMetricStatus(const std::string &func_name, + rpc::TaskStatus status, + bool is_retry) { + absl::MutexLock l(&mu_); + // Add a no-op increment to counter_ so that + // it will invoke a callback upon RecordMetrics. + counter_.Increment({func_name, TaskStatusType::kRunning, is_retry}, 0); + if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { + running_in_get_counter_.Increment({func_name, is_retry}); + } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { + running_in_wait_counter_.Increment({func_name, is_retry}); + } else { + RAY_CHECK(false) << "Unexpected status " << rpc::TaskStatus_Name(status); + } +} + +void TaskCounter::UnsetMetricStatus(const std::string &func_name, + rpc::TaskStatus status, + bool is_retry) { + absl::MutexLock l(&mu_); + // Add a no-op decrement to counter_ so that + // it will invoke a callback upon RecordMetrics. + counter_.Decrement({func_name, TaskStatusType::kRunning, is_retry}, 0); + if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { + running_in_get_counter_.Decrement({func_name, is_retry}); + } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { + running_in_wait_counter_.Decrement({func_name, is_retry}); + } else { + RAY_LOG(FATAL) << "Unexpected status " << rpc::TaskStatus_Name(status); + } +} + +CoreWorker::CoreWorker(CoreWorkerOptions options, const WorkerID &worker_id) + : options_(std::move(options)), get_call_site_(RayConfig::instance().record_ref_creation_sites() ? options_.get_lang_stack : nullptr), @@ -339,7 +460,7 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ << "max_pending_lease_requests_per_scheduling_category can't be 0"; lease_request_rate_limiter_ = std::make_shared( - /*kMinConcurrentLeaseCap*/ 10); + /*min_concurrent_lease_cap_*/ 10); } // Register a callback to monitor add/removed nodes. 
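Note on the TaskCounter callback above: RUNNING_IN_RAY_GET and RUNNING_IN_RAY_WAIT are sub-states of RUNNING, so the RUNNING gauge is reported net of both. A minimal self-contained sketch of that accounting rule, using illustrative names rather than Ray's ray::stats API:

#include <cassert>
#include <cstdint>

// Tasks blocked in ray.get()/ray.wait() still count toward the raw running
// total, but are reported under their own sub-states, so the RUNNING gauge
// subtracts them to avoid double-counting (hypothetical helper, not Ray's).
int64_t ReportedRunning(int64_t running_total, int64_t num_in_get, int64_t num_in_wait) {
  return running_total - num_in_get - num_in_wait;
}

int main() {
  // 5 tasks on the executor, 2 blocked in ray.get(), 1 in ray.wait():
  // the reported gauges 2 (RUNNING) + 2 (IN_GET) + 1 (IN_WAIT) partition the 5.
  assert(ReportedRunning(5, 2, 1) == 2);
  return 0;
}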
@@ -516,14 +637,13 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ actor_creator_ = std::make_shared(gcs_client_); - actor_task_submitter_ = std::shared_ptr( - new ActorTaskSubmitter(*core_worker_client_pool_, - *memory_store_, - *task_manager_, - *actor_creator_, - on_excess_queueing, - io_service_, - reference_counter_)); + actor_task_submitter_ = std::make_shared(*core_worker_client_pool_, + *memory_store_, + *task_manager_, + *actor_creator_, + on_excess_queueing, + io_service_, + reference_counter_); auto node_addr_factory = [this](const NodeID &node_id) { absl::optional addr; @@ -564,15 +684,16 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ uint64_t object_size) { reference_counter_->ReportLocalityData(object_id, locations, object_size); }; - future_resolver_.reset(new FutureResolver(memory_store_, - reference_counter_, - std::move(report_locality_data_callback), - core_worker_client_pool_, - rpc_address_)); + future_resolver_ = + std::make_unique(memory_store_, + reference_counter_, + std::move(report_locality_data_callback), + core_worker_client_pool_, + rpc_address_); // Unfortunately the raylet client has to be constructed after the receivers. if (task_receiver_ != nullptr) { - task_argument_waiter_.reset(new DependencyWaiterImpl(*local_raylet_client_)); + task_argument_waiter_ = std::make_unique(*local_raylet_client_); task_receiver_->Init(core_worker_client_pool_, rpc_address_, task_argument_waiter_); } @@ -592,13 +713,13 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ for (const auto &node_id : object_locations.value()) { absl::optional addr = node_addr_factory(node_id); if (addr.has_value()) { - locations.push_back(addr.value()); - } else { - // We're getting potentially stale locations directly from the reference - // counter, so the location might be a dead node. - RAY_LOG(DEBUG).WithField(object_id).WithField(node_id) - << "Object location is dead, not using it in the recovery of object"; + locations.emplace_back(std::move(addr.value())); + continue; } + // We're getting potentially stale locations directly from the reference + // counter, so the location might be a dead node. 
+ RAY_LOG(DEBUG).WithField(object_id).WithField(node_id) + << "Object location is dead, not using it in the recovery of object"; } } callback(object_id, locations); @@ -1014,7 +1135,7 @@ void CoreWorker::RegisterToGcs(int64_t worker_launch_time_ms, std::chrono::system_clock::now().time_since_epoch()) .count(); worker_info.emplace("driver_id", worker_id.Binary()); - worker_info.emplace("start_time", std::to_string(start_time)); + worker_info.emplace("start_time", absl::StrFormat("%d", start_time)); if (!options_.driver_name.empty()) { worker_info.emplace("name", options_.driver_name); } @@ -1033,7 +1154,8 @@ void CoreWorker::RegisterToGcs(int64_t worker_launch_time_ms, worker_data->mutable_worker_address()->set_port(rpc_address_.port()); worker_data->mutable_worker_address()->set_worker_id(worker_id.Binary()); worker_data->set_worker_type(options_.worker_type); - worker_data->mutable_worker_info()->insert(worker_info.begin(), worker_info.end()); + worker_data->mutable_worker_info()->insert(std::make_move_iterator(worker_info.begin()), + std::make_move_iterator(worker_info.end())); worker_data->set_is_alive(true); worker_data->set_pid(pid_); @@ -1067,7 +1189,7 @@ void CoreWorker::InternalHeartbeat() { absl::MutexLock lock(&mutex_); while (!to_resubmit_.empty() && current_time_ms() > to_resubmit_.top().execution_time_ms) { - tasks_to_resubmit.push_back(std::move(to_resubmit_.top())); + tasks_to_resubmit.emplace_back(to_resubmit_.top()); to_resubmit_.pop(); } } @@ -1168,15 +1290,16 @@ Status CoreWorker::GetOwnerAddress(const ObjectID &object_id, std::vector CoreWorker::GetObjectRefs( const std::vector &object_ids) const { std::vector refs; + refs.reserve(object_ids.size()); for (const auto &object_id : object_ids) { rpc::ObjectReference ref; ref.set_object_id(object_id.Binary()); rpc::Address owner_address; if (reference_counter_->GetOwner(object_id, &owner_address)) { // NOTE(swang): Detached actors do not have an owner address set. - ref.mutable_owner_address()->CopyFrom(owner_address); + *ref.mutable_owner_address() = std::move(owner_address); } - refs.push_back(std::move(ref)); + refs.emplace_back(std::move(ref)); } return refs; } @@ -2115,7 +2238,8 @@ void CoreWorker::BuildCommonTaskSpec( const std::string &concurrency_group_name, bool include_job_config, int64_t generator_backpressure_num_objects, - bool enable_task_events) { + bool enable_task_events, + const std::unordered_map &labels) { // Build common task spec. auto override_runtime_env_info = OverrideTaskOrActorRuntimeEnvInfo(serialized_runtime_env_info); @@ -2161,7 +2285,8 @@ void CoreWorker::BuildCommonTaskSpec( main_thread_current_task_id, override_runtime_env_info, concurrency_group_name, - enable_task_events); + enable_task_events, + labels); // Set task arguments. 
for (const auto &arg : args) { builder.AddArg(*arg); @@ -2217,7 +2342,8 @@ std::vector CoreWorker::SubmitTask( /*include_job_config*/ true, /*generator_backpressure_num_objects*/ task_options.generator_backpressure_num_objects, - /*enable_task_event*/ task_options.enable_task_events); + /*enable_task_event*/ task_options.enable_task_events, + task_options.labels); ActorID root_detached_actor_id; if (!worker_context_.GetRootDetachedActorID().IsNil()) { root_detached_actor_id = worker_context_.GetRootDetachedActorID(); @@ -2308,7 +2434,8 @@ Status CoreWorker::CreateActor(const RayFunction &function, /*concurrency_group_name*/ "", /*include_job_config*/ true, /*generator_backpressure_num_objects*/ -1, - /*enable_task_events*/ actor_creation_options.enable_task_events); + /*enable_task_events*/ actor_creation_options.enable_task_events, + actor_creation_options.labels); // If the namespace is not specified, get it from the job. const auto ray_namespace = (actor_creation_options.ray_namespace.empty() @@ -2328,7 +2455,8 @@ Status CoreWorker::CreateActor(const RayFunction &function, ray_namespace, actor_creation_options.max_pending_calls, actor_creation_options.execute_out_of_order, - actor_creation_options.enable_task_events); + actor_creation_options.enable_task_events, + actor_creation_options.labels); std::string serialized_actor_handle; actor_handle->Serialize(&serialized_actor_handle); ActorID root_detached_actor_id; @@ -2456,9 +2584,8 @@ Status CoreWorker::CreatePlacementGroup( << ". It is probably " "because GCS server is dead or there's a high load there."; return Status::TimedOut(stream.str()); - } else { - return status; } + return status; } Status CoreWorker::RemovePlacementGroup(const PlacementGroupID &placement_group_id) { @@ -2472,9 +2599,8 @@ Status CoreWorker::RemovePlacementGroup(const PlacementGroupID &placement_group_ << ". 
It is probably " "because GCS server is dead or there's a high load there."; return Status::TimedOut(stream.str()); - } else { - return status; } + return status; } Status CoreWorker::WaitPlacementGroupReady(const PlacementGroupID &placement_group_id, @@ -2486,9 +2612,8 @@ Status CoreWorker::WaitPlacementGroupReady(const PlacementGroupID &placement_gro stream << "There was timeout in waiting for placement group " << placement_group_id << " creation."; return Status::TimedOut(stream.str()); - } else { - return status; } + return status; } Status CoreWorker::SubmitActorTask( @@ -2741,7 +2866,7 @@ std::optional CoreWorker::GetLocalActorState( ActorID CoreWorker::DeserializeAndRegisterActorHandle(const std::string &serialized, const ObjectID &outer_object_id, bool add_local_ref) { - std::unique_ptr actor_handle(new ActorHandle(serialized)); + auto actor_handle = std::make_unique(serialized); return actor_manager_->RegisterActorHandle(std::move(actor_handle), outer_object_id, CurrentCallSite(), @@ -2796,20 +2921,16 @@ CoreWorker::ListNamedActors(bool all_namespaces) { stream << "There was timeout in getting the list of named actors, " "probably because the GCS server is dead or under high load ."; return std::make_pair(std::move(actors), Status::TimedOut(stream.str())); - } else if (!status.ok()) { - return std::make_pair(std::move(actors), status); - } else { - return std::make_pair(std::move(actors), status); } + return std::make_pair(std::move(actors), std::move(status)); } std::pair, Status> CoreWorker::GetNamedActorHandleLocalMode(const std::string &name) { auto it = local_mode_named_actor_registry_.find(name); if (it == local_mode_named_actor_registry_.end()) { - std::ostringstream stream; - stream << "Failed to look up actor with name '" << name; - return std::make_pair(nullptr, Status::NotFound(stream.str())); + std::string err_msg = absl::StrFormat("Failed to look up actor with name %s", name); + return std::make_pair(nullptr, Status::NotFound(std::move(err_msg))); } return std::make_pair(GetActorHandle(it->second), Status::OK()); @@ -2818,12 +2939,13 @@ CoreWorker::GetNamedActorHandleLocalMode(const std::string &name) { std::pair>, Status> CoreWorker::ListNamedActorsLocalMode() { std::vector> actors; + actors.reserve(local_mode_named_actor_registry_.size()); for (auto it = local_mode_named_actor_registry_.begin(); it != local_mode_named_actor_registry_.end(); it++) { - actors.push_back(std::make_pair(/*namespace=*/"", it->first)); + actors.emplace_back(/*namespace=*/"", it->first); } - return std::make_pair(actors, Status::OK()); + return std::make_pair(std::move(actors), Status::OK()); } const std::string CoreWorker::GetActorName() const { @@ -2853,7 +2975,8 @@ void CoreWorker::RunTaskExecutionLoop() { Exit(rpc::WorkerExitType::INTENDED_USER_EXIT, absl::StrCat("Worker exits by a signal. ", status.message()), nullptr); - } else if (status.IsUnexpectedSystemExit()) { + } + if (status.IsUnexpectedSystemExit()) { Exit( rpc::WorkerExitType::SYSTEM_ERROR, absl::StrCat("Worker exits unexpectedly by a signal. 
", status.message()), @@ -3009,14 +3132,14 @@ Status CoreWorker::ExecuteTask( Status status; TaskType task_type = TaskType::NORMAL_TASK; if (task_spec.IsActorCreationTask()) { - RAY_CHECK(return_objects->size() > 0); + RAY_CHECK_GT(return_objects->size(), static_cast(0)); return_objects->pop_back(); task_type = TaskType::ACTOR_CREATION_TASK; SetActorId(task_spec.ActorCreationId()); task_counter_.BecomeActor(task_spec.FunctionDescriptor()->ClassName()); { - std::unique_ptr self_actor_handle( - new ActorHandle(task_spec.GetSerializedActorHandle())); + auto self_actor_handle = + std::make_unique(task_spec.GetSerializedActorHandle()); // Register the handle to the current actor itself. actor_manager_->RegisterActorHandle(std::move(self_actor_handle), ObjectID::Nil(), @@ -3058,9 +3181,9 @@ Status CoreWorker::ExecuteTask( defined_concurrency_groups, name_of_concurrency_group_to_execute, /*is_reattempt=*/task_spec.AttemptNumber() > 0, - /*is_streaming_generator*/ task_spec.IsStreamingGenerator(), - /*retry_exception*/ task_spec.ShouldRetryExceptions(), - /*generator_backpressure_num_objects*/ + /*is_streaming_generator=*/task_spec.IsStreamingGenerator(), + /*retry_exception=*/task_spec.ShouldRetryExceptions(), + /*generator_backpressure_num_objects=*/ task_spec.GeneratorBackpressureNumObjects()); // Get the reference counts for any IDs that we borrowed during this task, @@ -3073,7 +3196,7 @@ Status CoreWorker::ExecuteTask( if (!borrowed_ids.empty()) { reference_counter_->PopAndClearLocalBorrowers(borrowed_ids, borrowed_refs, &deleted); } - if (dynamic_return_objects != NULL) { + if (dynamic_return_objects != nullptr) { for (const auto &dynamic_return : *dynamic_return_objects) { reference_counter_->PopAndClearLocalBorrowers( {dynamic_return.first}, borrowed_refs, &deleted); @@ -3092,7 +3215,7 @@ Status CoreWorker::ExecuteTask( } if (!options_.is_local_mode) { - SetCurrentTaskId(TaskID::Nil(), /*attempt_number=*/0, ""); + SetCurrentTaskId(TaskID::Nil(), /*attempt_number=*/0, /*task_name=*/""); worker_context_.ResetCurrentTask(); } { @@ -3737,7 +3860,7 @@ void CoreWorker::ProcessSubscribeForObjectEviction( // Returns true if the object was present and the callback was added. It might have // already been evicted by the time we get this request, in which case we should // respond immediately so the raylet unpins the object. - if (!reference_counter_->AddObjectPrimaryCopyDeleteCallback(object_id, unpin_object)) { + if (!reference_counter_->AddObjectOutOfScopeOrFreedCallback(object_id, unpin_object)) { // If the object is already evicted (callback cannot be set), unregister the // subscription & publish the message so that the subscriber knows it. 
unpin_object(object_id); @@ -3939,7 +4062,7 @@ void CoreWorker::ProcessSubscribeObjectLocations( std::unordered_map CoreWorker::GetLocalOngoingLineageReconstructionTasks() const { - return task_manager_->GetOngoingLineageReconstructionTasks(); + return task_manager_->GetOngoingLineageReconstructionTasks(*actor_manager_); } Status CoreWorker::GetLocalObjectLocations( @@ -4731,11 +4854,11 @@ void CoreWorker::UpdateTaskIsDebuggerPaused(const TaskID &task_id, ClusterSizeBasedLeaseRequestRateLimiter::ClusterSizeBasedLeaseRequestRateLimiter( size_t min_concurrent_lease_limit) - : kMinConcurrentLeaseCap(min_concurrent_lease_limit), num_alive_nodes_(0) {} + : min_concurrent_lease_cap_(min_concurrent_lease_limit), num_alive_nodes_(0) {} size_t ClusterSizeBasedLeaseRequestRateLimiter:: GetMaxPendingLeaseRequestsPerSchedulingCategory() { - return std::max(kMinConcurrentLeaseCap, num_alive_nodes_.load()); + return std::max(min_concurrent_lease_cap_, num_alive_nodes_.load()); } void ClusterSizeBasedLeaseRequestRateLimiter::OnNodeChanges( diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 2d135a5983f8..4ab8a5cd0f3c 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -59,8 +59,7 @@ /// "RPC_SERVICE_HANDLER(CoreWorkerService, ExampleCall, 1)" /// 4) Add a method to the CoreWorker class below: "CoreWorker::HandleExampleCall" -namespace ray { -namespace core { +namespace ray::core { JobID GetProcessJobID(const CoreWorkerOptions &options); @@ -69,57 +68,10 @@ JobID GetProcessJobID(const CoreWorkerOptions &options); class TaskCounter { /// A task can only be one of the following state. Received state in particular /// covers from the point of RPC call to beginning execution. - enum TaskStatusType { kPending, kRunning, kFinished }; + enum class TaskStatusType { kPending, kRunning, kFinished }; public: - TaskCounter() { - counter_.SetOnChangeCallback( - [this](const std::tuple &key) - ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) mutable { - if (std::get<1>(key) != kRunning) { - return; - } - auto func_name = std::get<0>(key); - auto is_retry = std::get<2>(key); - int64_t running_total = counter_.Get(key); - int64_t num_in_get = running_in_get_counter_.Get({func_name, is_retry}); - int64_t num_in_wait = running_in_wait_counter_.Get({func_name, is_retry}); - auto is_retry_label = is_retry ? "1" : "0"; - // RUNNING_IN_RAY_GET/WAIT are sub-states of RUNNING, so we need to subtract - // them out to avoid double-counting. - ray::stats::STATS_tasks.Record( - running_total - num_in_get - num_in_wait, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - // Negate the metrics recorded from the submitter process for these tasks. - ray::stats::STATS_tasks.Record( - -running_total, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::SUBMITTED_TO_WORKER)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - // Record sub-state for get. - ray::stats::STATS_tasks.Record( - num_in_get, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_GET)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - // Record sub-state for wait. 
- ray::stats::STATS_tasks.Record( - num_in_wait, - {{"State", rpc::TaskStatus_Name(rpc::TaskStatus::RUNNING_IN_RAY_WAIT)}, - {"Name", func_name}, - {"IsRetry", is_retry_label}, - {"JobId", job_id_}, - {"Source", "executor"}}); - }); - } + TaskCounter(); void BecomeActor(const std::string &actor_name) { absl::MutexLock l(&mu_); @@ -133,95 +85,35 @@ class TaskCounter { bool IsActor() ABSL_EXCLUSIVE_LOCKS_REQUIRED(&mu_) { return actor_name_.size() > 0; } - void RecordMetrics() { - absl::MutexLock l(&mu_); - counter_.FlushOnChangeCallbacks(); - if (IsActor()) { - float running = 0.0; - float in_get = 0.0; - float in_wait = 0.0; - float idle = 0.0; - if (running_in_wait_counter_.Total() > 0) { - in_wait = 1.0; - } else if (running_in_get_counter_.Total() > 0) { - in_get = 1.0; - } else if (num_tasks_running_ > 0) { - running = 1.0; - } else { - idle = 1.0; - } - ray::stats::STATS_actors.Record(idle, - {{"State", "IDLE"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - ray::stats::STATS_actors.Record(running, - {{"State", "RUNNING_TASK"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - ray::stats::STATS_actors.Record(in_get, - {{"State", "RUNNING_IN_RAY_GET"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - ray::stats::STATS_actors.Record(in_wait, - {{"State", "RUNNING_IN_RAY_WAIT"}, - {"Name", actor_name_}, - {"Source", "executor"}, - {"JobId", job_id_}}); - } - } + void RecordMetrics(); void IncPending(const std::string &func_name, bool is_retry) { absl::MutexLock l(&mu_); - counter_.Increment({func_name, kPending, is_retry}); + counter_.Increment({func_name, TaskStatusType::kPending, is_retry}); } void MovePendingToRunning(const std::string &func_name, bool is_retry) { absl::MutexLock l(&mu_); - counter_.Swap({func_name, kPending, is_retry}, {func_name, kRunning, is_retry}); + counter_.Swap({func_name, TaskStatusType::kPending, is_retry}, + {func_name, TaskStatusType::kRunning, is_retry}); num_tasks_running_++; } void MoveRunningToFinished(const std::string &func_name, bool is_retry) { absl::MutexLock l(&mu_); - counter_.Swap({func_name, kRunning, is_retry}, {func_name, kFinished, is_retry}); + counter_.Swap({func_name, TaskStatusType::kRunning, is_retry}, + {func_name, TaskStatusType::kFinished, is_retry}); num_tasks_running_--; - RAY_CHECK(num_tasks_running_ >= 0); + RAY_CHECK_GE(num_tasks_running_, 0); } void SetMetricStatus(const std::string &func_name, rpc::TaskStatus status, - bool is_retry) { - absl::MutexLock l(&mu_); - // Add a no-op increment to counter_ so that - // it will invoke a callback upon RecordMetrics. - counter_.Increment({func_name, TaskStatusType::kRunning, is_retry}, 0); - if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { - running_in_get_counter_.Increment({func_name, is_retry}); - } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { - running_in_wait_counter_.Increment({func_name, is_retry}); - } else { - RAY_CHECK(false) << "Unexpected status " << rpc::TaskStatus_Name(status); - } - } + bool is_retry); void UnsetMetricStatus(const std::string &func_name, rpc::TaskStatus status, - bool is_retry) { - absl::MutexLock l(&mu_); - // Add a no-op decrement to counter_ so that - // it will invoke a callback upon RecordMetrics. 
- counter_.Decrement({func_name, TaskStatusType::kRunning, is_retry}, 0); - if (status == rpc::TaskStatus::RUNNING_IN_RAY_GET) { - running_in_get_counter_.Decrement({func_name, is_retry}); - } else if (status == rpc::TaskStatus::RUNNING_IN_RAY_WAIT) { - running_in_wait_counter_.Decrement({func_name, is_retry}); - } else { - RAY_CHECK(false) << "Unexpected status " << rpc::TaskStatus_Name(status); - } - } + bool is_retry); private: mutable absl::Mutex mu_; @@ -274,7 +166,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// /// \param[in] options The various initialization options. /// \param[in] worker_id ID of this worker. - CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_id); + CoreWorker(CoreWorkerOptions options, const WorkerID &worker_id); CoreWorker(CoreWorker const &) = delete; @@ -335,6 +227,19 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const TaskID &GetCurrentTaskId() const { return worker_context_.GetCurrentTaskID(); } + const std::string GetCurrentTaskName() const { + return worker_context_.GetCurrentTask() != nullptr + ? worker_context_.GetCurrentTask()->GetName() + : ""; + } + + const std::string GetCurrentTaskFunctionName() const { + return (worker_context_.GetCurrentTask() != nullptr && + worker_context_.GetCurrentTask()->FunctionDescriptor() != nullptr) + ? worker_context_.GetCurrentTask()->FunctionDescriptor()->CallSiteString() + : ""; + } + /// Controls the is debugger paused flag. /// /// \param task_id The task id of the task to update. @@ -402,11 +307,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { } bool GetCurrentTaskRetryExceptions() const { - if (!options_.is_local_mode) { - return worker_context_.GetCurrentTask()->ShouldRetryExceptions(); - } else { + if (options_.is_local_mode) { return false; } + return worker_context_.GetCurrentTask()->ShouldRetryExceptions(); } void SetWebuiDisplay(const std::string &key, const std::string &message); @@ -1100,9 +1004,7 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const ResourceMappingType GetResourceIDs() const; /// Create a profile event and push it to the TaskEventBuffer when the event is destructed.
- std::unique_ptr CreateProfileEvent( - - const std::string &event_name); + std::unique_ptr CreateProfileEvent(const std::string &event_name); int64_t GetNumTasksSubmitted() const { return normal_task_submitter_->GetNumTasksSubmitted(); @@ -1467,7 +1369,8 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { const std::string &concurrency_group_name = "", bool include_job_config = false, int64_t generator_backpressure_num_objects = -1, - bool enable_task_events = true); + bool enable_task_events = true, + const std::unordered_map &labels = {}); void SetCurrentTaskId(const TaskID &task_id, uint64_t attempt_number, const std::string &task_name); @@ -1970,8 +1873,7 @@ class ClusterSizeBasedLeaseRequestRateLimiter : public LeaseRequestRateLimiter { void OnNodeChanges(const rpc::GcsNodeInfo &data); private: - const size_t kMinConcurrentLeaseCap; + const size_t min_concurrent_lease_cap_; std::atomic num_alive_nodes_; }; -} // namespace core -} // namespace ray +} // namespace ray::core diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index eb5abe2ea387..6dd4c8bf6b7d 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -543,7 +543,7 @@ int64_t ReferenceCounter::ReleaseLineageReferences(ReferenceTable::iterator ref) RAY_LOG(DEBUG) << "Releasing lineage internal for argument " << argument_id; arg_it->second.lineage_ref_count--; if (arg_it->second.OutOfScope(lineage_pinning_enabled_)) { - DeleteObjectPrimaryCopy(arg_it); + OnObjectOutOfScopeOrFreed(arg_it); } if (arg_it->second.ShouldDelete(lineage_pinning_enabled_)) { RAY_CHECK(arg_it->second.on_ref_removed == nullptr); @@ -663,7 +663,7 @@ void ReferenceCounter::FreePlasmaObjects(const std::vector &object_ids } // Free only the plasma value. We must keep the reference around so that we // have the ownership information. - DeleteObjectPrimaryCopy(it); + OnObjectOutOfScopeOrFreed(it); } } @@ -700,8 +700,7 @@ void ReferenceCounter::DeleteReferenceInternal(ReferenceTable::iterator it, DeleteReferenceInternal(inner_it, deleted); } } - // Perform the deletion. 
- DeleteObjectPrimaryCopy(it); + OnObjectOutOfScopeOrFreed(it); if (deleted) { deleted->push_back(id); } @@ -764,20 +763,20 @@ int64_t ReferenceCounter::EvictLineage(int64_t min_bytes_to_evict) { return lineage_bytes_evicted; } -void ReferenceCounter::DeleteObjectPrimaryCopy(ReferenceTable::iterator it) { - RAY_LOG(DEBUG) << "Calling on_object_primary_copy_delete for object " << it->first - << " num callbacks: " - << it->second.on_object_primary_copy_delete_callbacks.size(); - for (const auto &callback : it->second.on_object_primary_copy_delete_callbacks) { +void ReferenceCounter::OnObjectOutOfScopeOrFreed(ReferenceTable::iterator it) { + RAY_LOG(DEBUG) << "Calling on_object_out_of_scope_or_freed_callbacks for object " + << it->first << " num callbacks: " + << it->second.on_object_out_of_scope_or_freed_callbacks.size(); + for (const auto &callback : it->second.on_object_out_of_scope_or_freed_callbacks) { callback(it->first); } - it->second.on_object_primary_copy_delete_callbacks.clear(); + it->second.on_object_out_of_scope_or_freed_callbacks.clear(); + UnsetObjectPrimaryCopy(it); +} + +void ReferenceCounter::UnsetObjectPrimaryCopy(ReferenceTable::iterator it) { it->second.pinned_at_raylet_id.reset(); if (it->second.spilled && !it->second.spilled_node_id.IsNil()) { - // The spilled copy of the object should get deleted during the - // on_object_primary_copy_delete callback, so reset the spill location metadata here. - // NOTE(swang): Spilled copies in cloud storage are not GCed, so we do not - // reset the spilled metadata. it->second.spilled = false; it->second.spilled_url = ""; it->second.spilled_node_id = NodeID::Nil(); @@ -795,7 +794,7 @@ bool ReferenceCounter::SetObjectRefDeletedCallback( return true; } -bool ReferenceCounter::AddObjectPrimaryCopyDeleteCallback( +bool ReferenceCounter::AddObjectOutOfScopeOrFreedCallback( const ObjectID &object_id, const std::function callback) { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); @@ -812,7 +811,7 @@ bool ReferenceCounter::AddObjectPrimaryCopyDeleteCallback( return false; } - it->second.on_object_primary_copy_delete_callbacks.emplace_back(callback); + it->second.on_object_out_of_scope_or_freed_callbacks.emplace_back(callback); return true; } @@ -822,7 +821,7 @@ void ReferenceCounter::ResetObjectsOnRemovedNode(const NodeID &raylet_id) { const auto &object_id = it->first; if (it->second.pinned_at_raylet_id.value_or(NodeID::Nil()) == raylet_id || it->second.spilled_node_id == raylet_id) { - DeleteObjectPrimaryCopy(it); + UnsetObjectPrimaryCopy(it); if (!it->second.OutOfScope(lineage_pinning_enabled_)) { objects_to_recover_.push_back(object_id); } @@ -862,7 +861,7 @@ void ReferenceCounter::UpdateObjectPinnedAtRaylet(const ObjectID &object_id, if (check_node_alive_(raylet_id)) { it->second.pinned_at_raylet_id = raylet_id; } else { - DeleteObjectPrimaryCopy(it); + UnsetObjectPrimaryCopy(it); objects_to_recover_.push_back(object_id); } } @@ -1429,7 +1428,7 @@ bool ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id, } else { RAY_LOG(DEBUG).WithField(spilled_node_id).WithField(object_id) << "Object spilled to dead node "; - DeleteObjectPrimaryCopy(it); + UnsetObjectPrimaryCopy(it); objects_to_recover_.push_back(object_id); } return true; diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index 4ef6e14a0016..5eb228301c34 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -49,7 +49,7 @@ class ReferenceCounterInterface { bool 
is_reconstructable, bool add_local_ref, const absl::optional &pinned_at_raylet_id = absl::optional()) = 0; - virtual bool AddObjectPrimaryCopyDeleteCallback( + virtual bool AddObjectOutOfScopeOrFreedCallback( const ObjectID &object_id, const std::function callback) = 0; virtual bool SetObjectRefDeletedCallback( @@ -320,7 +320,7 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Adds the callback that will be run when the object goes out of scope /// (Reference.OutOfScope() returns true). /// Returns true if the object was in scope and the callback was added, else false. - bool AddObjectPrimaryCopyDeleteCallback( + bool AddObjectOutOfScopeOrFreedCallback( const ObjectID &object_id, const std::function callback) ABSL_LOCKS_EXCLUDED(mutex_); @@ -783,13 +783,13 @@ class ReferenceCounter : public ReferenceCounterInterface, /// Metadata related to borrowing. std::unique_ptr borrow_info; - /// Callback that will be called when this Object's primary copy - /// should be deleted: out of scope or internal_api.free + /// Callback that will be called when this object + /// is out of scope or manually freed. /// Note: when an object is out of scope, it can still /// have lineage ref count and on_object_ref_delete /// will be called when lineage ref count is also 0. std::vector> - on_object_primary_copy_delete_callbacks; + on_object_out_of_scope_or_freed_callbacks; /// Callback that will be called when the object ref is deleted /// from the reference table (all refs including lineage ref count go to 0). std::function on_object_ref_delete; @@ -847,9 +847,12 @@ class ReferenceCounter : public ReferenceCounterInterface, rpc::Address *owner_address = nullptr) const ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - /// Delete the object primary copy, if any. Also unsets the raylet address - /// that the object was pinned at, if the address was set. - void DeleteObjectPrimaryCopy(ReferenceTable::iterator it); + /// Unsets the raylet address + /// that the object was pinned at or spilled at, if the address was set. + void UnsetObjectPrimaryCopy(ReferenceTable::iterator it); + + /// This should be called whenever the object is out of scope or manually freed. + void OnObjectOutOfScopeOrFreed(ReferenceTable::iterator it); /// Shutdown if all references have gone out of scope and shutdown /// is scheduled. 
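The reference counter changes above split the old DeleteObjectPrimaryCopy into two paths: OnObjectOutOfScopeOrFreed runs the registered out-of-scope-or-freed callbacks exactly once and then clears the primary-copy location, while UnsetObjectPrimaryCopy only clears the location. Node-failure paths (ResetObjectsOnRemovedNode, spill to a dead node) now call the latter, so a lost-but-recoverable object no longer fires out-of-scope callbacks. A simplified sketch of the split, using toy types rather than the real ReferenceCounter:

#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Toy stand-in for a ReferenceCounter entry (illustrative, not Ray's struct).
struct Reference {
  std::vector<std::function<void()>> out_of_scope_or_freed_callbacks;
  std::optional<std::string> pinned_at;  // raylet holding the primary copy
};

// Only forget where the primary copy was; keep the callbacks for a later
// out-of-scope event (what the node-removal paths call).
void UnsetPrimaryCopy(Reference &ref) { ref.pinned_at.reset(); }

// Fire the callbacks once, then clear the location metadata (what the
// out-of-scope and free paths call).
void OnOutOfScopeOrFreed(Reference &ref) {
  for (auto &cb : ref.out_of_scope_or_freed_callbacks) cb();
  ref.out_of_scope_or_freed_callbacks.clear();
  UnsetPrimaryCopy(ref);
}

int main() {
  Reference ref;
  ref.pinned_at = "raylet-1";
  ref.out_of_scope_or_freed_callbacks.push_back(
      [] { std::cout << "unpin object / notify GCS\n"; });

  UnsetPrimaryCopy(ref);     // node died: location dropped, callbacks kept
  OnOutOfScopeOrFreed(ref);  // ref count hit zero: callbacks fire once
  return 0;
}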
diff --git a/src/ray/core_worker/task_manager.cc b/src/ray/core_worker/task_manager.cc index bc5a78c7862e..cb175bfd2ebb 100644 --- a/src/ray/core_worker/task_manager.cc +++ b/src/ray/core_worker/task_manager.cc @@ -16,6 +16,7 @@ #include "ray/common/buffer.h" #include "ray/common/common_protocol.h" +#include "ray/core_worker/actor_manager.h" #include "ray/gcs/pb_util.h" #include "ray/util/exponential_backoff.h" #include "ray/util/util.h" @@ -1475,7 +1476,8 @@ void TaskManager::SetTaskStatus( } std::unordered_map -TaskManager::GetOngoingLineageReconstructionTasks() const { +TaskManager::GetOngoingLineageReconstructionTasks( + const ActorManager &actor_manager) const { absl::MutexLock lock(&mu_); std::unordered_map result; for (const auto &task_it : submissible_tasks_) { @@ -1491,9 +1493,16 @@ TaskManager::GetOngoingLineageReconstructionTasks() const { rpc::LineageReconstructionTask task; task.set_name(task_entry.spec.GetName()); - auto resources = task_entry.spec.GetRequiredResources().GetResourceUnorderedMap(); - task.mutable_resources()->insert(resources.begin(), resources.end()); task.set_status(task_entry.GetStatus()); + if (task_entry.spec.IsNormalTask()) { + task.mutable_labels()->insert(task_entry.spec.GetMessage().labels().begin(), + task_entry.spec.GetMessage().labels().end()); + } else if (task_entry.spec.IsActorTask()) { + auto actor_handle = actor_manager.GetActorHandle(task_entry.spec.ActorId()); + RAY_CHECK(actor_handle) << "Actor task must be submitted via actor handle"; + const auto &labels = actor_handle->GetLabels(); + task.mutable_labels()->insert(labels.begin(), labels.end()); + } if (result.find(task) != result.end()) { result[task] += 1; diff --git a/src/ray/core_worker/task_manager.h b/src/ray/core_worker/task_manager.h index 196e18beb277..49188e9a630f 100644 --- a/src/ray/core_worker/task_manager.h +++ b/src/ray/core_worker/task_manager.h @@ -30,6 +30,8 @@ namespace ray { namespace core { +class ActorManager; + class TaskFinisherInterface { public: virtual void CompletePendingTask(const TaskID &task_id, @@ -603,7 +605,7 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa /// Key is the lineage reconstruction task info. /// Value is the number of ongoing lineage reconstruction tasks of this type. std::unordered_map - GetOngoingLineageReconstructionTasks() const; + GetOngoingLineageReconstructionTasks(const ActorManager &actor_manager) const; /// Returns the generator ID that contains the dynamically allocated /// ObjectRefs, if the task is dynamic. Else, returns Nil. diff --git a/src/ray/core_worker/test/actor_manager_test.cc b/src/ray/core_worker/test/actor_manager_test.cc index b5c938c6c0ce..8f68f28cd845 100644 --- a/src/ray/core_worker/test/actor_manager_test.cc +++ b/src/ray/core_worker/test/actor_manager_test.cc @@ -168,7 +168,7 @@ class ActorManagerTest : public ::testing::Test { ray_namespace, -1, false); - EXPECT_CALL(*reference_counter_, AddObjectPrimaryCopyDeleteCallback(_, _)) + EXPECT_CALL(*reference_counter_, AddObjectOutOfScopeOrFreedCallback(_, _)) .WillRepeatedly(testing::Return(true)); actor_manager_->AddNewActorHandle(std::move(actor_handle), call_site, @@ -207,7 +207,7 @@ TEST_F(ActorManagerTest, TestAddAndGetActorHandleEndToEnd) { "", -1, false); - EXPECT_CALL(*reference_counter_, AddObjectPrimaryCopyDeleteCallback(_, _)) + EXPECT_CALL(*reference_counter_, AddObjectOutOfScopeOrFreedCallback(_, _)) .WillRepeatedly(testing::Return(true)); // Add an actor handle. 
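The expectations above stub the renamed AddObjectOutOfScopeOrFreedCallback to return true. A self-contained gmock sketch of the same pattern (toy ObjectID and interface, not Ray's headers), extended to capture the registered callback so a test could fire it by hand:

#include <functional>
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ObjectID = std::string;  // stand-in for ray::ObjectID

class ReferenceCounterInterface {
 public:
  virtual ~ReferenceCounterInterface() = default;
  virtual bool AddObjectOutOfScopeOrFreedCallback(
      const ObjectID &object_id,
      const std::function<void(const ObjectID &)> &callback) = 0;
};

class MockReferenceCounter : public ReferenceCounterInterface {
 public:
  MOCK_METHOD(bool,
              AddObjectOutOfScopeOrFreedCallback,
              (const ObjectID &, const std::function<void(const ObjectID &)> &),
              (override));
};

// Link against gtest_main; no explicit main() needed.
TEST(OutOfScopeOrFreedCallback, CanBeCapturedAndFired) {
  MockReferenceCounter rc;
  std::function<void(const ObjectID &)> captured;
  EXPECT_CALL(rc, AddObjectOutOfScopeOrFreedCallback(testing::_, testing::_))
      .WillOnce([&captured](const ObjectID &, const auto &cb) {
        captured = cb;  // keep the callback so the test can invoke it
        return true;
      });

  bool fired = false;
  rc.AddObjectOutOfScopeOrFreedCallback(
      "obj-1", [&fired](const ObjectID &) { fired = true; });
  captured("obj-1");  // simulate the object going out of scope
  EXPECT_TRUE(fired);
}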
@@ -284,7 +284,7 @@ TEST_F(ActorManagerTest, RegisterActorHandles) { "", -1, false); - EXPECT_CALL(*reference_counter_, AddObjectPrimaryCopyDeleteCallback(_, _)) + EXPECT_CALL(*reference_counter_, AddObjectOutOfScopeOrFreedCallback(_, _)) .WillRepeatedly(testing::Return(true)); ObjectID outer_object_id = ObjectID::Nil(); diff --git a/src/ray/core_worker/test/reference_count_test.cc b/src/ray/core_worker/test/reference_count_test.cc index 4351692284e2..67423a3ed75a 100644 --- a/src/ray/core_worker/test/reference_count_test.cc +++ b/src/ray/core_worker/test/reference_count_test.cc @@ -572,9 +572,9 @@ TEST_F(ReferenceCountTest, TestUnreconstructableObjectOutOfScope) { // The object goes out of scope once it has no more refs. std::vector out; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_FALSE(*out_of_scope); rc->RemoveLocalReference(id, &out); ASSERT_TRUE(*out_of_scope); @@ -582,9 +582,9 @@ TEST_F(ReferenceCountTest, TestUnreconstructableObjectOutOfScope) { // Unreconstructable objects go out of scope even if they have a nonzero // lineage ref count. *out_of_scope = false; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/false); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateSubmittedTaskReferences({}, {id}); ASSERT_FALSE(*out_of_scope); rc->UpdateFinishedTaskReferences({}, {id}, false, empty_borrower, empty_refs, &out); @@ -2437,9 +2437,9 @@ TEST_F(ReferenceCountLineageEnabledTest, TestUnreconstructableObjectOutOfScope) // The object goes out of scope once it has no more refs. std::vector out; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_FALSE(*out_of_scope); ASSERT_FALSE(*out_of_scope); rc->RemoveLocalReference(id, &out); @@ -2450,9 +2450,9 @@ TEST_F(ReferenceCountLineageEnabledTest, TestUnreconstructableObjectOutOfScope) // Unreconstructable objects stay in scope if they have a nonzero lineage ref // count. *out_of_scope = false; - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->AddOwnedObject(id, {}, address, "", 0, false, /*add_local_ref=*/false); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateSubmittedTaskReferences({return_id}, {id}); ASSERT_TRUE(rc->IsObjectPendingCreation(return_id)); ASSERT_FALSE(*out_of_scope); @@ -2541,7 +2541,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPinLineageRecursive) { rc->UpdateFinishedTaskReferences({}, {id}, false, empty_borrower, empty_refs, &out); // We should fail to set the deletion callback because the object has // already gone out of scope. 
- ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback( + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback( id, [&](const ObjectID &object_id) { ASSERT_FALSE(true); })); ASSERT_EQ(out.size(), 1); @@ -2658,7 +2658,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPlasmaLocation) { ObjectID id = ObjectID::FromRandom(); NodeID node_id = NodeID::FromRandom(); rc->AddOwnedObject(id, {}, rpc::Address(), "", 0, true, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_TRUE(rc->IsPlasmaObjectPinnedOrSpilled(id, &owned_by_us, &pinned_at, &spilled)); ASSERT_TRUE(owned_by_us); ASSERT_TRUE(pinned_at.IsNil()); @@ -2674,7 +2674,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPlasmaLocation) { deleted->clear(); rc->AddOwnedObject(id, {}, rpc::Address(), "", 0, true, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateObjectPinnedAtRaylet(id, node_id); rc->ResetObjectsOnRemovedNode(node_id); auto objects = rc->FlushObjectsToRecover(); @@ -2683,7 +2683,7 @@ TEST_F(ReferenceCountLineageEnabledTest, TestPlasmaLocation) { ASSERT_TRUE(rc->IsPlasmaObjectPinnedOrSpilled(id, &owned_by_us, &pinned_at, &spilled)); ASSERT_TRUE(owned_by_us); ASSERT_TRUE(pinned_at.IsNil()); - ASSERT_TRUE(deleted->count(id) > 0); + ASSERT_TRUE(deleted->empty()); deleted->clear(); } @@ -2699,7 +2699,7 @@ TEST_F(ReferenceCountTest, TestFree) { ASSERT_FALSE(rc->IsPlasmaObjectFreed(id)); rc->FreePlasmaObjects({id}); ASSERT_TRUE(rc->IsPlasmaObjectFreed(id)); - ASSERT_FALSE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_FALSE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); ASSERT_EQ(deleted->count(id), 0); rc->UpdateObjectPinnedAtRaylet(id, node_id); bool owned_by_us; @@ -2714,7 +2714,7 @@ TEST_F(ReferenceCountTest, TestFree) { // Test free after receiving information about where the object is pinned. 
rc->AddOwnedObject(id, {}, rpc::Address(), "", 0, true, /*add_local_ref=*/true); - ASSERT_TRUE(rc->AddObjectPrimaryCopyDeleteCallback(id, callback)); + ASSERT_TRUE(rc->AddObjectOutOfScopeOrFreedCallback(id, callback)); rc->UpdateObjectPinnedAtRaylet(id, node_id); ASSERT_FALSE(rc->IsPlasmaObjectFreed(id)); rc->FreePlasmaObjects({id}); diff --git a/src/ray/core_worker/transport/actor_task_submitter.cc b/src/ray/core_worker/transport/actor_task_submitter.cc index c54b9deb16ec..af2600e0a6b8 100644 --- a/src/ray/core_worker/transport/actor_task_submitter.cc +++ b/src/ray/core_worker/transport/actor_task_submitter.cc @@ -44,7 +44,7 @@ void ActorTaskSubmitter::NotifyGCSWhenActorOutOfScope( })); }; - if (!reference_counter_->AddObjectPrimaryCopyDeleteCallback( + if (!reference_counter_->AddObjectOutOfScopeOrFreedCallback( actor_creation_return_id, [actor_out_of_scope_callback](const ObjectID &object_id) { actor_out_of_scope_callback(object_id); diff --git a/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc b/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc index 99a92034aed2..2e003ec39977 100644 --- a/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc +++ b/src/ray/gcs/gcs_client/test/gcs_client_reconnection_test.cc @@ -14,6 +14,7 @@ #include #include +#include #include "absl/strings/substitute.h" #include "gtest/gtest.h" @@ -198,7 +199,7 @@ TEST_F(GcsClientReconnectionTest, ReconnectionBasic) { ASSERT_TRUE(status.ok()) << status.ToString(); p1.set_value(*p); })); - ASSERT_EQ(f1.wait_for(1s), std::future_status::timeout); + ASSERT_EQ(std::future_status::timeout, f1.wait_for(1s)); // Make sure io context is not blocked std::promise p2; @@ -210,7 +211,7 @@ TEST_F(GcsClientReconnectionTest, ReconnectionBasic) { StartGCS(); // Make sure the request is executed - ASSERT_EQ(f1.get(), "B"); + ASSERT_EQ("B", f1.get()); } TEST_F(GcsClientReconnectionTest, ReconnectionBackoff) { @@ -241,8 +242,14 @@ TEST_F(GcsClientReconnectionTest, ReconnectionBackoff) { ShutdownGCS(); + std::promise p2; + auto f2 = p2.get_future(); RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( - "", "A", "B", false, gcs::GetGcsTimeoutMs(), [](auto, auto) {})); + "", "A", "B", false, gcs::GetGcsTimeoutMs(), [&p2](auto status, auto) { + ASSERT_TRUE(status.ok()) << status.ToString(); + p2.set_value(); + })); + ASSERT_EQ(std::future_status::timeout, f2.wait_for(1s)); ASSERT_TRUE(WaitUntil( [channel]() { @@ -305,27 +312,41 @@ TEST_F(GcsClientReconnectionTest, QueueingAndBlocking) { ShutdownGCS(); // Send one request which should fail - RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( - "", "A", "B", false, gcs::GetGcsTimeoutMs(), [](auto status, auto) {})); - - // Make sure it's not blocking std::promise p2; - client_io_service_->post([&p2]() { p2.set_value(); }, ""); auto f2 = p2.get_future(); - ASSERT_EQ(std::future_status::ready, f2.wait_for(1s)); - - // Send the second one and it should block the thread RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( - "", "A", "B", false, gcs::GetGcsTimeoutMs(), [](auto status, auto) {})); - std::this_thread::sleep_for(1s); + "", "A", "B", false, gcs::GetGcsTimeoutMs(), [&p2](auto status, auto) { + ASSERT_TRUE(status.ok()) << status.ToString(); + p2.set_value(); + })); + ASSERT_EQ(std::future_status::timeout, f2.wait_for(1s)); + + // Make sure it's not blocking std::promise p3; client_io_service_->post([&p3]() { p3.set_value(); }, ""); auto f3 = p3.get_future(); - ASSERT_EQ(std::future_status::timeout, f3.wait_for(1s)); + ASSERT_EQ(std::future_status::ready, 
f3.wait_for(1s)); + + // Send the second one and it should block the thread + std::promise p4; + auto f4 = p4.get_future(); + RAY_UNUSED(client->InternalKV().AsyncInternalKVPut( + "", "A", "B", false, gcs::GetGcsTimeoutMs(), [&p4](auto status, auto) { + ASSERT_TRUE(status.ok()) << status.ToString(); + p4.set_value(); + })); + ASSERT_EQ(std::future_status::timeout, f4.wait_for(1s)); + + std::promise p5; + client_io_service_->post([&p5]() { p5.set_value(); }, ""); + auto f5 = p5.get_future(); + ASSERT_EQ(std::future_status::timeout, f5.wait_for(1s)); // Resume GCS server and it should unblock StartGCS(); - ASSERT_EQ(std::future_status::ready, f3.wait_for(5s)); + ASSERT_EQ(std::future_status::ready, f5.wait_for(5s)); + ASSERT_EQ(std::future_status::ready, f2.wait_for(1s)); + ASSERT_EQ(std::future_status::ready, f4.wait_for(1s)); } TEST_F(GcsClientReconnectionTest, Timeout) { @@ -335,7 +356,7 @@ TEST_F(GcsClientReconnectionTest, Timeout) { "gcs_rpc_server_reconnect_timeout_s": 60, "gcs_storage": "redis", "gcs_grpc_max_request_queued_max_bytes": 10, - "gcs_server_request_timeout_seconds": 3 + "gcs_server_request_timeout_seconds": 10 } )"); StartGCS(); @@ -346,11 +367,11 @@ TEST_F(GcsClientReconnectionTest, Timeout) { ASSERT_TRUE(added); ShutdownGCS(); - std::vector values; ASSERT_TRUE( client->InternalKV().Keys("", "A", gcs::GetGcsTimeoutMs(), values).IsTimedOut()); ASSERT_TRUE(values.empty()); + StartGCS(); ASSERT_TRUE(client->InternalKV().Keys("", "A", gcs::GetGcsTimeoutMs(), values).ok()); ASSERT_EQ(std::vector{"A"}, values); diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index aa609bacb445..8bf3f3d484f1 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -749,11 +749,8 @@ Status GcsActorManager::RegisterActor(const ray::rpc::RegisterActorRequest &requ "explicitly connect to this namespace with ray.init(namespace=\"" << actor->GetRayNamespace() << "\", ...)"; - auto error_data_ptr = - gcs::CreateErrorTableData("detached_actor_anonymous_namespace", - stream.str(), - absl::GetCurrentTimeNanos(), - job_id); + auto error_data_ptr = gcs::CreateErrorTableData( + "detached_actor_anonymous_namespace", stream.str(), absl::Now(), job_id); RAY_LOG(WARNING) << error_data_ptr->SerializeAsString(); RAY_CHECK_OK( diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index dadb90498379..afdf55be80d6 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -321,7 +321,7 @@ class GcsActorManager : public rpc::ActorInfoHandler { std::function destroy_owned_placement_group_if_needed, const rpc::CoreWorkerClientFactoryFn &worker_client_factory = nullptr); - ~GcsActorManager() = default; + ~GcsActorManager() override = default; void HandleRegisterActor(rpc::RegisterActorRequest request, rpc::RegisterActorReply *reply, diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc index f562c0f9034e..5daa1a992257 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_scheduler.cc @@ -27,17 +27,17 @@ GcsActorScheduler::GcsActorScheduler( instrumented_io_context &io_context, GcsActorTable &gcs_actor_table, const GcsNodeManager &gcs_node_manager, - std::shared_ptr cluster_task_manager, + ClusterTaskManager &cluster_task_manager, GcsActorSchedulerFailureCallback schedule_failure_handler, GcsActorSchedulerSuccessCallback 
schedule_success_handler, - std::shared_ptr raylet_client_pool, + rpc::NodeManagerClientPool &raylet_client_pool, rpc::CoreWorkerClientFactoryFn client_factory, std::function normal_task_resources_changed_callback) : io_context_(io_context), gcs_actor_table_(gcs_actor_table), gcs_node_manager_(gcs_node_manager), - cluster_task_manager_(std::move(cluster_task_manager)), + cluster_task_manager_(cluster_task_manager), schedule_failure_handler_(std::move(schedule_failure_handler)), schedule_success_handler_(std::move(schedule_success_handler)), raylet_client_pool_(raylet_client_pool), @@ -97,11 +97,11 @@ void GcsActorScheduler::ScheduleByGcs(std::shared_ptr actor) { const auto &owner_node = gcs_node_manager_.GetAliveNode(actor->GetOwnerNodeID()); RayTask task(actor->GetCreationTaskSpecification(), owner_node.has_value() ? actor->GetOwnerNodeID().Binary() : std::string()); - cluster_task_manager_->QueueAndScheduleTask(task, - /*grant_or_reject*/ false, - /*is_selected_based_on_locality*/ false, - /*reply*/ reply.get(), - send_reply_callback); + cluster_task_manager_.QueueAndScheduleTask(task, + /*grant_or_reject*/ false, + /*is_selected_based_on_locality*/ false, + /*reply*/ reply.get(), + send_reply_callback); } void GcsActorScheduler::ScheduleByRaylet(std::shared_ptr actor) { @@ -218,7 +218,7 @@ std::vector GcsActorScheduler::CancelOnNode(const NodeID &node_id) { } } - raylet_client_pool_->Disconnect(node_id); + raylet_client_pool_.Disconnect(node_id); return actor_ids; } @@ -531,7 +531,7 @@ void GcsActorScheduler::DoRetryCreatingActorOnWorker( std::shared_ptr GcsActorScheduler::GetOrConnectLeaseClient( const rpc::Address &raylet_address) { - return raylet_client_pool_->GetOrConnectByAddress(raylet_address); + return raylet_client_pool_.GetOrConnectByAddress(raylet_address); } bool GcsActorScheduler::KillActorOnWorker(const rpc::Address &worker_address, @@ -664,13 +664,13 @@ void GcsActorScheduler::HandleWorkerLeaseRejectedReply( void GcsActorScheduler::OnActorDestruction(std::shared_ptr actor) { if (!actor->GetAcquiredResources().IsEmpty()) { ReturnActorAcquiredResources(actor); - cluster_task_manager_->ScheduleAndDispatchTasks(); + cluster_task_manager_.ScheduleAndDispatchTasks(); } } void GcsActorScheduler::ReturnActorAcquiredResources(std::shared_ptr actor) { auto &cluster_resource_manager = - cluster_task_manager_->GetClusterResourceScheduler()->GetClusterResourceManager(); + cluster_task_manager_.GetClusterResourceScheduler()->GetClusterResourceManager(); cluster_resource_manager.AddNodeAvailableResources( scheduling::NodeID(actor->GetNodeID().Binary()), actor->GetAcquiredResources().GetResourceSet()); @@ -678,14 +678,13 @@ void GcsActorScheduler::ReturnActorAcquiredResources(std::shared_ptr a } size_t GcsActorScheduler::GetPendingActorsCount() const { - return cluster_task_manager_->GetInfeasibleQueueSize() + - cluster_task_manager_->GetPendingQueueSize(); + return cluster_task_manager_.GetInfeasibleQueueSize() + + cluster_task_manager_.GetPendingQueueSize(); } bool GcsActorScheduler::CancelInFlightActorScheduling( const std::shared_ptr &actor) { - return cluster_task_manager_->CancelTask( - actor->GetCreationTaskSpecification().TaskId()); + return cluster_task_manager_.CancelTask(actor->GetCreationTaskSpecification().TaskId()); } } // namespace gcs diff --git a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h index 1ea66d0ddbe0..048d1da8939c 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_scheduler.h +++ 
b/src/ray/gcs/gcs_server/gcs_actor_scheduler.h @@ -129,14 +129,14 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { instrumented_io_context &io_context, GcsActorTable &gcs_actor_table, const GcsNodeManager &gcs_node_manager, - std::shared_ptr cluster_task_manager_, + ClusterTaskManager &cluster_task_manager_, GcsActorSchedulerFailureCallback schedule_failure_handler, GcsActorSchedulerSuccessCallback schedule_success_handler, - std::shared_ptr raylet_client_pool, + rpc::NodeManagerClientPool &raylet_client_pool, rpc::CoreWorkerClientFactoryFn client_factory = nullptr, std::function normal_task_resources_changed_callback = nullptr); - virtual ~GcsActorScheduler() = default; + ~GcsActorScheduler() override = default; /// Schedule the specified actor. /// If there is no available nodes then the actor would be queued in the @@ -377,7 +377,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// Reference of GcsNodeManager. const GcsNodeManager &gcs_node_manager_; /// The cluster task manager. - std::shared_ptr cluster_task_manager_; + ClusterTaskManager &cluster_task_manager_; /// The handler to handle the scheduling failures. GcsActorSchedulerFailureCallback schedule_failure_handler_; /// The handler to handle the successful scheduling. @@ -385,7 +385,7 @@ class GcsActorScheduler : public GcsActorSchedulerInterface { /// The nodes which are releasing unused workers. absl::flat_hash_set nodes_of_releasing_unused_workers_; /// The cached raylet clients used to communicate with raylet. - std::shared_ptr raylet_client_pool_; + rpc::NodeManagerClientPool &raylet_client_pool_; /// The cached core worker clients which are used to communicate with leased worker. rpc::CoreWorkerClientPool core_worker_clients_; diff --git a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc index e4ef13371b53..c166dbbf6398 100644 --- a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc @@ -28,12 +28,12 @@ GcsAutoscalerStateManager::GcsAutoscalerStateManager( GcsNodeManager &gcs_node_manager, GcsActorManager &gcs_actor_manager, const GcsPlacementGroupManager &gcs_placement_group_manager, - std::shared_ptr raylet_client_pool) + rpc::NodeManagerClientPool &raylet_client_pool) : session_name_(session_name), gcs_node_manager_(gcs_node_manager), gcs_actor_manager_(gcs_actor_manager), gcs_placement_group_manager_(gcs_placement_group_manager), - raylet_client_pool_(std::move(raylet_client_pool)), + raylet_client_pool_(raylet_client_pool), last_cluster_resource_state_version_(0), last_seen_autoscaler_state_version_(0) {} @@ -396,7 +396,7 @@ void GcsAutoscalerStateManager::HandleDrainNode( raylet_address.set_ip_address(node->node_manager_address()); raylet_address.set_port(node->node_manager_port()); - const auto raylet_client = raylet_client_pool_->GetOrConnectByAddress(raylet_address); + const auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(raylet_address); raylet_client->DrainRaylet( request.reason(), request.reason_message(), diff --git a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h index c00d8d465202..c592a7a484d6 100644 --- a/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h +++ b/src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.h @@ -29,12 +29,11 @@ class GcsResourceManager; class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler { public: - 
GcsAutoscalerStateManager( - const std::string &session_name, - GcsNodeManager &gcs_node_manager, - GcsActorManager &gcs_actor_manager, - const GcsPlacementGroupManager &gcs_placement_group_manager, - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool); + GcsAutoscalerStateManager(const std::string &session_name, + GcsNodeManager &gcs_node_manager, + GcsActorManager &gcs_actor_manager, + const GcsPlacementGroupManager &gcs_placement_group_manager, + rpc::NodeManagerClientPool &raylet_client_pool); void HandleGetClusterResourceState( rpc::autoscaler::GetClusterResourceStateRequest request, @@ -152,7 +151,7 @@ class GcsAutoscalerStateManager : public rpc::autoscaler::AutoscalerStateHandler const GcsPlacementGroupManager &gcs_placement_group_manager_; /// Raylet client pool. - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool_; + rpc::NodeManagerClientPool &raylet_client_pool_; // The default value of the last seen version for the request is 0, which indicates // no version has been reported. So the first reported version should be 1. diff --git a/src/ray/gcs/gcs_server/gcs_health_check_manager.cc b/src/ray/gcs/gcs_server/gcs_health_check_manager.cc index 2cefb37f6b7c..d6e858482185 100644 --- a/src/ray/gcs/gcs_server/gcs_health_check_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_health_check_manager.cc @@ -14,15 +14,17 @@ #include "ray/gcs/gcs_server/gcs_health_check_manager.h" +#include + #include "ray/stats/metric.h" + DEFINE_stats(health_check_rpc_latency_ms, "Latency of rpc request for health check.", (), ({1, 10, 100, 1000, 10000}, ), ray::stats::HISTOGRAM); -namespace ray { -namespace gcs { +namespace ray::gcs { GcsHealthCheckManager::GcsHealthCheckManager( instrumented_io_context &io_service, @@ -38,17 +40,18 @@ GcsHealthCheckManager::GcsHealthCheckManager( period_ms_(period_ms), failure_threshold_(failure_threshold) { RAY_CHECK(on_node_death_callback != nullptr); - RAY_CHECK(initial_delay_ms >= 0); - RAY_CHECK(timeout_ms >= 0); - RAY_CHECK(period_ms >= 0); - RAY_CHECK(failure_threshold >= 0); + RAY_CHECK_GE(initial_delay_ms, 0); + RAY_CHECK_GE(timeout_ms, 0); + RAY_CHECK_GE(period_ms, 0); + RAY_CHECK_GE(failure_threshold, 0); } -GcsHealthCheckManager::~GcsHealthCheckManager() {} +GcsHealthCheckManager::~GcsHealthCheckManager() = default; void GcsHealthCheckManager::RemoveNode(const NodeID &node_id) { io_service_.dispatch( [this, node_id]() { + RAY_CHECK(thread_checker_.IsOnSameThread()); auto iter = health_check_contexts_.find(node_id); if (iter == health_check_contexts_.end()) { return; } @@ -61,6 +64,7 @@ void GcsHealthCheckManager::RemoveNode(const NodeID &node_id) { void GcsHealthCheckManager::FailNode(const NodeID &node_id) { RAY_LOG(WARNING).WithField(node_id) << "Node is dead because the health check failed."; + RAY_CHECK(thread_checker_.IsOnSameThread()); auto iter = health_check_contexts_.find(node_id); if (iter != health_check_contexts_.end()) { on_node_death_callback_(node_id); @@ -69,7 +73,9 @@ } std::vector<NodeID> GcsHealthCheckManager::GetAllNodes() const { + RAY_CHECK(thread_checker_.IsOnSameThread()); std::vector<NodeID> nodes; + nodes.reserve(health_check_contexts_.size()); for (const auto &[node_id, _] : health_check_contexts_) { nodes.emplace_back(node_id); } @@ -84,24 +90,26 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { new (&context_) grpc::ClientContext(); response_.Clear(); - auto deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(manager_->timeout_ms_); - context_.set_deadline(deadline); + const auto now = absl::Now(); + const auto
deadline = now + absl::Milliseconds(manager_->timeout_ms_); + context_.set_deadline(absl::ToChronoTime(deadline)); stub_->async()->Check( - &context_, &request_, &response_, [this, now = absl::Now()](::grpc::Status status) { + &context_, &request_, &response_, [this, start = now](::grpc::Status status) { // This callback is done in gRPC's thread pool. STATS_health_check_rpc_latency_ms.Record( - absl::ToInt64Milliseconds(absl::Now() - now)); + absl::ToInt64Milliseconds(absl::Now() - start)); manager_->io_service_.post( [this, status]() { if (stopped_) { delete this; return; } - RAY_LOG(DEBUG) << "Health check status: " << int(response_.status()); + RAY_LOG(DEBUG) << "Health check status: " + << HealthCheckResponse_ServingStatus_Name( + response_.status()); if (status.ok() && response_.status() == HealthCheckResponse::SERVING) { - // Health check passed + // Health check passed. health_check_remaining_ = manager_->failure_threshold_; } else { --health_check_remaining_; @@ -118,6 +126,9 @@ void GcsHealthCheckManager::HealthCheckContext::StartHealthCheck() { delete this; } else { // Do another health check. + // + // TODO(hjiang): We may be able to skip some health checks based on the known + // resource usage communication between GCS and raylet. timer_.expires_from_now( boost::posix_time::milliseconds(manager_->period_ms_)); timer_.async_wait([this](auto) { StartHealthCheck(); }); @@ -132,13 +143,13 @@ void GcsHealthCheckManager::HealthCheckContext::Stop() { stopped_ = true; } void GcsHealthCheckManager::AddNode(const NodeID &node_id, std::shared_ptr<grpc::Channel> channel) { io_service_.dispatch( - [this, channel, node_id]() { - RAY_CHECK(health_check_contexts_.count(node_id) == 0); + [this, channel = std::move(channel), node_id]() { + RAY_CHECK(thread_checker_.IsOnSameThread()); auto context = new HealthCheckContext(this, channel, node_id); - health_check_contexts_.emplace(std::make_pair(node_id, context)); + auto [_, is_new] = health_check_contexts_.emplace(node_id, context); + RAY_CHECK(is_new); }, "GcsHealthCheckManager::AddNode"); } -} // namespace gcs -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/gcs/gcs_server/gcs_health_check_manager.h b/src/ray/gcs/gcs_server/gcs_health_check_manager.h index d877a217d803..a6e36d82972a 100644 --- a/src/ray/gcs/gcs_server/gcs_health_check_manager.h +++ b/src/ray/gcs/gcs_server/gcs_health_check_manager.h @@ -16,16 +16,19 @@ #include +#include +#include +#include +#include + #include "absl/container/flat_hash_map.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/id.h" #include "ray/common/ray_config.h" +#include "ray/util/thread_checker.h" #include "src/proto/grpc/health/v1/health.grpc.pb.h" -class GcsHealthCheckManagerTest; - -namespace ray { -namespace gcs { +namespace ray::gcs { /// GcsHealthCheckManager is used to track the healthiness of the nodes in the ray /// cluster. The health check is done in pull based way, which means this module will send /// @@ -35,6 +38,9 @@ namespace gcs { /// node will be removed from GcsHealthCheckManager. The node can be added into this class /// later. Although the same node id is not supposed to be reused in ray cluster, this is /// not enforced in this class. +/// +/// All IO operations happen on the same thread, which is managed by the passed-in +/// [io_service]. /// TODO (iycheng): Move the GcsHealthCheckManager to ray/common. class GcsHealthCheckManager { public: @@ -58,24 +64,27 @@ class GcsHealthCheckManager { ~GcsHealthCheckManager();
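For the StartHealthCheck() hunk above: the deadline and the latency sample are now derived from a single absl::Now() reading, so the recorded RPC latency measures exactly the interval the deadline was armed against. A minimal sketch of the pattern, assuming stock Abseil and gRPC (the helper names are illustrative, not Ray APIs):

```cpp
#include <cstdint>
#include <grpcpp/client_context.h>
#include "absl/time/clock.h"
#include "absl/time/time.h"

// Arm the RPC deadline from one absl::Now() reading taken by the caller...
void ArmDeadline(grpc::ClientContext &context, absl::Time start, int64_t timeout_ms) {
  context.set_deadline(absl::ToChronoTime(start + absl::Milliseconds(timeout_ms)));
}

// ...and later, in the completion callback, measure latency from the same start.
int64_t LatencyMs(absl::Time start) {
  return absl::ToInt64Milliseconds(absl::Now() - start);
}
```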
/// Start to track the healthiness of a node. + /// Safe to call from non-io-context threads. /// /// \param node_id The id of the node. /// \param channel The gRPC channel to the node. void AddNode(const NodeID &node_id, std::shared_ptr<grpc::Channel> channel); /// Stop tracking the healthiness of a node. + /// Safe to call from non-io-context threads. /// /// \param node_id The id of the node to stop tracking. void RemoveNode(const NodeID &node_id); - /// Return all the nodes monitored. + /// Return all the nodes monitored and alive. + /// Note: must be invoked from the io-context thread. /// /// \return A list of node id which are being monitored by this class. std::vector<NodeID> GetAllNodes() const; private: /// Fail a node when health check failed. It'll stop the health checking and - /// call on_node_death_callback. + /// call `on_node_death_callback_`. /// /// \param node_id The id of the node. void FailNode(const NodeID &node_id); @@ -133,8 +142,12 @@ class GcsHealthCheckManager { std::function<void(const NodeID &)> on_node_death_callback_; /// The context of the health check for each nodes. + /// Only live nodes are tracked; failed nodes are removed. absl::flat_hash_map<NodeID, HealthCheckContext *> health_check_contexts_; + /// Checker to make sure there's no concurrent access for node addition and removal. + const ThreadChecker thread_checker_; + /// The delay for the first health check request. const int64_t initial_delay_ms_; /// Timeout for each health check request. @@ -145,5 +158,4 @@ class GcsHealthCheckManager { const int64_t failure_threshold_; }; -} // namespace gcs -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.cc b/src/ray/gcs/gcs_server/gcs_node_manager.cc index a232ecf10903..15aa488cb65f 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_node_manager.cc @@ -29,14 +29,13 @@ namespace ray { namespace gcs { ////////////////////////////////////////////////////////////////////////////////////////// -GcsNodeManager::GcsNodeManager( - std::shared_ptr<GcsPublisher> gcs_publisher, - std::shared_ptr<GcsTableStorage> gcs_table_storage, - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool, - const ClusterID &cluster_id) +GcsNodeManager::GcsNodeManager(std::shared_ptr<GcsPublisher> gcs_publisher, + std::shared_ptr<GcsTableStorage> gcs_table_storage, + rpc::NodeManagerClientPool *raylet_client_pool, + const ClusterID &cluster_id) : gcs_publisher_(std::move(gcs_publisher)), gcs_table_storage_(std::move(gcs_table_storage)), - raylet_client_pool_(std::move(raylet_client_pool)), + raylet_client_pool_(raylet_client_pool), cluster_id_(cluster_id) {} void GcsNodeManager::WriteNodeExportEvent(rpc::GcsNodeInfo node_info) const { @@ -393,8 +392,8 @@ std::shared_ptr<rpc::GcsNodeInfo> GcsNodeManager::RemoveNode( .WithField("ip", removed_node->node_manager_address()) << error_message.str(); RAY_LOG(WARNING) << error_message.str(); - auto error_data_ptr = - gcs::CreateErrorTableData(type, error_message.str(), current_time_ms()); + auto error_data_ptr = gcs::CreateErrorTableData( + type, error_message.str(), absl::FromUnixMillis(current_time_ms())); RAY_CHECK_OK(gcs_publisher_->PublishError(node_id.Hex(), *error_data_ptr, nullptr)); } diff --git a/src/ray/gcs/gcs_server/gcs_node_manager.h b/src/ray/gcs/gcs_server/gcs_node_manager.h index db258d4cb00c..b924fec264c9 100644 --- a/src/ray/gcs/gcs_server/gcs_node_manager.h +++ b/src/ray/gcs/gcs_server/gcs_node_manager.h @@ -50,7 +50,7 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// \param gcs_table_storage GCS table external storage accessor.
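The ThreadChecker member introduced above backs the single-thread invariant documented in the class comment. Ray's actual ray/util/thread_checker.h may differ in detail; this sketch assumes the common design where the first call binds the checker to the calling thread and later calls report whether they run on that thread, so call sites assert on the return value:

```cpp
#include <atomic>
#include <thread>

// Sketch of a thread-affinity checker (illustrative, not Ray's exact class).
// The first call binds the checker to the calling thread; subsequent calls
// return whether they execute on that same bound thread.
class ThreadCheckerSketch {
 public:
  bool IsOnSameThread() const {
    const std::thread::id current = std::this_thread::get_id();
    std::thread::id expected{};  // default-constructed id == "not bound yet"
    if (bound_.compare_exchange_strong(expected, current)) {
      return true;  // first caller binds the thread
    }
    return expected == current;  // later callers must match the bound thread
  }

 private:
  mutable std::atomic<std::thread::id> bound_{std::thread::id{}};
};
```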
explicit GcsNodeManager(std::shared_ptr<GcsPublisher> gcs_publisher, std::shared_ptr<GcsTableStorage> gcs_table_storage, - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool, + rpc::NodeManagerClientPool *raylet_client_pool, const ClusterID &cluster_id); /// Handle register rpc request come from raylet. @@ -248,7 +248,7 @@ class GcsNodeManager : public rpc::NodeInfoHandler { /// Storage for GCS tables. std::shared_ptr<GcsTableStorage> gcs_table_storage_; /// Raylet client pool. - std::shared_ptr<rpc::NodeManagerClientPool> raylet_client_pool_; + rpc::NodeManagerClientPool *raylet_client_pool_ = nullptr; /// Cluster ID to be shared with clients when connecting. const ClusterID cluster_id_; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc index c60bcd43cc45..1aec60e9603c 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc @@ -14,6 +14,8 @@ #include "ray/gcs/gcs_server/gcs_placement_group_manager.h" +#include + #include "ray/common/asio/asio_util.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_config.h" @@ -181,15 +183,15 @@ rpc::PlacementGroupStats *GcsPlacementGroup::GetMutableStats() { GcsPlacementGroupManager::GcsPlacementGroupManager( instrumented_io_context &io_context, - std::shared_ptr<GcsPlacementGroupSchedulerInterface> scheduler, + GcsPlacementGroupSchedulerInterface *scheduler, std::shared_ptr<GcsTableStorage> gcs_table_storage, GcsResourceManager &gcs_resource_manager, std::function<std::string(const JobID &)> get_ray_namespace) : io_context_(io_context), - gcs_placement_group_scheduler_(std::move(scheduler)), + gcs_placement_group_scheduler_(scheduler), gcs_table_storage_(std::move(gcs_table_storage)), gcs_resource_manager_(gcs_resource_manager), - get_ray_namespace_(get_ray_namespace) { + get_ray_namespace_(std::move(get_ray_namespace)) { placement_group_state_counter_.reset( new CounterMap()); placement_group_state_counter_->SetOnChangeCallback( diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index d90fdccf3a8a..a7d91388e264 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -237,7 +237,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { /// \param gcs_resource_manager Reference of GcsResourceManager. /// \param get_ray_namespace A callback to get the ray namespace. GcsPlacementGroupManager(instrumented_io_context &io_context, - std::shared_ptr<GcsPlacementGroupSchedulerInterface> scheduler, + GcsPlacementGroupSchedulerInterface *scheduler, std::shared_ptr<GcsTableStorage> gcs_table_storage, GcsResourceManager &gcs_resource_manager, std::function<std::string(const JobID &)> get_ray_namespace); @@ -480,8 +480,8 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { std::deque<std::shared_ptr<GcsPlacementGroup>> infeasible_placement_groups_; /// The scheduler to schedule all registered placement_groups. - std::shared_ptr<GcsPlacementGroupSchedulerInterface> - gcs_placement_group_scheduler_; + /// The scheduler's lifetime is owned by [GcsServer]. + gcs::GcsPlacementGroupSchedulerInterface *gcs_placement_group_scheduler_ = nullptr; /// Used to update placement group information upon creation, deletion, etc.
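The placement-group hunks above replace shared ownership of the scheduler with a non-owning raw pointer whose lifetime is pinned by GcsServer. A sketch of that ownership shape with illustrative names (not Ray's classes): the owner holds unique_ptr members, hands borrowers a raw pointer, and relies on declaration order for a safe destruction order:

```cpp
#include <memory>

struct Scheduler {};

class Manager {
 public:
  explicit Manager(Scheduler *scheduler) : scheduler_(scheduler) {}

 private:
  Scheduler *scheduler_ = nullptr;  // non-owning; the server below outlives us
};

class Server {
 private:
  // Members are destroyed in reverse declaration order, so manager_ (the
  // borrower) is torn down before scheduler_ (the owner of the real object).
  std::unique_ptr<Scheduler> scheduler_ = std::make_unique<Scheduler>();
  std::unique_ptr<Manager> manager_ = std::make_unique<Manager>(scheduler_.get());
};

int main() { Server server; }
```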
std::shared_ptr gcs_table_storage_; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc index 6bc2737c14a6..85a94f863598 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc @@ -26,13 +26,13 @@ GcsPlacementGroupScheduler::GcsPlacementGroupScheduler( std::shared_ptr gcs_table_storage, const gcs::GcsNodeManager &gcs_node_manager, ClusterResourceScheduler &cluster_resource_scheduler, - std::shared_ptr raylet_client_pool) + rpc::NodeManagerClientPool &raylet_client_pool) : io_context_(io_context), return_timer_(io_context), gcs_table_storage_(std::move(gcs_table_storage)), gcs_node_manager_(gcs_node_manager), cluster_resource_scheduler_(cluster_resource_scheduler), - raylet_client_pool_(std::move(raylet_client_pool)) {} + raylet_client_pool_(raylet_client_pool) {} void GcsPlacementGroupScheduler::ScheduleUnplacedBundles( const SchedulePgRequest &request) { @@ -279,7 +279,7 @@ void GcsPlacementGroupScheduler::CancelResourceReserve( std::shared_ptr GcsPlacementGroupScheduler::GetOrConnectLeaseClient(const rpc::Address &raylet_address) { - return raylet_client_pool_->GetOrConnectByAddress(raylet_address); + return raylet_client_pool_.GetOrConnectByAddress(raylet_address); } std::shared_ptr diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h index ec7ac53941bd..df16f025d082 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h @@ -290,12 +290,11 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { /// \param cluster_resource_scheduler The resource scheduler which is used when /// scheduling. /// \param lease_client_factory Factory to create remote lease client. - GcsPlacementGroupScheduler( - instrumented_io_context &io_context, - std::shared_ptr gcs_table_storage, - const GcsNodeManager &gcs_node_manager, - ClusterResourceScheduler &cluster_resource_scheduler, - std::shared_ptr raylet_client_pool); + GcsPlacementGroupScheduler(instrumented_io_context &io_context, + std::shared_ptr gcs_table_storage, + const GcsNodeManager &gcs_node_manager, + ClusterResourceScheduler &cluster_resource_scheduler, + rpc::NodeManagerClientPool &raylet_client_pool); virtual ~GcsPlacementGroupScheduler() = default; @@ -502,7 +501,7 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { placement_group_leasing_in_progress_; /// The cached raylet clients used to communicate with raylets. - std::shared_ptr raylet_client_pool_; + rpc::NodeManagerClientPool &raylet_client_pool_; /// The nodes which are releasing unused bundles. absl::flat_hash_set nodes_of_releasing_unused_bundles_; diff --git a/src/ray/gcs/gcs_server/gcs_resource_manager.h b/src/ray/gcs/gcs_server/gcs_resource_manager.h index dff95380cd21..47ecf9fff3a3 100644 --- a/src/ray/gcs/gcs_server/gcs_resource_manager.h +++ b/src/ray/gcs/gcs_server/gcs_resource_manager.h @@ -67,7 +67,7 @@ class GcsResourceManager : public rpc::NodeResourceInfoHandler, NodeID local_node_id, std::shared_ptr cluster_task_manager = nullptr); - virtual ~GcsResourceManager() {} + virtual ~GcsResourceManager() = default; /// Handle the resource update. 
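Several headers in this patch (GcsActorManager and GcsActorScheduler earlier, GcsResourceManager above) normalize destructors to `= default` and mark them `override`. A compact illustration of what `override` buys on a destructor: the compiler rejects the code unless the base class destructor is virtual:

```cpp
// `= default` replaces an empty body; `override` on the derived destructor
// fails to compile if HandlerInterface's destructor were not virtual.
struct HandlerInterface {
  virtual ~HandlerInterface() = default;
};

struct ConcreteManager : public HandlerInterface {
  ~ConcreteManager() override = default;
};

int main() {
  ConcreteManager manager;
  return 0;
}
```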
void ConsumeSyncMessage(std::shared_ptr message) override; diff --git a/src/ray/gcs/gcs_server/gcs_server.cc b/src/ray/gcs/gcs_server/gcs_server.cc index c51c14bbb0dd..a8ad05ff3c9d 100644 --- a/src/ray/gcs/gcs_server/gcs_server.cc +++ b/src/ray/gcs/gcs_server/gcs_server.cc @@ -64,7 +64,7 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, ClusterID::Nil(), RayConfig::instance().gcs_server_rpc_client_thread_num()), raylet_client_pool_( - std::make_shared(client_call_manager_)), + std::make_unique(client_call_manager_)), pubsub_periodical_runner_(io_context_provider_.GetIOContext()), periodical_runner_(io_context_provider_.GetDefaultIOContext()), is_started_(false), @@ -289,7 +289,7 @@ void GcsServer::InitGcsNodeManager(const GcsInitData &gcs_init_data) { RAY_CHECK(gcs_table_storage_ && gcs_publisher_); gcs_node_manager_ = std::make_unique(gcs_publisher_, gcs_table_storage_, - raylet_client_pool_, + raylet_client_pool_.get(), rpc_server_.GetClusterId()); // Initialize by gcs tables data. gcs_node_manager_->Initialize(gcs_init_data); @@ -323,7 +323,7 @@ void GcsServer::InitGcsHealthCheckManager(const GcsInitData &gcs_init_data) { void GcsServer::InitGcsResourceManager(const GcsInitData &gcs_init_data) { RAY_CHECK(cluster_resource_scheduler_ && cluster_task_manager_); - gcs_resource_manager_ = std::make_shared( + gcs_resource_manager_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), cluster_resource_scheduler_->GetClusterResourceManager(), *gcs_node_manager_, @@ -446,25 +446,25 @@ void GcsServer::InitGcsActorManager(const GcsInitData &gcs_init_data) { const rpc::PushTaskReply &reply) { gcs_actor_manager_->OnActorCreationSuccess(std::move(actor), reply); }; - auto client_factory = [this](const rpc::Address &address) { - return std::make_shared(address, client_call_manager_); - }; RAY_CHECK(gcs_resource_manager_ && cluster_task_manager_); scheduler = std::make_unique( io_context_provider_.GetDefaultIOContext(), gcs_table_storage_->ActorTable(), *gcs_node_manager_, - cluster_task_manager_, + *cluster_task_manager_, schedule_failure_handler, schedule_success_handler, - raylet_client_pool_, - client_factory, + *raylet_client_pool_, + /*factory=*/ + [this](const rpc::Address &address) { + return std::make_shared(address, client_call_manager_); + }, /*normal_task_resources_changed_callback=*/ [this](const NodeID &node_id, const rpc::ResourcesData &resources) { gcs_resource_manager_->UpdateNodeNormalTaskResources(node_id, resources); }); - gcs_actor_manager_ = std::make_shared( + gcs_actor_manager_ = std::make_unique( std::move(scheduler), gcs_table_storage_, gcs_publisher_, @@ -480,23 +480,23 @@ void GcsServer::InitGcsActorManager(const GcsInitData &gcs_init_data) { // Initialize by gcs tables data. gcs_actor_manager_->Initialize(gcs_init_data); // Register service. 
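The gcs_server.cc hunks here and in the following section consistently replace `reset(new T(...))` and `std::make_shared` with `std::make_unique`. A small before/after sketch with an illustrative Service type:

```cpp
#include <memory>
#include <string>

struct Service {
  Service(int port, std::string name) {}
};

int main() {
  std::unique_ptr<Service> service;
  service.reset(new Service(8080, "gcs"));           // before: raw new, type repeated
  service = std::make_unique<Service>(8080, "gcs");  // after: no raw new, exception-safe
  return 0;
}
```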
- actor_info_service_.reset(new rpc::ActorInfoGrpcService( - io_context_provider_.GetDefaultIOContext(), *gcs_actor_manager_)); + actor_info_service_ = std::make_unique( + io_context_provider_.GetDefaultIOContext(), *gcs_actor_manager_); rpc_server_.RegisterService(*actor_info_service_); } void GcsServer::InitGcsPlacementGroupManager(const GcsInitData &gcs_init_data) { RAY_CHECK(gcs_table_storage_ && gcs_node_manager_); - gcs_placement_group_scheduler_ = std::make_shared( + gcs_placement_group_scheduler_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), gcs_table_storage_, *gcs_node_manager_, *cluster_resource_scheduler_, - raylet_client_pool_); + *raylet_client_pool_); - gcs_placement_group_manager_ = std::make_shared( + gcs_placement_group_manager_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), - gcs_placement_group_scheduler_, + gcs_placement_group_scheduler_.get(), gcs_table_storage_, *gcs_resource_manager_, [this](const JobID &job_id) { @@ -671,7 +671,7 @@ void GcsServer::InitGcsAutoscalerStateManager(const GcsInitData &gcs_init_data) *gcs_node_manager_, *gcs_actor_manager_, *gcs_placement_group_manager_, - raylet_client_pool_); + *raylet_client_pool_); gcs_autoscaler_state_manager_->Initialize(gcs_init_data); autoscaler_state_service_.reset(new rpc::autoscaler::AutoscalerStateGrpcService( @@ -826,7 +826,7 @@ std::shared_ptr GcsServer::GetOrConnectRedis() { RAY_CHECK(status.ok()) << "Failed to init redis gcs client as " << status; // Init redis failure detector. - gcs_redis_failure_detector_ = std::make_shared( + gcs_redis_failure_detector_ = std::make_unique( io_context_provider_.GetDefaultIOContext(), redis_client_, []() { RAY_LOG(FATAL) << "Redis connection failed. Shutdown GCS."; }); diff --git a/src/ray/gcs/gcs_server/gcs_server.h b/src/ray/gcs/gcs_server/gcs_server.h index 22ece4dda229..6c37e9d8210c 100644 --- a/src/ray/gcs/gcs_server/gcs_server.h +++ b/src/ray/gcs/gcs_server/gcs_server.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "ray/common/asio/asio_util.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/common/ray_syncer/ray_syncer.h" @@ -218,9 +220,9 @@ class GcsServer { /// The `ClientCallManager` object that is shared by all `NodeManagerWorkerClient`s. rpc::ClientCallManager client_call_manager_; /// Node manager client pool. - std::shared_ptr raylet_client_pool_; + std::unique_ptr raylet_client_pool_; /// The gcs resource manager. - std::shared_ptr gcs_resource_manager_; + std::unique_ptr gcs_resource_manager_; /// The cluster resource scheduler. std::shared_ptr cluster_resource_scheduler_; /// The cluster task manager. @@ -230,15 +232,17 @@ class GcsServer { /// The gcs node manager. std::unique_ptr gcs_node_manager_; /// The health check manager. - std::shared_ptr gcs_healthcheck_manager_; + std::unique_ptr gcs_healthcheck_manager_; /// The gcs redis failure detector. - std::shared_ptr gcs_redis_failure_detector_; + std::unique_ptr gcs_redis_failure_detector_; /// The gcs actor manager. - std::shared_ptr gcs_actor_manager_; + std::unique_ptr gcs_actor_manager_; /// The gcs placement group scheduler. - std::shared_ptr gcs_placement_group_scheduler_; + /// [gcs_placement_group_scheduler_] depends on [raylet_client_pool_]. + std::unique_ptr gcs_placement_group_scheduler_; /// The gcs placement group manager. - std::shared_ptr gcs_placement_group_manager_; + /// [gcs_placement_group_manager_] depends on [gcs_placement_group_scheduler_]. 
+ std::unique_ptr gcs_placement_group_manager_; /// Job info handler and service. std::unique_ptr gcs_job_manager_; std::unique_ptr job_info_service_; diff --git a/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc b/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc index 742716da4b45..61d2d0e8b932 100644 --- a/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc +++ b/src/ray/gcs/gcs_server/test/export_api/gcs_node_manager_export_event_test.cc @@ -41,7 +41,7 @@ class GcsNodeManagerExportAPITest : public ::testing::Test { public: GcsNodeManagerExportAPITest() { raylet_client_ = std::make_shared(); - client_pool_ = std::make_shared( + client_pool_ = std::make_unique( [this](const rpc::Address &) { return raylet_client_; }); gcs_publisher_ = std::make_shared( std::make_unique()); @@ -72,7 +72,7 @@ class GcsNodeManagerExportAPITest : public ::testing::Test { protected: std::shared_ptr gcs_table_storage_; std::shared_ptr raylet_client_; - std::shared_ptr client_pool_; + std::unique_ptr client_pool_; std::shared_ptr gcs_publisher_; instrumented_io_context io_service_; std::string log_dir_; @@ -81,7 +81,7 @@ class GcsNodeManagerExportAPITest : public ::testing::Test { TEST_F(GcsNodeManagerExportAPITest, TestExportEventRegisterNode) { // Test export event is written when a node is added with HandleRegisterNode gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); auto node = Mocker::GenNodeInfo(); rpc::RegisterNodeRequest register_request; @@ -103,7 +103,7 @@ TEST_F(GcsNodeManagerExportAPITest, TestExportEventRegisterNode) { TEST_F(GcsNodeManagerExportAPITest, TestExportEventUnregisterNode) { // Test export event is written when a node is removed with HandleUnregisterNode gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); auto node = Mocker::GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); node_manager.AddNode(node); diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc index c3b8c9f2a421..9bb274af97bd 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_manager_test.cc @@ -1424,8 +1424,3 @@ TEST_F(GcsActorManagerTest, TestDestroyActorWhenActorIsCreating) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc index 6beeb8b7504c..aca66ca39c09 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_mock_test.cc @@ -42,7 +42,7 @@ class GcsActorSchedulerMockTest : public Test { std::make_unique(nullptr, nullptr, nullptr, ClusterID::Nil()); raylet_client = std::make_shared(); core_worker_client = std::make_shared(); - client_pool = std::make_shared( + client_pool = std::make_unique( [this](const rpc::Address &) { return raylet_client; }); local_node_id = NodeID::FromRandom(); auto cluster_resource_scheduler = std::make_shared( @@ -52,7 +52,7 @@ class GcsActorSchedulerMockTest : public Test { /*is_node_available_fn=*/ [](auto) { 
return true; }, /*is_local_node_with_raylet=*/false); - auto cluster_task_manager = std::make_shared( + cluster_task_manager = std::make_unique( local_node_id, cluster_resource_scheduler, /*get_node_info=*/ @@ -70,10 +70,10 @@ class GcsActorSchedulerMockTest : public Test { io_context, *actor_table, *gcs_node_manager, - cluster_task_manager, + *cluster_task_manager, [this](auto a, auto b, auto c) { schedule_failure_handler(a); }, [this](auto a, const rpc::PushTaskReply) { schedule_success_handler(a); }, - client_pool, + *client_pool, [this](const rpc::Address &) { return core_worker_client; }); auto node_info = std::make_shared(); node_info->set_state(rpc::GcsNodeInfo::ALIVE); @@ -82,14 +82,16 @@ class GcsActorSchedulerMockTest : public Test { worker_id = WorkerID::FromRandom(); gcs_node_manager->AddNode(node_info); } + std::shared_ptr raylet_client; instrumented_io_context io_context; std::shared_ptr store_client; std::unique_ptr actor_table; - std::unique_ptr actor_scheduler; std::unique_ptr gcs_node_manager; + std::unique_ptr cluster_task_manager; + std::unique_ptr actor_scheduler; std::shared_ptr core_worker_client; - std::shared_ptr client_pool; + std::unique_ptr client_pool; std::shared_ptr>> counter; MockCallback schedule_failure_handler; diff --git a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc index c14497db7eaa..6302ee02ed63 100644 --- a/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_actor_scheduler_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include // clang-format off -#include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/gcs/gcs_server/gcs_actor_scheduler.h" #include "ray/gcs/gcs_server/test/gcs_server_test_util.h" @@ -39,7 +40,7 @@ class GcsActorSchedulerTest : public ::testing::Test { store_client_ = std::make_shared(io_service_); gcs_table_storage_ = std::make_shared(io_service_); gcs_node_manager_ = std::make_shared( - gcs_publisher_, gcs_table_storage_, raylet_client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, raylet_client_pool_.get(), ClusterID::Nil()); gcs_actor_table_ = std::make_shared(store_client_); local_node_id_ = NodeID::FromRandom(); @@ -73,7 +74,7 @@ class GcsActorSchedulerTest : public ::testing::Test { io_service_, *gcs_actor_table_, *gcs_node_manager_, - cluster_task_manager_, + *cluster_task_manager_, /*schedule_failure_handler=*/ [this](std::shared_ptr actor, const rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type, @@ -84,7 +85,7 @@ class GcsActorSchedulerTest : public ::testing::Test { [this](std::shared_ptr actor, const rpc::PushTaskReply &reply) { success_actors_.emplace_back(std::move(actor)); }, - raylet_client_pool_, + *raylet_client_pool_, /*client_factory=*/ [this](const rpc::Address &address) { return worker_client_; }, /*normal_task_resources_changed_callback=*/ @@ -1191,8 +1192,3 @@ TEST_F(GcsActorSchedulerTest, TestReleaseUnusedActorWorkersByGcs) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc index b0b31b182d77..2f281fa31844 100644 --- a/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc +++ 
b/src/ray/gcs/gcs_server/test/gcs_autoscaler_state_manager_test.cc @@ -61,7 +61,7 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { void SetUp() override { raylet_client_ = std::make_shared(); - client_pool_ = std::make_shared( + client_pool_ = std::make_unique( [this](const rpc::Address &) { return raylet_client_; }); cluster_resource_manager_ = std::make_unique(io_service_); gcs_node_manager_ = std::make_shared(); @@ -86,7 +86,7 @@ class GcsAutoscalerStateManagerTest : public ::testing::Test { *gcs_node_manager_, *gcs_actor_manager_, *gcs_placement_group_manager_, - client_pool_)); + *client_pool_)); } public: @@ -847,8 +847,3 @@ TEST_F(GcsAutoscalerStateManagerTest, TestGcsKvManagerInternalConfig) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc index 1f0722a6d6d7..35fc308f28a9 100644 --- a/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_health_check_manager_test.cc @@ -271,17 +271,3 @@ TEST_F(GcsHealthCheckManagerTest, StressTest) { io_service.stop(); t->join(); } - -int main(int argc, char **argv) { - InitShutdownRAII ray_log_shutdown_raii(ray::RayLog::StartRayLog, - ray::RayLog::ShutDownRayLog, - argv[0], - ray::RayLogLevel::INFO, - /*log_dir=*/""); - - ray::RayLog::InstallFailureSignalHandler(argv[0]); - ray::RayLog::InstallTerminateHandler(); - - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc index 107af6752a5d..b18658dffc95 100644 --- a/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_job_manager_test.cc @@ -720,8 +720,4 @@ TEST_F(GcsJobManagerTest, TestNodeFailure) { EXPECT_TRUE(WaitForCondition(condition, 2000)); } -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} } // namespace ray diff --git a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc index a8a0157e0d54..eb12d59dbdb3 100644 --- a/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_node_manager_test.cc @@ -28,7 +28,7 @@ class GcsNodeManagerTest : public ::testing::Test { public: GcsNodeManagerTest() { raylet_client_ = std::make_shared(); - client_pool_ = std::make_shared( + client_pool_ = std::make_unique( [this](const rpc::Address &) { return raylet_client_; }); gcs_publisher_ = std::make_shared( std::make_unique()); @@ -37,13 +37,13 @@ class GcsNodeManagerTest : public ::testing::Test { protected: std::shared_ptr gcs_table_storage_; std::shared_ptr raylet_client_; - std::shared_ptr client_pool_; + std::unique_ptr client_pool_; std::shared_ptr gcs_publisher_; }; TEST_F(GcsNodeManagerTest, TestManagement) { gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); // Test Add/Get/Remove functionality. 
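The `int main` stubs deleted from these test files were all the boilerplate below (the health-check test's version additionally installed Ray's log, signal, and terminate handlers); linking GoogleTest's gtest_main target supplies an equivalent main automatically:

```cpp
#include "gtest/gtest.h"

// The main() that gtest_main provides, equivalent to the deleted stubs.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```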
auto node = Mocker::GenNodeInfo(); auto node_id = NodeID::FromBinary(node->node_id()); @@ -58,7 +58,7 @@ TEST_F(GcsNodeManagerTest, TestManagement) { TEST_F(GcsNodeManagerTest, TestListener) { gcs::GcsNodeManager node_manager( - gcs_publisher_, gcs_table_storage_, client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, client_pool_.get(), ClusterID::Nil()); // Test AddNodeAddedListener. int node_count = 1000; std::vector> added_nodes; @@ -97,8 +97,3 @@ TEST_F(GcsNodeManagerTest, TestListener) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc index 6cfd689ac168..1e3ef61060c8 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_mock_test.cc @@ -47,7 +47,7 @@ class GcsPlacementGroupManagerMockTest : public Test { gcs_placement_group_manager_ = std::make_unique(io_context_, - gcs_placement_group_scheduler_, + gcs_placement_group_scheduler_.get(), gcs_table_storage_, *resource_manager_, [](auto &) { return ""; }); diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc index 268096815cbe..ad808b644b67 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_manager_test.cc @@ -89,7 +89,7 @@ class GcsPlacementGroupManagerTest : public ::testing::Test { io_service_, cluster_resource_manager_, *gcs_node_manager_, NodeID::FromRandom()); gcs_placement_group_manager_.reset(new gcs::GcsPlacementGroupManager( io_service_, - mock_placement_group_scheduler_, + mock_placement_group_scheduler_.get(), gcs_table_storage_, *gcs_resource_manager_, [this](const JobID &job_id) { return job_namespace_table_[job_id]; })); @@ -1011,8 +1011,3 @@ TEST_F(GcsPlacementGroupManagerTest, TestCheckCreatorJobIsDeadWhenGcsRestart) { } // namespace gcs } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc index 5d3f11ed39b0..093bdaf13fcc 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc @@ -55,21 +55,21 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { [](auto) { return true; }, /*is_local_node_with_raylet=*/false); gcs_node_manager_ = std::make_shared( - gcs_publisher_, gcs_table_storage_, raylet_client_pool_, ClusterID::Nil()); + gcs_publisher_, gcs_table_storage_, raylet_client_pool_.get(), ClusterID::Nil()); gcs_resource_manager_ = std::make_shared( io_service_, cluster_resource_scheduler_->GetClusterResourceManager(), *gcs_node_manager_, local_node_id); store_client_ = std::make_shared(io_service_); - raylet_client_pool_ = std::make_shared( + raylet_client_pool_ = std::make_unique( [this](const rpc::Address &addr) { return raylet_clients_[addr.port()]; }); scheduler_ = std::make_shared( io_service_, gcs_table_storage_, *gcs_node_manager_, *cluster_resource_scheduler_, - raylet_client_pool_); + *raylet_client_pool_); counter_.reset(new CounterMap()); } @@ -296,7 +296,7 @@ class 
GcsPlacementGroupSchedulerTest : public ::testing::Test { ABSL_GUARDED_BY(placement_group_requests_mutex_); std::shared_ptr gcs_publisher_; std::shared_ptr gcs_table_storage_; - std::shared_ptr raylet_client_pool_; + std::unique_ptr raylet_client_pool_; std::shared_ptr> counter_; }; @@ -1476,8 +1476,3 @@ TEST_F(GcsPlacementGroupSchedulerTest, TestBundlesRemovedWhenNodeDead) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc index 31fdc58530d8..cc2d3dec33a8 100644 --- a/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_resource_manager_test.cc @@ -255,8 +255,3 @@ TEST_F(GcsResourceManagerTest, TestGetDrainingNodes) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc b/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc index 3dd5c1e720e2..37d6a67b7b0d 100644 --- a/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_worker_manager_test.cc @@ -297,8 +297,3 @@ TEST_F(GcsWorkerManagerTest, TestUpdateWorkerNumPausedThreads) { ASSERT_EQ(reply.worker_table_data(0).num_paused_threads(), num_paused_threads_delta); } } - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc b/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc index 0a63277536c8..dba6ddce5922 100644 --- a/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc +++ b/src/ray/gcs/gcs_server/test/in_memory_gcs_table_storage_test.cc @@ -35,8 +35,3 @@ TEST_F(InMemoryGcsTableStorageTest, TestGcsTableWithJobIdApi) { } } // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc b/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc index 21e660b1f1cd..9448f0000b9f 100644 --- a/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc +++ b/src/ray/gcs/gcs_server/test/usage_stats_client_test.cc @@ -45,8 +45,3 @@ TEST_F(UsageStatsClientTest, TestRecordExtraUsageTag) { ASSERT_EQ(value.value(), "value2"); }); } - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/gcs/pb_util.h b/src/ray/gcs/pb_util.h index cb3c518072b2..0bdd056b0b8d 100644 --- a/src/ray/gcs/pb_util.h +++ b/src/ray/gcs/pb_util.h @@ -16,6 +16,7 @@ #include +#include "absl/time/time.h" #include "ray/common/constants.h" #include "ray/common/id.h" #include "ray/common/ray_config.h" @@ -61,27 +62,11 @@ inline std::shared_ptr CreateJobTableData( } /// Helper function to produce error table data. 
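For the CreateErrorTableData change that follows, the definition moves out of the header and the timestamp parameter becomes a typed absl::Time rather than a raw double. A hedged usage sketch (the surrounding function and message text are illustrative, not from this patch):

```cpp
#include "absl/time/clock.h"
#include "ray/common/id.h"
#include "ray/gcs/pb_util.h"

void PublishAnonymousNamespaceWarning(const ray::JobID &job_id) {
  // Callers now pass absl::Now() directly; the implementation converts the
  // absl::Time to Unix milliseconds before storing it in the proto.
  auto error_data = ray::gcs::CreateErrorTableData(
      "detached_actor_anonymous_namespace",
      "detached actor created in anonymous namespace",
      /*timestamp=*/absl::Now(),
      job_id);
  (void)error_data;
}
```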
-inline std::shared_ptr CreateErrorTableData( +std::shared_ptr CreateErrorTableData( const std::string &error_type, const std::string &error_msg, - double timestamp, - const JobID &job_id = JobID::Nil()) { - uint32_t max_error_msg_size_bytes = RayConfig::instance().max_error_msg_size_bytes(); - auto error_info_ptr = std::make_shared(); - error_info_ptr->set_type(error_type); - if (error_msg.length() > max_error_msg_size_bytes) { - std::ostringstream stream; - stream << "The message size exceeds " << std::to_string(max_error_msg_size_bytes) - << " bytes. Find the full log from the log files. Here is abstract: " - << error_msg.substr(0, max_error_msg_size_bytes); - error_info_ptr->set_error_message(stream.str()); - } else { - error_info_ptr->set_error_message(error_msg); - } - error_info_ptr->set_timestamp(timestamp); - error_info_ptr->set_job_id(job_id.Binary()); - return error_info_ptr; -} + absl::Time timestamp, + const JobID &job_id = JobID::Nil()); /// Helper function to produce worker failure data. inline std::shared_ptr CreateWorkerFailureData( diff --git a/src/ray/gcs/pb_utils.cc b/src/ray/gcs/pb_utils.cc new file mode 100644 index 000000000000..9330f80c27ce --- /dev/null +++ b/src/ray/gcs/pb_utils.cc @@ -0,0 +1,47 @@ +// Copyright 2024 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// TODO(hjiang): Move all functions from `pb_utils.h` to this implementation file. + +#include + +#include "absl/strings/str_format.h" +#include "ray/gcs/pb_util.h" + +namespace ray::gcs { + +std::shared_ptr CreateErrorTableData( + const std::string &error_type, + const std::string &error_msg, + absl::Time timestamp, + const JobID &job_id) { + uint32_t max_error_msg_size_bytes = RayConfig::instance().max_error_msg_size_bytes(); + auto error_info_ptr = std::make_shared(); + error_info_ptr->set_type(error_type); + if (error_msg.length() > max_error_msg_size_bytes) { + std::string formatted_error_message = absl::StrFormat( + "The message size exceeds %d bytes. Find the full log from the log files. 
Here " + "is abstract: %s", + max_error_msg_size_bytes, + std::string_view{error_msg}.substr(0, max_error_msg_size_bytes)); + error_info_ptr->set_error_message(std::move(formatted_error_message)); + } else { + error_info_ptr->set_error_message(error_msg); + } + error_info_ptr->set_timestamp(absl::ToUnixMillis(timestamp)); + error_info_ptr->set_job_id(job_id.Binary()); + return error_info_ptr; +} + +} // namespace ray::gcs diff --git a/src/ray/gcs/redis_context.cc b/src/ray/gcs/redis_context.cc index 6de20bfe34af..f26333299b11 100644 --- a/src/ray/gcs/redis_context.cc +++ b/src/ray/gcs/redis_context.cc @@ -431,6 +431,114 @@ void ValidateRedisDB(RedisContext &context) { } } +bool isRedisSentinel(RedisContext &context) { + auto reply = context.RunArgvSync(std::vector{"INFO", "SENTINEL"}); + if (reply->IsNil() || reply->IsError() || reply->ReadAsString().length() == 0) { + return false; + } else { + return true; + } +} + +Status ConnectRedisCluster(RedisContext &context, + const std::string &password, + bool enable_ssl, + const std::string &redis_address) { + RAY_LOG(INFO) << "Connect to Redis Cluster"; + // Ray has some restrictions for RedisDB. Validate it here. + ValidateRedisDB(context); + + // Find the true leader + std::vector argv; + std::vector argc; + std::vector cmds = {"DEL", "DUMMY"}; + for (const auto &arg : cmds) { + argv.push_back(arg.data()); + argc.push_back(arg.size()); + } + + auto redis_reply = reinterpret_cast( + ::redisCommandArgv(context.sync_context(), cmds.size(), argv.data(), argc.data())); + + if (redis_reply->type == REDIS_REPLY_ERROR) { + // This should be a MOVED error + // MOVED 14946 10.xx.xx.xx:7001 + std::string error_msg(redis_reply->str, redis_reply->len); + freeReplyObject(redis_reply); + auto maybe_ip_port = ParseIffMovedError(error_msg); + RAY_CHECK(maybe_ip_port.has_value()) + << "Setup Redis cluster failed in the dummy deletion: " << error_msg; + context.Disconnect(); + const auto &[ip, port] = maybe_ip_port.value(); + // Connect to the true leader. + RAY_LOG(INFO) << "Redis cluster leader is " << ip << ":" << port + << ". Reconnect to it."; + return context.Connect(ip, port, password, enable_ssl); + } else { + RAY_LOG(INFO) << "Redis cluster leader is " << redis_address; + freeReplyObject(redis_reply); + } + + return Status::OK(); +} + +Status ConnectRedisSentinel(RedisContext &context, + const std::string &password, + bool enable_ssl) { + RAY_LOG(INFO) << "Connect to Redis sentinel"; + + std::vector argv; + std::vector argc; + std::vector cmds = {"SENTINEL", "MASTERS"}; + for (const auto &arg : cmds) { + argv.push_back(arg.data()); + argc.push_back(arg.size()); + } + + // use raw redis context since we need to parse a complex reply. 
+  // sample reply (array of arrays):
+  // 1) 1) "name"
+  //    2) "redis-ha"
+  //    3) "ip"
+  //    4) "10.112.202.115"
+  //    5) "port"
+  //    6) "6379"
+  //    7) "runid"
+  //    8) "18a76cedbf445bd25bbd412c92e237137b5c7d4d"
+  auto redis_reply = reinterpret_cast<redisReply *>(
+      ::redisCommandArgv(context.sync_context(), cmds.size(), argv.data(), argc.data()));
+
+  RAY_CHECK(redis_reply) << "Failed to get redis sentinel masters info";
+  RAY_CHECK_EQ(redis_reply->type, REDIS_REPLY_ARRAY)
+      << "Redis sentinel master info should be REDIS_REPLY_ARRAY but got "
+      << redis_reply->type;
+  RAY_CHECK_EQ(redis_reply->elements, 1UL)
+      << "There should be only one primary behind the Redis sentinel";
+  auto primary = redis_reply->element[0];
+  std::string actual_ip, actual_port;
+  for (size_t i = 0; i < primary->elements; i += 2) {
+    std::string key = primary->element[i]->str;        // Key (e.g., "name", "ip")
+    std::string value = primary->element[i + 1]->str;  // Value corresponding to the key
+    if ("ip" == key) {
+      actual_ip = value;
+    } else if ("port" == key) {
+      actual_port = value;
+    }
+  }
+  freeReplyObject(redis_reply);
+  if (actual_ip.empty() || actual_port.empty()) {
+    RAY_LOG(ERROR)
+        << "Failed to get the ip and port of the primary node from Redis sentinel";
+    return Status::RedisError(
+        "Failed to get the ip and port of the primary node from Redis sentinel");
+  } else {
+    RAY_LOG(INFO) << "Connecting to the Redis primary node behind sentinel: " << actual_ip
+                  << ":" << actual_port;
+    context.Disconnect();
+    return context.Connect(actual_ip, std::stoi(actual_port), password, enable_ssl);
+  }
+}
+
 std::vector<std::string> ResolveDNS(const std::string &address, int port) {
   using namespace boost::asio;
   io_context ctx;
@@ -503,41 +611,13 @@ Status RedisContext::Connect(const std::string &address,
   redis_async_context_.reset(new RedisAsyncContext(std::move(async_context)));
   SetDisconnectCallback(redis_async_context_.get());
 
-  // Ray has some restrictions for RedisDB. Validate it here.
-  ValidateRedisDB(*this);
-
-  // Find the true leader
-  std::vector<const char *> argv;
-  std::vector<size_t> argc;
-  std::vector<std::string> cmds = {"DEL", "DUMMY"};
-  for (const auto &arg : cmds) {
-    argv.push_back(arg.data());
-    argc.push_back(arg.size());
-  }
-
-  auto redis_reply = reinterpret_cast<redisReply *>(
-      ::redisCommandArgv(context_.get(), cmds.size(), argv.data(), argc.data()));
-
-  if (redis_reply->type == REDIS_REPLY_ERROR) {
-    // This should be a MOVED error
-    // MOVED 14946 10.xx.xx.xx:7001
-    std::string error_msg(redis_reply->str, redis_reply->len);
-    freeReplyObject(redis_reply);
-    auto maybe_ip_port = ParseIffMovedError(error_msg);
-    RAY_CHECK(maybe_ip_port.has_value())
-        << "Setup Redis cluster failed in the dummy deletion: " << error_msg;
-    Disconnect();
-    const auto &[ip, port] = maybe_ip_port.value();
-    // Connect to the true leader.
-    RAY_LOG(INFO) << "Redis cluster leader is " << ip << ":" << port
-                  << ". Reconnect to it.";
-    return Connect(ip, port, password, enable_ssl);
+  // handle validation and primary connection for different types of redis
+  if (isRedisSentinel(*this)) {
+    return ConnectRedisSentinel(*this, password, enable_ssl);
   } else {
-    RAY_LOG(INFO) << "Redis cluster leader is " << ip_addresses[0] << ":" << port;
-    freeReplyObject(redis_reply);
+    return ConnectRedisCluster(
+        *this, password, enable_ssl, ip_addresses[0] + ":" + std::to_string(port));
   }
-
-  return Status::OK();
 }
 
 std::unique_ptr<CallbackReply> RedisContext::RunArgvSync(
diff --git a/src/ray/gcs/store_client/in_memory_store_client.cc b/src/ray/gcs/store_client/in_memory_store_client.cc
index 39306b1254c9..1d1cb3451350 100644
--- a/src/ray/gcs/store_client/in_memory_store_client.cc
+++ b/src/ray/gcs/store_client/in_memory_store_client.cc
@@ -14,9 +14,7 @@
 
 #include "ray/gcs/store_client/in_memory_store_client.h"
 
-namespace ray {
-
-namespace gcs {
+namespace ray::gcs {
 
 Status InMemoryStoreClient::AsyncPut(const std::string &table_name,
                                      const std::string &key,
@@ -35,9 +33,10 @@ Status InMemoryStoreClient::AsyncPut(const std::string &table_name,
     table->records_[key] = data;
     inserted = true;
   }
-  if (callback != nullptr) {
-    main_io_service_.post([callback, inserted]() { callback(inserted); },
-                          "GcsInMemoryStore.Put");
+  if (callback) {
+    main_io_service_.post(
+        [callback = std::move(callback), inserted]() { callback(inserted); },
+        "GcsInMemoryStore.Put");
   }
   return Status::OK();
 }
@@ -45,7 +44,7 @@ Status InMemoryStoreClient::AsyncPut(const std::string &table_name,
 Status InMemoryStoreClient::AsyncGet(const std::string &table_name,
                                      const std::string &key,
                                      const OptionalItemCallback<std::string> &callback) {
-  RAY_CHECK(callback != nullptr);
+  RAY_CHECK(callback);
   auto table = GetOrCreateTable(table_name);
   absl::MutexLock lock(&(table->mutex_));
   auto iter = table->records_.find(key);
@@ -66,9 +65,10 @@ Status InMemoryStoreClient::AsyncGetAll(
     const std::string &table_name,
     const MapCallback<std::string, std::string> &callback) {
   RAY_CHECK(callback);
+  auto result = absl::flat_hash_map<std::string, std::string>();
   auto table = GetOrCreateTable(table_name);
   absl::MutexLock lock(&(table->mutex_));
-  auto result = absl::flat_hash_map<std::string, std::string>();
+  result.reserve(table->records_.size());
   result.insert(table->records_.begin(), table->records_.end());
   main_io_service_.post(
       [result = std::move(result), callback]() mutable { callback(std::move(result)); },
@@ -81,10 +81,10 @@ Status InMemoryStoreClient::AsyncMultiGet(
     const std::vector<std::string> &keys,
     const MapCallback<std::string, std::string> &callback) {
   RAY_CHECK(callback);
+  auto result = absl::flat_hash_map<std::string, std::string>();
   auto table = GetOrCreateTable(table_name);
   absl::MutexLock lock(&(table->mutex_));
-  auto result = absl::flat_hash_map<std::string, std::string>();
-  for (auto &key : keys) {
+  for (const auto &key : keys) {
     auto it = table->records_.find(key);
     if (it == table->records_.end()) {
       continue;
@@ -104,7 +104,7 @@ Status InMemoryStoreClient::AsyncDelete(const std::string &table_name,
   absl::MutexLock lock(&(table->mutex_));
   auto num = table->records_.erase(key);
   if (callback != nullptr) {
-    main_io_service_.post([callback, num]() { callback(num > 0); },
+    main_io_service_.post([callback = std::move(callback), num]() { callback(num > 0); },
                           "GcsInMemoryStore.Delete");
   }
   return Status::OK();
@@ -120,7 +120,7 @@ Status InMemoryStoreClient::AsyncBatchDelete(const std::string &table_name,
     num += table->records_.erase(key);
   }
   if (callback != nullptr) {
-    main_io_service_.post([callback, num]() { callback(num); },
+    main_io_service_.post([callback = std::move(callback), num]() { callback(num); },
"GcsInMemoryStore.BatchDelete"); } return Status::OK(); @@ -138,11 +138,10 @@ std::shared_ptr InMemoryStoreClient::GetOrCr auto iter = tables_.find(table_name); if (iter != tables_.end()) { return iter->second; - } else { - auto table = std::make_shared(); - tables_[table_name] = table; - return table; } + auto table = std::make_shared(); + tables_[table_name] = table; + return table; } Status InMemoryStoreClient::AsyncGetKeys( @@ -150,16 +149,18 @@ Status InMemoryStoreClient::AsyncGetKeys( const std::string &prefix, std::function)> callback) { RAY_CHECK(callback); - auto table = GetOrCreateTable(table_name); std::vector result; + auto table = GetOrCreateTable(table_name); absl::MutexLock lock(&(table->mutex_)); - for (auto &pair : table->records_) { - if (pair.first.find(prefix) == 0) { - result.push_back(pair.first); + for (const auto &[key, _] : table->records_) { + if (key.find(prefix) == 0) { + result.emplace_back(key); } } main_io_service_.post( - [result = std::move(result), callback]() mutable { callback(std::move(result)); }, + [result = std::move(result), callback = std::move(callback)]() mutable { + callback(std::move(result)); + }, "GcsInMemoryStore.Keys"); return Status::OK(); } @@ -171,11 +172,9 @@ Status InMemoryStoreClient::AsyncExists(const std::string &table_name, auto table = GetOrCreateTable(table_name); absl::MutexLock lock(&(table->mutex_)); bool result = table->records_.contains(key); - main_io_service_.post([result, callback]() mutable { callback(result); }, + main_io_service_.post([result, callback = std::move(callback)]() { callback(result); }, "GcsInMemoryStore.Exists"); return Status::OK(); } -} // namespace gcs - -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/gcs/store_client/in_memory_store_client.h b/src/ray/gcs/store_client/in_memory_store_client.h index a4ea7bc47ac6..6590d608f833 100644 --- a/src/ray/gcs/store_client/in_memory_store_client.h +++ b/src/ray/gcs/store_client/in_memory_store_client.h @@ -14,15 +14,17 @@ #pragma once +#include +#include +#include + #include "absl/container/flat_hash_map.h" #include "absl/synchronization/mutex.h" #include "ray/common/asio/instrumented_io_context.h" #include "ray/gcs/store_client/store_client.h" #include "src/ray/protobuf/gcs.pb.h" -namespace ray { - -namespace gcs { +namespace ray::gcs { /// \class InMemoryStoreClient /// Please refer to StoreClient for API semantics. @@ -77,7 +79,7 @@ class InMemoryStoreClient : public StoreClient { }; std::shared_ptr GetOrCreateTable( - const std::string &table_name); + const std::string &table_name) ABSL_LOCKS_EXCLUDED(mutex_); /// Mutex to protect the tables_ field. absl::Mutex mutex_; @@ -88,9 +90,8 @@ class InMemoryStoreClient : public StoreClient { /// of the callback. instrumented_io_context &main_io_service_; - int job_id_ = 0; + /// Current job id, auto-increment when request next-id. + int job_id_ ABSL_GUARDED_BY(mutex_) = 0; }; -} // namespace gcs - -} // namespace ray +} // namespace ray::gcs diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 604c92d0d4db..f18175a12f2c 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -556,6 +556,8 @@ message TaskSpec { // this field contains the detached actor id. // Otherwise it's empty and is originated from a driver. bytes root_detached_actor_id = 40; + // The key-value labels for task and actor. 
+  map<string, string> labels = 41;
 }
 
 message TaskInfoEntry {
@@ -986,6 +988,10 @@ message NamedActorInfo {
 
 message LineageReconstructionTask {
   string name = 1;
-  map<string, double> resources = 2;
-  TaskStatus status = 3;
+  TaskStatus status = 2;
+  // If the task is a normal task,
+  // this has the labels of the normal task.
+  // If the task is an actor task,
+  // this has the labels of the corresponding actor.
+  map<string, string> labels = 3;
 }
diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto
index caa869ce18e9..9b6dad5191dd 100644
--- a/src/ray/protobuf/core_worker.proto
+++ b/src/ray/protobuf/core_worker.proto
@@ -70,6 +70,9 @@ message ActorHandle {
 
   // Whether task events will be reported from this actor.
   bool enable_task_events = 14;
+
+  // The key-value labels for actor.
+  map<string, string> labels = 15;
 }
 
 message PushTaskRequest {
diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto
index b34f5bd9f39b..2d0ff6dfac4a 100644
--- a/src/ray/protobuf/gcs.proto
+++ b/src/ray/protobuf/gcs.proto
@@ -165,6 +165,8 @@ message ErrorTableData {
   // The error message.
   string error_message = 3;
   // The timestamp of the error message.
+  // Unit: millisecond.
+  // TODO(hjiang): Update field naming from `timestamp` to `timestamp_millisec`.
   double timestamp = 4;
 }
diff --git a/src/ray/raylet/local_task_manager.cc b/src/ray/raylet/local_task_manager.cc
index f53fca5a365a..aae37371e29f 100644
--- a/src/ray/raylet/local_task_manager.cc
+++ b/src/ray/raylet/local_task_manager.cc
@@ -252,7 +252,7 @@ void LocalTaskManager::DispatchScheduledTasksToWorkers() {
   for (auto work_it = dispatch_queue.begin(); work_it != dispatch_queue.end();) {
     auto &work = *work_it;
     const auto &task = work->task;
-    const auto spec = task.GetTaskSpecification();
+    const auto &spec = task.GetTaskSpecification();
     TaskID task_id = spec.TaskId();
     if (work->GetState() == internal::WorkStatus::WAITING_FOR_WORKER) {
       work_it++;
@@ -436,7 +436,8 @@ void LocalTaskManager::SpillWaitingTasks() {
   while (it != waiting_task_queue_.begin()) {
     it--;
     const auto &task = (*it)->task;
-    const auto &task_id = task.GetTaskSpecification().TaskId();
+    const auto &spec = task.GetTaskSpecification();
+    const auto &task_id = spec.TaskId();
 
     // Check whether this task's dependencies are blocked (not being actively
     // pulled).  If this is true, then we should force the task onto a remote
@@ -452,9 +453,9 @@ void LocalTaskManager::SpillWaitingTasks() {
     // object store memory availability. Ideally, we should pick the node with
     // the most memory availability.
     scheduling::NodeID scheduling_node_id;
-    if (!task.GetTaskSpecification().IsSpreadSchedulingStrategy()) {
+    if (!spec.IsSpreadSchedulingStrategy()) {
       scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
-          task.GetTaskSpecification(),
+          spec,
           /*preferred_node_id*/ self_node_id_.Binary(),
           /*exclude_local_node*/ task_dependencies_blocked,
           /*requires_object_store_memory*/ true,
@@ -470,9 +471,8 @@ void LocalTaskManager::SpillWaitingTasks() {
         scheduling_node_id.Binary() != self_node_id_.Binary()) {
       NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
       Spillback(node_id, *it);
-      if (!task.GetTaskSpecification().GetDependencies().empty()) {
-        task_dependency_manager_.RemoveTaskDependencies(
-            task.GetTaskSpecification().TaskId());
+      if (!spec.GetDependencies().empty()) {
+        task_dependency_manager_.RemoveTaskDependencies(spec.TaskId());
       }
       num_waiting_task_spilled_++;
       waiting_tasks_index_.erase(task_id);
@@ -495,14 +495,15 @@ void LocalTaskManager::SpillWaitingTasks() {
 
 bool LocalTaskManager::TrySpillback(const std::shared_ptr<internal::Work> &work,
                                     bool &is_infeasible) {
+  const auto &spec = work->task.GetTaskSpecification();
   auto scheduling_node_id = cluster_resource_scheduler_->GetBestSchedulableNode(
-      work->task.GetTaskSpecification(),
+      spec,
       // We should prefer to stay local if possible
       // to avoid unnecessary spillback
       // since this node is already selected by the cluster scheduler.
-      /*preferred_node_id*/ self_node_id_.Binary(),
-      /*exclude_local_node*/ false,
-      /*requires_object_store_memory*/ false,
+      /*preferred_node_id=*/self_node_id_.Binary(),
+      /*exclude_local_node=*/false,
+      /*requires_object_store_memory=*/false,
       &is_infeasible);
 
   if (is_infeasible || scheduling_node_id.IsNil() ||
@@ -513,9 +514,8 @@ bool LocalTaskManager::TrySpillback(const std::shared_ptr<internal::Work> &work,
   NodeID node_id = NodeID::FromBinary(scheduling_node_id.Binary());
   Spillback(node_id, work);
   num_unschedulable_task_spilled_++;
-  if (!work->task.GetTaskSpecification().GetDependencies().empty()) {
-    task_dependency_manager_.RemoveTaskDependencies(
-        work->task.GetTaskSpecification().TaskId());
+  if (!spec.GetDependencies().empty()) {
+    task_dependency_manager_.RemoveTaskDependencies(spec.TaskId());
   }
   return true;
 }
diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc
index 879edff0bb6c..18384711de58 100644
--- a/src/ray/raylet/node_manager.cc
+++ b/src/ray/raylet/node_manager.cc
@@ -143,7 +143,7 @@ NodeManager::NodeManager(
       /*starting_worker_timeout_callback=*/
       [this] { cluster_task_manager_->ScheduleAndDispatchTasks(); },
       config.ray_debugger_external,
-      /*get_time=*/[]() { return absl::GetCurrentTimeNanos() / 1e6; }),
+      /*get_time=*/[]() { return absl::Now(); }),
       client_call_manager_(io_service),
       worker_rpc_pool_(client_call_manager_),
       core_worker_subscriber_(std::make_unique<pubsub::Subscriber>(
@@ -1562,8 +1562,8 @@ void NodeManager::DisconnectClient(const std::shared_ptr<ClientConnection> &clie
         .WithField("node_id", self_node_id_.Hex())
         .WithField("job_id", worker->GetAssignedJobId().Hex())
         << error_message_str;
-    auto error_data_ptr =
-        gcs::CreateErrorTableData(type, error_message_str, current_time_ms(), job_id);
+    auto error_data_ptr = gcs::CreateErrorTableData(
+        type, error_message_str, absl::FromUnixMillis(current_time_ms()), job_id);
     RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
   }
 }
@@ -1762,9 +1762,11 @@ void NodeManager::ProcessPushErrorRequestMessage(const uint8_t *message_data) {
   auto const &type = string_from_flatbuf(*message->type());
   auto const &error_message =
       string_from_flatbuf(*message->error_message());
+  // TODO(hjiang): Figure out what's the unit for `PushErrorRequest`.
   double timestamp = message->timestamp();
   JobID job_id = from_flatbuf<JobID>(*message->job_id());
-  auto error_data_ptr = gcs::CreateErrorTableData(type, error_message, timestamp, job_id);
+  auto error_data_ptr = gcs::CreateErrorTableData(
+      type, error_message, absl::FromUnixMillis(timestamp), job_id);
   RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
 }
@@ -2126,8 +2128,8 @@ void NodeManager::MarkObjectsAsFailed(
                     << " object may hang forever.";
     std::string error_message = stream.str();
     RAY_LOG(ERROR) << error_message;
-    auto error_data_ptr =
-        gcs::CreateErrorTableData("task", error_message, current_time_ms(), job_id);
+    auto error_data_ptr = gcs::CreateErrorTableData(
+        "task", error_message, absl::FromUnixMillis(current_time_ms()), job_id);
     RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
   }
 }
diff --git a/src/ray/raylet/raylet.h b/src/ray/raylet/raylet.h
index c220e2bc2fab..619fcd7b69ec 100644
--- a/src/ray/raylet/raylet.h
+++ b/src/ray/raylet/raylet.h
@@ -23,9 +23,7 @@
 #include "ray/common/asio/instrumented_io_context.h"
 // clang-format on
 
-namespace ray {
-
-namespace raylet {
+namespace ray::raylet {
 
 using rpc::GcsNodeInfo;
 using rpc::NodeSnapshot;
@@ -109,6 +107,4 @@ class Raylet {
   local_stream_socket socket_;
 };
 
-}  // namespace raylet
-
-}  // namespace ray
+}  // namespace ray::raylet
diff --git a/src/ray/raylet/scheduling/cluster_task_manager.h b/src/ray/raylet/scheduling/cluster_task_manager.h
index 058c40f97fcf..752cbd6b3e98 100644
--- a/src/ray/raylet/scheduling/cluster_task_manager.h
+++ b/src/ray/raylet/scheduling/cluster_task_manager.h
@@ -161,6 +161,7 @@ class ClusterTaskManager : public ClusterTaskManagerInterface {
   const NodeID &self_node_id_;
 
   /// Responsible for resource tracking/view of the cluster.
+  /// TODO(hjiang): Use reference instead of shared pointer.
   std::shared_ptr<ClusterResourceScheduler> cluster_resource_scheduler_;
 
   /// Function to get the node information of a given node id.
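
An aside for reviewers on the Redis cluster hunks above: leader discovery works by issuing a throwaway `DEL DUMMY` and reading the `MOVED <slot> <ip>:<port>` error that a non-leader returns. `ParseIffMovedError` is defined elsewhere in the Ray tree and is not shown in this diff; the standalone sketch below only illustrates the parsing shape such a helper plausibly has. `ParseMovedReply` is a hypothetical name, not the real function.

```cpp
// Illustrative sketch only: parse a Redis cluster MOVED reply of the form
// "MOVED 14946 10.xx.xx.xx:7001" into an (ip, port) pair.
#include <cctype>
#include <optional>
#include <sstream>
#include <string>
#include <utility>

std::optional<std::pair<std::string, int>> ParseMovedReply(const std::string &error_msg) {
  std::istringstream iss(error_msg);
  std::string keyword, slot, ip_port;
  // Expect three whitespace-separated tokens: MOVED <slot> <ip>:<port>.
  if (!(iss >> keyword >> slot >> ip_port) || keyword != "MOVED") {
    return std::nullopt;
  }
  const auto colon = ip_port.rfind(':');
  if (colon == std::string::npos || colon + 1 >= ip_port.size()) {
    return std::nullopt;
  }
  // Parse the port digit by digit so a malformed reply yields nullopt
  // instead of throwing (as std::stoi would).
  int port = 0;
  for (size_t i = colon + 1; i < ip_port.size(); ++i) {
    if (!std::isdigit(static_cast<unsigned char>(ip_port[i]))) {
      return std::nullopt;
    }
    port = port * 10 + (ip_port[i] - '0');
  }
  return std::make_pair(ip_port.substr(0, colon), port);
}
```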
diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc
index 943ea89b24b5..5964063a3205 100644
--- a/src/ray/raylet/worker_pool.cc
+++ b/src/ray/raylet/worker_pool.cc
@@ -89,10 +89,10 @@ WorkerPool::WorkerPool(instrumented_io_context &io_service,
                        const std::vector<int> &worker_ports,
                        std::shared_ptr<gcs::GcsClient> gcs_client,
                        const WorkerCommandMap &worker_commands,
-                       const std::string &native_library_path,
+                       std::string native_library_path,
                        std::function<void()> starting_worker_timeout_callback,
                        int ray_debugger_external,
-                       const std::function<int64_t()> get_time)
+                       std::function<absl::Time()> get_time)
     : worker_startup_token_counter_(0),
       io_service_(&io_service),
       node_id_(node_id),
@@ -105,15 +105,15 @@ WorkerPool::WorkerPool(instrumented_io_context &io_service,
               ? RayConfig::instance().worker_maximum_startup_concurrency()
               : maximum_startup_concurrency),
       gcs_client_(std::move(gcs_client)),
-      native_library_path_(native_library_path),
-      starting_worker_timeout_callback_(starting_worker_timeout_callback),
+      native_library_path_(std::move(native_library_path)),
+      starting_worker_timeout_callback_(std::move(starting_worker_timeout_callback)),
       ray_debugger_external(ray_debugger_external),
       first_job_registered_python_worker_count_(0),
       first_job_driver_wait_num_python_workers_(
           std::min(num_prestarted_python_workers, maximum_startup_concurrency_)),
       num_prestart_python_workers(num_prestarted_python_workers),
       periodical_runner_(io_service),
-      get_time_(get_time) {
+      get_time_(std::move(get_time)) {
   RAY_CHECK_GT(maximum_startup_concurrency_, 0);
   // We need to record so that the metric exists. This way, we report that 0
   // processes have started before a task runs on the node (as opposed to the
@@ -593,7 +593,8 @@ void WorkerPool::MonitorPopWorkerRequestForRegistration(
     auto &requests = state.pending_registration_requests;
     auto it = std::find(requests.begin(), requests.end(), pop_worker_request);
     if (it != requests.end()) {
-      // Fail the task...
+      // Pop and fail the task...
+      requests.erase(it);
       PopWorkerStatus status = PopWorkerStatus::WorkerPendingRegistration;
       PopWorkerCallbackAsync(pop_worker_request->callback, nullptr, status);
     }
@@ -1042,7 +1043,7 @@ void WorkerPool::PushWorker(const std::shared_ptr<WorkerInterface> &worker) {
 }
 
 void WorkerPool::TryKillingIdleWorkers() {
-  int64_t now = get_time_();
+  const auto now = get_time_();
 
   // Filter out all idle workers that are already dead and/or associated with
   // jobs that have already finished.
@@ -1055,14 +1056,14 @@ void WorkerPool::TryKillingIdleWorkers() {
     }
 
     const auto &job_id = idle_worker->GetAssignedJobId();
-    if (finished_jobs_.count(job_id) > 0) {
+    if (finished_jobs_.contains(job_id)) {
       // The job has finished, so we should kill the worker immediately.
       KillIdleWorker(idle_worker, it->second);
       it = idle_of_all_languages_.erase(it);
     } else {
-      if (it->second == -1 ||
-          now - it->second >
-              RayConfig::instance().idle_worker_killing_time_threshold_ms()) {
+      if (now - it->second >
+          absl::Milliseconds(
+              RayConfig::instance().idle_worker_killing_time_threshold_ms())) {
         // The job has not yet finished and the worker has been idle for longer
         // than the timeout.
         num_killable_idle_workers++;
@@ -1084,9 +1085,9 @@ void WorkerPool::TryKillingIdleWorkers() {
   auto it = idle_of_all_languages_.begin();
   while (num_killable_idle_workers > num_desired_idle_workers &&
          it != idle_of_all_languages_.end()) {
-    if (it->second == -1 ||
-        now - it->second >
-            RayConfig::instance().idle_worker_killing_time_threshold_ms()) {
+    if (now - it->second >
+        absl::Milliseconds(
+            RayConfig::instance().idle_worker_killing_time_threshold_ms())) {
       RAY_LOG(DEBUG) << "Number of idle workers " << num_killable_idle_workers
                      << " is larger than the number of desired workers "
                      << num_desired_idle_workers << " killing idle worker with PID "
@@ -1101,7 +1102,7 @@ void WorkerPool::TryKillingIdleWorkers() {
 }
 
 void WorkerPool::KillIdleWorker(std::shared_ptr<WorkerInterface> idle_worker,
-                                int64_t last_time_used_ms) {
+                                absl::Time last_time_used) {
   // To avoid object lost issue caused by forcibly killing, send an RPC request to the
   // worker to allow it to do cleanup before exiting. We kill it anyway if the driver
   // is already exited.
@@ -1121,8 +1122,8 @@ void WorkerPool::KillIdleWorker(std::shared_ptr<WorkerInterface> idle_worker,
   }
   rpc_client->Exit(
       request,
-      [this, idle_worker, last_time_used_ms](const ray::Status &status,
-                                             const rpc::ExitReply &r) {
+      [this, idle_worker, last_time_used](const ray::Status &status,
+                                          const rpc::ExitReply &r) {
         RAY_CHECK(pending_exit_idle_workers_.erase(idle_worker->WorkerId()));
         if (!status.ok()) {
           RAY_LOG(ERROR) << "Failed to send exit request: " << status.ToString();
@@ -1148,8 +1149,7 @@ void WorkerPool::KillIdleWorker(std::shared_ptr<WorkerInterface> idle_worker,
           // kill the worker (e.g., when the worker owns the object). Without this,
           // if the first N workers own objects, it can't kill idle workers that are
           // >= N+1.
-          idle_of_all_languages_.push_back(
-              std::make_pair(idle_worker, last_time_used_ms));
+          idle_of_all_languages_.emplace_back(idle_worker, last_time_used);
         }
       });
 }
@@ -1310,7 +1310,7 @@ void WorkerPool::PopWorker(const TaskSpecification &task_spec,
   auto worker_fits_for_task_fn =
       [this, &pop_worker_request, &skip_reason_count](
-          const std::pair<std::shared_ptr<WorkerInterface>, int64_t> &pair) -> bool {
+          const std::pair<std::shared_ptr<WorkerInterface>, absl::Time> &pair) -> bool {
         const auto &worker = pair.first;
         WorkerUnfitForTaskReason reason = WorkerFitsForTask(*worker, *pop_worker_request);
         if (reason == WorkerUnfitForTaskReason::NONE) {
@@ -1543,6 +1543,7 @@ void WorkerPool::WarnAboutSize() {
                     << "some discussion of workarounds).";
     std::string warning_message_str = warning_message.str();
     RAY_LOG(WARNING) << warning_message_str;
+
     auto error_data_ptr = gcs::CreateErrorTableData(
         "worker_pool_large", warning_message_str, get_time_());
     RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h
index ef2e1e048635..3d7f456f82cb 100644
--- a/src/ray/raylet/worker_pool.h
+++ b/src/ray/raylet/worker_pool.h
@@ -25,6 +25,7 @@
 #include <utility>
 #include <vector>
 
+#include "absl/time/time.h"
 #include "ray/common/asio/instrumented_io_context.h"
 #include "ray/common/asio/periodical_runner.h"
 #include "ray/common/client_connection.h"
@@ -200,7 +201,7 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   /// it times out to start a worker.
   /// \param ray_debugger_external Ray debugger in workers will be started in a way
   /// that they are accessible from outside the node.
-  /// \param get_time A callback to get the current time.
+  /// \param get_time A callback to get the current time as an absl::Time.
   WorkerPool(instrumented_io_context &io_service,
              const NodeID node_id,
              const std::string node_address,
@@ -212,10 +213,10 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
              const std::vector<int> &worker_ports,
              std::shared_ptr<gcs::GcsClient> gcs_client,
              const WorkerCommandMap &worker_commands,
-             const std::string &native_library_path,
+             std::string native_library_path,
              std::function<void()> starting_worker_timeout_callback,
              int ray_debugger_external,
-             const std::function<int64_t()> get_time);
+             std::function<absl::Time()> get_time);
 
   /// Destructor responsible for freeing a set of workers owned by this class.
   virtual ~WorkerPool() override;
@@ -472,7 +473,7 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   /// TODO(scv119): replace dynamic options by runtime_env.
   const std::vector<std::string> &LookupWorkerDynamicOptions(StartupToken token) const;
 
-  void KillIdleWorker(std::shared_ptr<WorkerInterface> worker, int64_t last_time_used_ms);
+  void KillIdleWorker(std::shared_ptr<WorkerInterface> worker, absl::Time last_time_used);
 
   /// Global startup token variable. Incremented once assigned
   /// to a worker process and is added to
@@ -587,7 +588,8 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   /// The pool of idle non-actor workers of all languages. This is used to kill idle
   /// workers in FIFO order. The second element of std::pair is the time a worker becomes
   /// idle.
-  std::list<std::pair<std::shared_ptr<WorkerInterface>, int64_t>> idle_of_all_languages_;
+  std::list<std::pair<std::shared_ptr<WorkerInterface>, absl::Time>>
+      idle_of_all_languages_;
 
  private:
   /// A helper function that returns the reference of the pool state
@@ -792,7 +794,7 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface {
   PeriodicalRunner periodical_runner_;
   /// A callback to get the current time.
-  const std::function<int64_t()> get_time_;
+  const std::function<absl::Time()> get_time_;
   /// Runtime env manager client.
   std::shared_ptr<RuntimeEnvAgentClient> runtime_env_agent_client_;
   /// Stats
diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc
index d945384b7277..022c5055522a 100644
--- a/src/ray/raylet/worker_pool_test.cc
+++ b/src/ray/raylet/worker_pool_test.cc
@@ -14,8 +14,10 @@
 
 #include "ray/raylet/worker_pool.h"
 
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "absl/time/time.h"
 #include "nlohmann/json.hpp"
 #include "ray/common/asio/asio_util.h"
 #include "ray/common/asio/instrumented_io_context.h"
@@ -26,9 +28,8 @@
 #include "src/ray/protobuf/runtime_env_agent.pb.h"
 
 using json = nlohmann::json;
-namespace ray {
 
-namespace raylet {
+namespace ray::raylet {
 
 int MAXIMUM_STARTUP_CONCURRENCY = 15;
 int PYTHON_PRESTART_WORKERS = 15;
@@ -140,7 +141,7 @@ class WorkerPoolMock : public WorkerPool {
                    "",
                    []() {},
                    0,
-                   [this]() { return current_time_ms_; }),
+                   [this]() { return absl::FromUnixMillis(current_time_ms_); }),
         last_worker_process_(),
         instrumented_io_service_(io_service),
         error_message_type_(1),
@@ -241,7 +242,7 @@ class WorkerPoolMock : public WorkerPool {
 
   size_t GetIdleWorkerSize() { return idle_of_all_languages_.size(); }
 
-  std::list<std::pair<std::shared_ptr<WorkerInterface>, int64_t>> &GetIdleWorkers() {
+  std::list<std::pair<std::shared_ptr<WorkerInterface>, absl::Time>> &GetIdleWorkers() {
     return idle_of_all_languages_;
   }
@@ -1928,6 +1929,19 @@ TEST_F(WorkerPoolDriverRegisteredTest, PopWorkerStatus) {
   worker_pool_->ClearProcesses();
 }
 
+TEST_F(WorkerPoolDriverRegisteredTest, WorkerPendingRegistrationErasesRequest) {
+  std::shared_ptr<WorkerInterface> popped_worker;
+  PopWorkerStatus status;
+  auto task_spec = ExampleTaskSpec();
+  // Create a task without push worker. It should time out (WorkerPendingRegistration).
+  popped_worker = worker_pool_->PopWorkerSync(task_spec, false, &status);
+  ASSERT_EQ(popped_worker, nullptr);
+  ASSERT_EQ(status, PopWorkerStatus::WorkerPendingRegistration);
+  // The request should be erased.
+  ASSERT_EQ(worker_pool_->NumPendingRegistrationRequests(), 0);
+  worker_pool_->ClearProcesses();
+}
+
 TEST_F(WorkerPoolDriverRegisteredTest, TestIOWorkerFailureAndSpawn) {
   std::unordered_set<std::shared_ptr<WorkerInterface>> spill_worker_set;
   auto spill_worker_callback =
@@ -2138,9 +2152,7 @@ TEST_F(WorkerPoolTest, RegisterFirstJavaDriverCallbackImmediately) {
   ASSERT_TRUE(callback_called);
 }
 
-}  // namespace raylet
-
-}  // namespace ray
+}  // namespace ray::raylet
 
 int main(int argc, char **argv) {
   InitShutdownRAII ray_log_shutdown_raii(
diff --git a/src/ray/raylet_client/raylet_client.cc b/src/ray/raylet_client/raylet_client.cc
index 7911a3ce0a86..5eff4538f837 100644
--- a/src/ray/raylet_client/raylet_client.cc
+++ b/src/ray/raylet_client/raylet_client.cc
@@ -370,10 +370,10 @@ void raylet::RayletClient::ReportWorkerBacklog(
   request.set_worker_id(worker_id.Binary());
   request.mutable_backlog_reports()->Add(backlog_reports.begin(), backlog_reports.end());
   grpc_client_->ReportWorkerBacklog(
-      request, [](const Status &status, rpc::ReportWorkerBacklogReply &&reply) {
-        if (!status.ok()) {
-          RAY_LOG(INFO) << "Error reporting task backlog information: " << status;
-        }
+      request,
+      [](const Status &status, rpc::ReportWorkerBacklogReply &&reply /*unused*/) {
+        RAY_LOG_IF_ERROR(INFO, status)
+            << "Error reporting task backlog information: " << status;
       });
 }
@@ -389,12 +389,10 @@ Status raylet::RayletClient::ReturnWorker(
   request.set_disconnect_worker(disconnect_worker);
   request.set_disconnect_worker_error_detail(disconnect_worker_error_detail);
   request.set_worker_exiting(worker_exiting);
-  grpc_client_->ReturnWorker(request,
-                             [](const Status &status, rpc::ReturnWorkerReply &&reply) {
-                               if (!status.ok()) {
-                                 RAY_LOG(INFO) << "Error returning worker: " << status;
-                               }
-                             });
+  grpc_client_->ReturnWorker(
+      request, [](const Status &status, rpc::ReturnWorkerReply &&reply /*unused*/) {
+        RAY_LOG_IF_ERROR(INFO, status) << "Error returning worker: " << status;
+      });
   return Status::OK();
 }
@@ -405,9 +403,7 @@ void raylet::RayletClient::GetTaskFailureCause(
   request.set_task_id(task_id.Binary());
   grpc_client_->GetTaskFailureCause(
       request, [callback](const Status &status, rpc::GetTaskFailureCauseReply &&reply) {
-        if (!status.ok()) {
-          RAY_LOG(INFO) << "Error getting task result: " << status;
-        }
+        RAY_LOG_IF_ERROR(INFO, status) << "Error getting task result: " << status;
         callback(status, std::move(reply));
       });
 }
@@ -459,9 +455,7 @@ void raylet::RayletClient::PushMutableObject(
   // TODO: Add failure recovery, retries, and timeout.
   grpc_client_->PushMutableObject(
       request, [callback](const Status &status, rpc::PushMutableObjectReply &&reply) {
-        if (!status.ok()) {
-          RAY_LOG(ERROR) << "Error pushing mutable object: " << status;
-        }
+        RAY_LOG_IF_ERROR(ERROR, status) << "Error pushing mutable object: " << status;
         if (reply.done()) {
           // The callback is only executed once the receiver node receives all chunks
           // for the mutable object write.
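
A note on the worker-pool hunks above: replacing raw `int64_t` millisecond timestamps with `absl::Time` is why the old `it->second == -1` sentinel check disappears; once the "never used" timestamp is an extreme `absl::Time` value, the single duration comparison against `absl::Milliseconds(threshold)` subsumes it. A minimal sketch of that comparison, using assumed names (`kIdleKillThreshold`, `idle_since`) rather than Ray's actual config plumbing:

```cpp
// Sketch of the typed idle-timeout check; assumes abseil is linked.
#include <iostream>

#include "absl/time/time.h"
#include "absl/time/clock.h"

int main() {
  const absl::Duration kIdleKillThreshold = absl::Milliseconds(1000);
  const absl::Time now = absl::Now();

  // A worker idle since two seconds ago exceeds the one-second threshold.
  const absl::Time idle_since = now - absl::Seconds(2);
  if (now - idle_since > kIdleKillThreshold) {
    std::cout << "kill idle worker\n";
  }

  // An unset timestamp modeled as absl::InfinitePast() also trips the check,
  // which is why a separate `== -1` sentinel comparison is no longer needed.
  const absl::Time never_used = absl::InfinitePast();
  if (now - never_used > kIdleKillThreshold) {
    std::cout << "kill never-used worker\n";
  }
  return 0;
}
```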
diff --git a/src/ray/util/BUILD b/src/ray/util/BUILD
index 23d9f1e90150..87f8a57e8dea 100644
--- a/src/ray/util/BUILD
+++ b/src/ray/util/BUILD
@@ -55,3 +55,13 @@ cc_library(
     srcs = ["thread_checker.cc"],
     visibility = ["//visibility:public"],
 )
+
+cc_library(
+    name = "shared_lru",
+    hdrs = ["shared_lru.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":util",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc
index 12737eabed7e..7562b3e2ecce 100644
--- a/src/ray/util/logging.cc
+++ b/src/ray/util/logging.cc
@@ -27,14 +27,17 @@
 #endif
 
 #include <algorithm>
+#include <array>
 #include <cstdlib>
 #include <iostream>
 #include <memory>
 #include <sstream>
+#include <string_view>
 
 #include "absl/debugging/failure_signal_handler.h"
 #include "absl/debugging/stacktrace.h"
 #include "absl/debugging/symbolize.h"
+#include "absl/strings/numbers.h"
 #include "absl/strings/str_format.h"
 #include "nlohmann/json.hpp"
 #include "ray/util/event_label.h"
@@ -301,10 +304,8 @@ void RayLog::InitLogFormat() {
   log_format_json_ = false;
   log_format_pattern_ = kLogFormatTextPattern;
 
-  const char *var_value = std::getenv("RAY_BACKEND_LOG_JSON");
-  if (var_value != nullptr) {
-    std::string data = var_value;
-    if (data == "1") {
+  if (const char *var_value = std::getenv("RAY_BACKEND_LOG_JSON"); var_value != nullptr) {
+    if (std::string_view{var_value} == std::string_view{"1"}) {
       log_format_json_ = true;
       log_format_pattern_ = kLogFormatJsonPattern;
     }
@@ -321,7 +322,9 @@ void RayLog::StartRayLog(const std::string &app_name,
   log_dir_ = log_dir;
 
   // All the logging sinks to add.
-  std::vector<spdlog::sink_ptr> sinks;
+  // One for file/stdout, another for stderr.
+  std::array<spdlog::sink_ptr, 2> sinks;  // Intentionally no initialization.
+
   auto level = GetMappedSeverity(severity_threshold_);
   std::string app_name_without_path = app_name;
   if (app_name.empty()) {
@@ -343,17 +346,20 @@ void RayLog::StartRayLog(const std::string &app_name,
 #endif
   // Reset log pattern and level and we assume a log file can be rotated with
   // 10 files in max size 512M by default.
-  if (std::getenv("RAY_ROTATION_MAX_BYTES")) {
-    long max_size = std::atol(std::getenv("RAY_ROTATION_MAX_BYTES"));
-    // 0 means no log rotation in python, but not in spdlog. We just use the default
-    // value here.
-    if (max_size != 0) {
+  if (const char *ray_rotation_max_bytes = std::getenv("RAY_ROTATION_MAX_BYTES");
+      ray_rotation_max_bytes != nullptr) {
+    long max_size = 0;
+    if (absl::SimpleAtoi(ray_rotation_max_bytes, &max_size) && max_size > 0) {
+      // 0 means no log rotation in python, but not in spdlog. We just use the default
+      // value here.
       log_rotation_max_size_ = max_size;
     }
   }
+
-  if (std::getenv("RAY_ROTATION_BACKUP_COUNT")) {
-    long file_num = std::atol(std::getenv("RAY_ROTATION_BACKUP_COUNT"));
-    if (file_num != 0) {
+  if (const char *ray_rotation_backup_count = std::getenv("RAY_ROTATION_BACKUP_COUNT");
+      ray_rotation_backup_count != nullptr) {
+    long file_num = 0;
+    if (absl::SimpleAtoi(ray_rotation_backup_count, &file_num) && file_num > 0) {
      log_rotation_file_num_ = file_num;
    }
  }
@@ -370,23 +376,24 @@ void RayLog::StartRayLog(const std::string &app_name,
                                                              log_rotation_max_size_,
                                                              log_rotation_file_num_);
     file_sink->set_level(level);
-    sinks.push_back(file_sink);
+    sinks[0] = std::move(file_sink);
   } else {
     component_name_ = app_name_without_path;
     auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
     console_sink->set_level(level);
-    sinks.push_back(console_sink);
+    sinks[0] = std::move(console_sink);
   }
 
   // In all cases, log errors to the console log so they are in driver logs.
   // https://github.com/ray-project/ray/issues/12893
   auto err_sink = std::make_shared<spdlog::sinks::stderr_color_sink_mt>();
   err_sink->set_level(spdlog::level::err);
-  sinks.push_back(err_sink);
+  sinks[1] = std::move(err_sink);
 
   // Set the combined logger.
-  auto logger = std::make_shared<spdlog::logger>(
-      RayLog::GetLoggerName(), sinks.begin(), sinks.end());
+  auto logger = std::make_shared<spdlog::logger>(RayLog::GetLoggerName(),
+                                                 std::make_move_iterator(sinks.begin()),
+                                                 std::make_move_iterator(sinks.end()));
   logger->set_level(level);
   // Set the pattern of all sinks.
   logger->set_pattern(log_format_pattern_);
diff --git a/src/ray/util/logging.h b/src/ray/util/logging.h
index bea9c0b5de44..b5e06f4b083a 100644
--- a/src/ray/util/logging.h
+++ b/src/ray/util/logging.h
@@ -129,6 +129,11 @@ enum class RayLogLevel {
   if (ray::RayLog::IsLevelEnabled(ray::RayLogLevel::level)) \
   RAY_LOG_INTERNAL(ray::RayLogLevel::level)
 
+// `cond` is a `Status` class, could be `ray::Status`, or from third-party like
+// `grpc::Status`.
+#define RAY_LOG_IF_ERROR(level, cond) \
+  if (RAY_PREDICT_FALSE(!(cond).ok())) RAY_LOG(level)
+
 #define RAY_IGNORE_EXPR(expr) ((void)(expr))
 
 #define RAY_CHECK(condition) \
diff --git a/src/ray/util/shared_lru.h b/src/ray/util/shared_lru.h
new file mode 100644
index 000000000000..8132e38b6f12
--- /dev/null
+++ b/src/ray/util/shared_lru.h
@@ -0,0 +1,207 @@
+// Copyright 2024 The Ray Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// SharedLruCache is an LRU cache, with all entries shared, which means a single entry
+// could be accessed by multiple getters. All values are wrapped in shared pointers to
+// avoid copies at get operations, which is also useful to maintain memory validity at
+// any time.
+//
+// Example usage:
+// SharedLruCache<std::string, std::string> cache{cap};
+// // Put a key-value pair into cache.
+// cache.Put("key", std::make_shared<std::string>("val"));
+//
+// // Get a key-value pair from cache.
+// auto val = cache.Get("key");
+// // Check and consume `val`.
+//
+// TODO(hjiang):
+// 1. Add template arguments for key hash and key equal, to pass into absl::flat_hash_map.
+// 2. Provide a key hash wrapper to save a copy.
+// 3. flat hash map supports heterogeneous lookup, expose `KeyLike` templated interface.
+// 4. Add a `GetOrCreate` interface, which takes a factory function to create the value.
+// 5. For thread-safe cache, add a sharded container wrapper to reduce lock contention.
+
+#pragma once
+
+#include <cstddef>
+#include <functional>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "src/ray/util/logging.h"
+
+namespace ray::utils::container {
+
+template <typename Key, typename Val>
+class SharedLruCache final {
+ public:
+  using key_type = Key;
+  using mapped_type = Val;
+
+  // A `max_entries` of 0 means that there is no limit on the number of entries
+  // in the cache.
+  explicit SharedLruCache(size_t max_entries) : max_entries_(max_entries) {}
+
+  SharedLruCache(const SharedLruCache &) = delete;
+  SharedLruCache &operator=(const SharedLruCache &) = delete;
+
+  ~SharedLruCache() = default;
+
+  // Insert `value` with key `key`. This will replace any previous entry with
+  // the same key.
+  void Put(Key key, std::shared_ptr<Val> value) {
+    RAY_CHECK(value != nullptr);
+    auto iter = cache_.find(key);
+    if (iter != cache_.end()) {
+      lru_list_.splice(lru_list_.begin(), lru_list_, iter->second.lru_iterator);
+      iter->second.value = std::move(value);
+      return;
+    }
+
+    lru_list_.emplace_front(key);
+    Entry new_entry{std::move(value), lru_list_.begin()};
+    cache_[std::move(key)] = std::move(new_entry);
+
+    if (max_entries_ > 0 && lru_list_.size() > max_entries_) {
+      const auto &stale_key = lru_list_.back();
+      cache_.erase(stale_key);
+      lru_list_.pop_back();
+    }
+
+    RAY_CHECK_EQ(lru_list_.size(), cache_.size());
+  }
+
+  // Delete the entry with key `key`. Return true if the entry was found for
+  // `key`, false if the entry was not found. In both cases, no entry with
+  // key `key` exists after the call.
+  bool Delete(const Key &key) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return false;
+    }
+    lru_list_.erase(it->second.lru_iterator);
+    cache_.erase(it);
+    return true;
+  }
+
+  // Look up the entry with key `key`. Return nullptr if key doesn't exist.
+  std::shared_ptr<Val> Get(const Key &key) {
+    const auto cache_iter = cache_.find(key);
+    if (cache_iter == cache_.end()) {
+      return nullptr;
+    }
+    lru_list_.splice(lru_list_.begin(), lru_list_, cache_iter->second.lru_iterator);
+    return cache_iter->second.value;
+  }
+
+  // Clear the cache.
+  void Clear() {
+    cache_.clear();
+    lru_list_.clear();
+  }
+
+  // Accessors for cache parameters.
+  size_t max_entries() const { return max_entries_; }
+
+ private:
+  struct Entry {
+    // The entry's value.
+    std::shared_ptr<Val> value;
+
+    // A list iterator pointing to the entry's position in the LRU list.
+    typename std::list<Key>::iterator lru_iterator;
+  };
+
+  using EntryMap = absl::flat_hash_map<Key, Entry>;
+
+  // The maximum number of entries in the cache. A value of 0 means there is no
+  // limit on entry count.
+  const size_t max_entries_;
+
+  // Stores key-value pairs.
+  EntryMap cache_;
+
+  // The LRU list of entries. The front of the list identifies the most
+  // recently accessed entry.
+  std::list<Key> lru_list_;
+};
+
+// Same interfaces as `SharedLruCache`, but all cached values are
+// `const`-specified to avoid concurrent updates.
+template <typename K, typename V>
+using SharedLruConstCache = SharedLruCache<K, const V>;
+
+// Same interface and functionality as `SharedLruCache`, but thread-safe version.
+template <typename Key, typename Val>
+class ThreadSafeSharedLruCache final {
+ public:
+  using key_type = Key;
+  using mapped_type = Val;
+
+  // A `max_entries` of 0 means that there is no limit on the number of entries
+  // in the cache.
+  explicit ThreadSafeSharedLruCache(size_t max_entries) : cache_(max_entries) {}
+
+  ThreadSafeSharedLruCache(const ThreadSafeSharedLruCache &) = delete;
+  ThreadSafeSharedLruCache &operator=(const ThreadSafeSharedLruCache &) = delete;
+
+  ~ThreadSafeSharedLruCache() = default;
+
+  // Insert `value` with key `key`. This will replace any previous entry with
+  // the same key.
+  void Put(Key key, std::shared_ptr<Val> value) {
+    std::lock_guard<std::mutex> lck(mu_);
+    cache_.Put(std::move(key), std::move(value));
+  }
+
+  // Delete the entry with key `key`. Return true if the entry was found for
+  // `key`, false if the entry was not found. In both cases, no entry with
+  // key `key` exists after the call.
+  bool Delete(const Key &key) {
+    std::lock_guard<std::mutex> lck(mu_);
+    return cache_.Delete(key);
+  }
+
+  // Look up the entry with key `key`. Return nullptr if key doesn't exist.
+  // If found, return a copy of the shared pointer to the value.
+  std::shared_ptr<Val> Get(const Key &key) {
+    std::lock_guard<std::mutex> lck(mu_);
+    return cache_.Get(key);
+  }
+
+  // Clear the cache.
+  void Clear() {
+    std::lock_guard<std::mutex> lck(mu_);
+    cache_.Clear();
+  }
+
+  // Accessors for cache parameters.
+  size_t max_entries() const { return cache_.max_entries(); }
+
+ private:
+  std::mutex mu_;
+  SharedLruCache<Key, Val> cache_;
+};
+
+// Same interfaces as `SharedLruCache`, but all cached values are
+// `const`-specified to avoid concurrent updates.
+template <typename K, typename V>
+using ThreadSafeSharedLruConstCache = ThreadSafeSharedLruCache<K, const V>;
+
+}  // namespace ray::utils::container
diff --git a/src/ray/util/tests/BUILD b/src/ray/util/tests/BUILD
index 2941d105cf91..b85c01f28ebf 100644
--- a/src/ray/util/tests/BUILD
+++ b/src/ray/util/tests/BUILD
@@ -194,3 +194,15 @@ cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+cc_test(
+    name = "shared_lru_test",
+    srcs = ["shared_lru_test.cc"],
+    deps = [
+        "//src/ray/util:shared_lru",
+        "@com_google_googletest//:gtest_main",
+    ],
+    size = "small",
+    copts = COPTS,
+    tags = ["team:core"],
+)
diff --git a/src/ray/util/tests/shared_lru_test.cc b/src/ray/util/tests/shared_lru_test.cc
new file mode 100644
index 000000000000..7c47f4d1daf0
--- /dev/null
+++ b/src/ray/util/tests/shared_lru_test.cc
@@ -0,0 +1,76 @@
+// Copyright 2024 The Ray Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/ray/util/shared_lru.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <type_traits>
+
+namespace ray::utils::container {
+
+namespace {
+constexpr size_t kTestCacheSz = 1;
+}  // namespace
+
+TEST(SharedLruCache, PutAndGet) {
+  ThreadSafeSharedLruCache<std::string, std::string> cache{kTestCacheSz};
+
+  // No value initially.
+  auto val = cache.Get("1");
+  EXPECT_EQ(val, nullptr);
+
+  // Check put and get.
+  cache.Put("1", std::make_shared<std::string>("1"));
+  val = cache.Get("1");
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(*val, "1");
+
+  // Check key eviction.
+  cache.Put("2", std::make_shared<std::string>("2"));
+  val = cache.Get("1");
+  EXPECT_EQ(val, nullptr);
+  val = cache.Get("2");
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(*val, "2");
+
+  // Check deletion.
+  EXPECT_FALSE(cache.Delete("1"));
+  EXPECT_TRUE(cache.Delete("2"));
+  val = cache.Get("2");
+  EXPECT_EQ(val, nullptr);
+}
+
+// Testing scenario: put the same key into the cache multiple times.
+TEST(SharedLruCache, SameKeyTest) {
+  ThreadSafeSharedLruCache<int, int> cache{2};
+
+  cache.Put(1, std::make_shared<int>(1));
+  auto val = cache.Get(1);
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(1, *val);
+
+  cache.Put(1, std::make_shared<int>(2));
+  val = cache.Get(1);
+  EXPECT_NE(val, nullptr);
+  EXPECT_EQ(2, *val);
+}
+
+TEST(SharedLruConstCache, TypeAliasAssertion) {
+  static_assert(
+      std::is_same_v<SharedLruConstCache<int, int>, SharedLruCache<int, const int>>);
+}
+
+}  // namespace ray::utils::container
diff --git a/src/ray/util/thread_checker.cc b/src/ray/util/thread_checker.cc
index 73a0072c7575..0f33dfd4c712 100644
--- a/src/ray/util/thread_checker.cc
+++ b/src/ray/util/thread_checker.cc
@@ -16,7 +16,7 @@
 
 namespace ray {
 
-bool ThreadChecker::IsOnSameThread() {
+bool ThreadChecker::IsOnSameThread() const {
   const auto cur_id = std::this_thread::get_id();
   std::thread::id uninitialized_id;
   return thread_id_.compare_exchange_strong(uninitialized_id, cur_id) ||
diff --git a/src/ray/util/thread_checker.h b/src/ray/util/thread_checker.h
index 622624859b75..2e3dcf1ed3df 100644
--- a/src/ray/util/thread_checker.h
+++ b/src/ray/util/thread_checker.h
@@ -34,10 +34,10 @@ class ThreadChecker {
  public:
   // Return true at initialization, or current invocation happens on the same thread as
   // initialization.
-  bool IsOnSameThread();
+  bool IsOnSameThread() const;
 
  private:
-  std::atomic<std::thread::id> thread_id_{};
+  mutable std::atomic<std::thread::id> thread_id_{};
 };
 
 }  // namespace ray
diff --git a/src/ray/util/util.cc b/src/ray/util/util.cc
index 478fec48d157..35b2d7c51144 100644
--- a/src/ray/util/util.cc
+++ b/src/ray/util/util.cc
@@ -35,11 +35,12 @@
 #include "ray/util/filesystem.h"
 #include "ray/util/logging.h"
 
+namespace {
 /// Uses sscanf() to read a token matching from the string, advancing the iterator.
 /// \param c_str A string iterator that is dereferenceable. (i.e.: c_str < string::end())
 /// \param format The pattern. It must not produce any output. (e.g., use %*d, not %d.)
 /// \return The scanned prefix of the string, if any.
-static std::string ScanToken(std::string::const_iterator &c_str, std::string format) {
+std::string ScanToken(std::string::const_iterator &c_str, std::string format) {
   int i = 0;
   std::string result;
   format += "%n";
@@ -49,6 +50,7 @@ static std::string ScanToken(std::string::const_iterator &c_str, std::string for
   }
   return result;
 }
+}  // namespace
 
 std::string EndpointToUrl(
     const boost::asio::generic::basic_endpoint<boost::asio::generic::stream_protocol> &ep,
@@ -58,7 +60,7 @@ std::string EndpointToUrl(
   case AF_INET: {
     scheme = "tcp://";
     boost::asio::ip::tcp::endpoint e(boost::asio::ip::tcp::v4(), 0);
-    RAY_CHECK(e.size() == ep.size());
+    RAY_CHECK_EQ(e.size(), ep.size());
     const sockaddr *src = ep.data();
     sockaddr *dst = e.data();
     *reinterpret_cast<sockaddr_in *>(dst) = *reinterpret_cast<const sockaddr_in *>(src);
@@ -70,7 +72,7 @@ std::string EndpointToUrl(
   case AF_INET6: {
     scheme = "tcp://";
     boost::asio::ip::tcp::endpoint e(boost::asio::ip::tcp::v6(), 0);
-    RAY_CHECK(e.size() == ep.size());
+    RAY_CHECK_EQ(e.size(), ep.size());
     const sockaddr *src = ep.data();
     sockaddr *dst = e.data();
     *reinterpret_cast<sockaddr_in6 *>(dst) = *reinterpret_cast<const sockaddr_in6 *>(src);
diff --git a/src/ray/util/util.h b/src/ray/util/util.h
index e7f7a1d96781..05ce88c7ee47 100644
--- a/src/ray/util/util.h
+++ b/src/ray/util/util.h
@@ -62,22 +62,13 @@
 #endif
 
 // Boost forward-declarations (to avoid forcing slow header inclusions)
-namespace boost {
-
-namespace asio {
-
-namespace generic {
+namespace boost::asio::generic {
 
 template <class Protocol>
 class basic_endpoint;
-
 class stream_protocol;
 
-}  // namespace generic
-
-}  // namespace asio
-
-}  // namespace boost
+}  // namespace boost::asio::generic
 
 enum class CommandLineSyntax { System, POSIX, Windows };
@@ -302,12 +293,19 @@ inline void unsetEnv(const std::string &name) {
   RAY_CHECK_EQ(ret, 0) << "Failed to unset env var " << name;
 }
 
+// Set [thread_name] for the current thread; if it fails, an error is logged.
+// NOTICE: It only works on macOS and Linux.
 inline void SetThreadName(const std::string &thread_name) {
+  int ret = 0;
 #if defined(__APPLE__)
-  pthread_setname_np(thread_name.c_str());
+  ret = pthread_setname_np(thread_name.c_str());
 #elif defined(__linux__)
-  pthread_setname_np(pthread_self(), thread_name.substr(0, 15).c_str());
+  ret = pthread_setname_np(pthread_self(), thread_name.substr(0, 15).c_str());
 #endif
+  // pthread functions return the error code directly (they do not set errno).
+  if (ret != 0) {
+    RAY_LOG(ERROR) << "Failed to set thread name to " << thread_name << " since "
+                   << strerror(ret);
+  }
 }
 
 inline std::string GetThreadName() {