Skip to content

Commit

Permalink
Merge branch 'master' into remove-python-gcs-client
Browse files Browse the repository at this point in the history
  • Loading branch information
rynewang authored Dec 3, 2024
2 parents 3b64b82 + 0a11215 commit 04f4cd0
Show file tree
Hide file tree
Showing 141 changed files with 2,392 additions and 887 deletions.
4 changes: 2 additions & 2 deletions ci/ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,8 @@ test_cpp() {
BAZEL_EXPORT_OPTIONS=($(./ci/run/bazel_export_options))
bazel test --config=ci "${BAZEL_EXPORT_OPTIONS[@]}" --test_strategy=exclusive //cpp:all --build_tests_only
# run cluster mode test with external cluster
bazel test //cpp:cluster_mode_test --test_arg=--external_cluster=true --test_arg=--redis_password="1234" \
--test_arg=--ray_redis_password="1234"
bazel test //cpp:cluster_mode_test --test_arg=--external_cluster=true \
--test_arg=--ray_redis_password="1234" --test_arg=--ray_redis_username="default"
bazel test --test_output=all //cpp:test_python_call_cpp

# run the cpp example, currently does not work on mac
Expand Down
3 changes: 3 additions & 0 deletions cpp/include/ray/api/ray_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class RayConfig {

/* The following are unstable parameters and their use is discouraged. */

// Prevents external clients without the username from connecting to Redis if provided.
boost::optional<std::string> redis_username_;

// Prevents external clients without the password from connecting to Redis if provided.
boost::optional<std::string> redis_password_;

Expand Down
16 changes: 16 additions & 0 deletions cpp/src/ray/config_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@

ABSL_FLAG(std::string, ray_address, "", "The address of the Ray cluster to connect to.");

/// absl::flags does not provide an IsDefaultValue method, so use a non-empty dummy
/// default value. Comparing the current value against this default is how the code
/// tells "flag not passed" apart from an explicitly passed empty Redis username.
ABSL_FLAG(std::string,
ray_redis_username,
"absl::flags dummy default value",
"Prevents external clients without the username from connecting to Redis "
"if provided.");

/// absl::flags does not provide an IsDefaultValue method, so use a non-empty dummy default
/// value to support an empty Redis password.
ABSL_FLAG(std::string,
Expand Down Expand Up @@ -119,6 +127,9 @@ void ConfigInternal::Init(RayConfig &config, int argc, char **argv) {
if (!config.code_search_path.empty()) {
code_search_path = config.code_search_path;
}
if (config.redis_username_) {
redis_username = *config.redis_username_;
}
if (config.redis_password_) {
redis_password = *config.redis_password_;
}
Expand Down Expand Up @@ -146,6 +157,11 @@ void ConfigInternal::Init(RayConfig &config, int argc, char **argv) {
if (!FLAGS_ray_address.CurrentValue().empty()) {
SetBootstrapAddress(FLAGS_ray_address.CurrentValue());
}
// Don't rewrite `ray_redis_username` when it is not set in the command line.
if (FLAGS_ray_redis_username.CurrentValue() !=
FLAGS_ray_redis_username.DefaultValue()) {
redis_username = FLAGS_ray_redis_username.CurrentValue();
}
// Don't rewrite `ray_redis_password` when it is not set in the command line.
if (FLAGS_ray_redis_password.CurrentValue() !=
FLAGS_ray_redis_password.DefaultValue()) {
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/ray/config_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class ConfigInternal {

int bootstrap_port = 6379;

std::string redis_username = "default";

std::string redis_password = "5241590000000000";

int node_manager_port = 0;
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/ray/test/cluster/cluster_mode_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ int cmd_argc = 0;
char **cmd_argv = nullptr;

// Flags read by RayClusterModeTest.FullTest. When --external_cluster is true, the test
// starts a Ray head node itself using the Redis port, username and password below, and
// sets the RayConfig address and Redis credentials to match that node.
ABSL_FLAG(bool, external_cluster, false, "");
ABSL_FLAG(std::string, redis_username, "default", "");
ABSL_FLAG(std::string, redis_password, "12345678", "");
ABSL_FLAG(int32_t, redis_port, 6379, "");

Expand Down Expand Up @@ -67,10 +68,13 @@ TEST(RayClusterModeTest, FullTest) {
"--num-cpus", "2", "--resources", "{\"resource1\":1,\"resource2\":2}"};
if (absl::GetFlag<bool>(FLAGS_external_cluster)) {
auto port = absl::GetFlag<int32_t>(FLAGS_redis_port);
std::string username = absl::GetFlag<std::string>(FLAGS_redis_username);
std::string password = absl::GetFlag<std::string>(FLAGS_redis_password);
std::string local_ip = ray::internal::GetNodeIpAddress();
ray::internal::ProcessHelper::GetInstance().StartRayNode(local_ip, port, password);
ray::internal::ProcessHelper::GetInstance().StartRayNode(
local_ip, port, username, password);
config.address = local_ip + ":" + std::to_string(port);
config.redis_username_ = username;
config.redis_password_ = password;
}
ray::Init(config, cmd_argc, cmd_argv);
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/ray/util/process_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,16 @@ using ray::core::WorkerType;

void ProcessHelper::StartRayNode(const std::string node_id_address,
const int port,
const std::string redis_username,
const std::string redis_password,
const std::vector<std::string> &head_args) {
std::vector<std::string> cmdargs({"ray",
"start",
"--head",
"--port",
std::to_string(port),
"--redis-username",
redis_username,
"--redis-password",
redis_password,
"--node-ip-address",
Expand Down Expand Up @@ -82,6 +85,7 @@ void ProcessHelper::RayStart(CoreWorkerOptions::TaskExecutionCallback callback)
bootstrap_ip = GetNodeIpAddress();
StartRayNode(bootstrap_ip,
bootstrap_port,
ConfigInternal::Instance().redis_username,
ConfigInternal::Instance().redis_password,
ConfigInternal::Instance().head_args);
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/ray/util/process_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class ProcessHelper {
void RayStop();
/// Starts a Ray head node. The definition assembles a `ray start --head` command
/// line from these arguments.
/// \param node_id_address Address of the node; presumably the value given to
///        `--node-ip-address` -- TODO confirm in process_helper.cc.
/// \param port Value passed as `--port`.
/// \param redis_username Value passed as `--redis-username`.
/// \param redis_password Value passed as `--redis-password`.
/// \param head_args Additional arguments; defaults to empty. NOTE(review): how they
///        are applied is not visible here -- presumably appended to the command.
void StartRayNode(const std::string node_id_address,
const int port,
const std::string redis_username,
const std::string redis_password,
const std::vector<std::string> &head_args = {});
void StopRayNode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,105 +134,6 @@ Examine the FluentBit sidecar's STDOUT to see logs for Ray's component processes
kubectl logs raycluster-complete-logs-head-xxxxx -c fluentbit
```

(kuberay-fluentbit-ds)=
## Set up logging on a DaemonSet with Fluent Bit

Fluent Bit is a lightweight agent that allows you to collect logs from your Kubernetes cluster and send them to a variety of destinations such as Elasticsearch, CloudWatch, S3, etc.
The following steps set up [Fluent Bit][FluentBit] as a DaemonSet to send logs to CloudWatch Logs.

### Setup for AWS EKS

Create an Amazon EKS cluster named `fluent-bit-demo` in the `us-west-2` region using `eksctl`, as shown in the [EKS docs](https://docs.aws.amazon.com/eks/latest/userguide/getting-started-eksctl.html).

```shell
eksctl create cluster --name fluent-bit-demo --region us-west-2
```

View your cluster nodes:
```shell
$ kubectl get nodes -o wide
NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
ip-192-168-59-62.us-west-2.compute.internal Ready <none> 157m v1.27.5-eks-43840fb 192.168.59.62 54.190.144.241 Amazon Linux 2 5.10.192-183.736.amzn2.x86_64 containerd://1.6.19
ip-192-168-86-99.us-west-2.compute.internal Ready <none> 157m v1.27.5-eks-43840fb 192.168.86.99 34.219.16.107 Amazon Linux 2 5.10.192-183.736.amzn2.x86_64 containerd://1.6.19
```

EKS cluster nodes need to have access to CloudWatch Logs for Fluent Bit.
Attach the `CloudWatchLogsFullAccess` policy to the IAM role that is attached to the cluster nodes:

```shell
ROLE_NAME=$(eksctl get nodegroup --cluster fluent-bit-demo --region us-west-2 -o json | jq -r '.[].NodeInstanceRoleARN' | cut -f2 -d/)

aws iam attach-role-policy \
--role-name $ROLE_NAME \
--policy-arn arn:aws:iam::aws:policy/CloudWatchLogsFullAccess
```

### Deploy Fluent Bit DaemonSet

If you don't already have a namespace called `amazon-cloudwatch`, create one by entering the following command:

```bash
kubectl apply -f https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/cloudwatch-namespace.yaml
```

Run the following command to create a ConfigMap named `fluent-bit-cluster-info` with the cluster name and the Region to send logs to:

```shell
ClusterName=fluent-bit-demo
RegionName=us-west-2
FluentBitHttpPort='2020'
FluentBitReadFromHead='Off'
[[ ${FluentBitReadFromHead} = 'On' ]] && FluentBitReadFromTail='Off'|| FluentBitReadFromTail='On'
[[ -z ${FluentBitHttpPort} ]] && FluentBitHttpServer='Off' || FluentBitHttpServer='On'
kubectl create configmap fluent-bit-cluster-info \
--from-literal=cluster.name=${ClusterName} \
--from-literal=http.server=${FluentBitHttpServer} \
--from-literal=http.port=${FluentBitHttpPort} \
--from-literal=read.head=${FluentBitReadFromHead} \
--from-literal=read.tail=${FluentBitReadFromTail} \
--from-literal=logs.region=${RegionName} -n amazon-cloudwatch
```

Deploy the Fluent Bit DaemonSet to the cluster by running the following commands:

```shell
kubectl apply -f https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/fluent-bit/fluent-bit.yaml
```

Validate whether you successfully deployed Fluent Bit by entering the following command:

```shell
kubectl -n amazon-cloudwatch logs ds/fluent-bit
```

Verify that the command created log groups:

```shell
...
[2023/10/10 06:13:55] [ info] [output:cloudwatch_logs:cloudwatch_logs.0] Created log group /aws/containerinsights/fluent-bit-demo/application
[2023/10/10 06:13:57] [ info] [output:cloudwatch_logs:cloudwatch_logs.2] Created log group /aws/containerinsights/fluent-bit-demo/host
[2023/10/10 06:13:57] [ info] [output:cloudwatch_logs:cloudwatch_logs.1] Created log group /aws/containerinsights/fluent-bit-demo/dataplane
...
```

### Check the CloudWatch dashboard

Finally, check the CloudWatch dashboard to see the logs.
Open the CloudWatch console at https://console.aws.amazon.com/cloudwatch/.

Type `/aws/containerinsights/fluent-bit-demo/` in the search box.

![CloudWatch Dashboard](images/cloudwatch-dashboard.png)

Select `/aws/containerinsights/fluent-bit-demo/application`. You should see the logs from the application pods including Ray.

Under the log streams, click any log stream. You should see the logs from the pods:

![CloudWatch Logs](images/cloudwatch-logs.png)

You can specify filters based on pod name, namespace, etc.
Learn how to write filters in this [filter pattern syntax doc](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/FilterAndPatternSyntax.html#matching-terms-events).

[Vector]: https://vector.dev/
[FluentBit]: https://docs.fluentbit.io/manual
[FluentBitStorage]: https://docs.fluentbit.io/manual
Expand Down
2 changes: 1 addition & 1 deletion doc/source/cluster/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ For a quick demo, you can run Prometheus locally on your machine. Follow the qui

```{admonition} Note
:class: note
If you need to change the root temporary directory by using "--temp-dir" in your Ray
If you need to change the root temporary directory by using "--temp-dir" in your Ray
cluster setup, follow these [manual steps](#optional-manual-running-prometheus-locally) to set up Prometheus locally.
```

Expand Down
2 changes: 2 additions & 0 deletions doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@
]

# Configuration for algolia
# Note: This API key grants read access to our indexes and is intended to be public.
# See https://www.algolia.com/doc/guides/security/api-keys/ for more information.
docsearch_app_id = "LBHF0PABBL"
docsearch_api_key = "6c42f30d9669d8e42f6fc92f44028596"
docsearch_index_name = "docs-ray"
Expand Down
4 changes: 2 additions & 2 deletions doc/source/ray-core/fault_tolerance/gcs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Setting up Redis

.. code-block:: shell
RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD
RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD --redis-username default
.. tab-item:: ray up

Expand All @@ -46,7 +46,7 @@ Setting up Redis
head_start_ray_commands:
- ray stop
- ulimit -n 65536; RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
- ulimit -n 65536; RAY_REDIS_ADDRESS=redis_ip:port ray start --head --redis-password PASSWORD --redis-username default --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
.. tab-item:: Kubernetes

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
2 changes: 1 addition & 1 deletion doc/source/templates/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,4 @@ To add a template:
tags = ["exclusive", "team:ml", "ray_air", "gpu"],
env = {"SMOKE_TEST": "1"},
)
``` -->
``` -->
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
2 changes: 1 addition & 1 deletion doc/source/templates/testing/compute_configs/cpu/aws.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
2 changes: 1 addition & 1 deletion doc/source/templates/testing/compute_configs/gpu/aws.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ advanced_configurations_json:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Value: '24'
2 changes: 1 addition & 1 deletion doc/source/train/examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ examples:
- title: Fine-tune Llama3.1 with AWS Trainium
frameworks:
- pytorch
- aws neuron
- aws neuron
skill_level: advanced
use_cases:
- natural language processing
Expand Down
6 changes: 3 additions & 3 deletions doc/source/train/examples/aws-trainium/llama3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ When the EKS cluster is ready, create an Amazon ECR repository for building and

::

chmod +x 0-kuberay-trn1-llama3-finetune-build-image.sh
chmod +x 0-kuberay-trn1-llama3-finetune-build-image.sh
./0-kuberay-trn1-llama3-finetune-build-image.sh

4. Enter the zone your cluster is running in, for example: us-east-2.
Expand All @@ -61,7 +61,7 @@ Configuring Ray Cluster

The ``llama3.1_8B_finetune_ray_ptl_neuron`` directory in the AWS Neuron samples repository simplifies the
Ray configuration. KubeRay provides a manifest that you can apply
to the cluster to set up the head and worker pods.
to the cluster to set up the head and worker pods.

Run the following command to set up the Ray cluster:

Expand Down Expand Up @@ -100,4 +100,4 @@ The Ray cluster now ready to handle workloads. Initiate the data preparation and
3. Monitor the jobs via the Ray Dashboard


For detailed information on each of the steps above, see the `AWS documentation link <https://github.com/aws-neuron/aws-neuron-eks-samples/blob/master/llama3.1_8B_finetune_ray_ptl_neuron/README.md/>`__.
For detailed information on each of the steps above, see the `AWS documentation link <https://github.com/aws-neuron/aws-neuron-eks-samples/blob/master/llama3.1_8B_finetune_ray_ptl_neuron/README.md/>`__.
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,11 @@ public GcsClient getGcsClient() {
if (gcsClient == null) {
synchronized (this) {
if (gcsClient == null) {
gcsClient = new GcsClient(rayConfig.getBootstrapAddress(), rayConfig.redisPassword);
gcsClient =
new GcsClient(
rayConfig.getBootstrapAddress(),
rayConfig.redisUsername,
rayConfig.redisPassword);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public class RayConfig {
public String logDir;

private String bootstrapAddress;
public final String redisUsername;
public final String redisPassword;

// RPC socket name of object store.
Expand Down Expand Up @@ -170,6 +171,7 @@ public RayConfig(Config config) {
this.bootstrapAddress = null;
}

redisUsername = config.getString("ray.redis.username");
redisPassword = config.getString("ray.redis.password");
// Raylet node manager port.
if (config.hasPath("ray.raylet.node-manager-port")) {
Expand Down
Loading

0 comments on commit 04f4cd0

Please sign in to comment.