deploy/ibm_cloud_code_engine/example-cluster.yaml

# A unique identifier for the head node and workers of this cluster.
cluster_name: example-cluster

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 10

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 2 

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 1 

# Kubernetes resources that need to be configured for the autoscaler to be
# able to manage the Ray cluster. If any of the provided resources don't
# exist, the autoscaler will attempt to create them. If this fails, you may
# not have the required permissions and will have to request them to be
# created by your cluster administrator.
provider:
    type: kubernetes

    # Exposing external IP addresses for ray pods isn't currently supported.
    use_internal_ips: true

    # Namespace to use for all resources created.
    namespace:  

    services:
      # Service that maps to the head node of the Ray cluster.
      - apiVersion: v1
        kind: Service
        metadata:
            # NOTE: If you're running multiple Ray clusters with services
            # on one Kubernetes cluster, they must have unique service
            # names.
            name: example-cluster-ray-head
        spec:
            # This selector must match the head node pod's selector below.
            selector:
                component: example-cluster-ray-head
            ports:
                - name: client
                  protocol: TCP
                  port: 10001
                  targetPort: 10001
                - name: dashboard
                  protocol: TCP
                  port: 8265
                  targetPort: 8265

# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
  worker_node:
    # Minimum number of Ray workers of this Pod type.
    min_workers: 0 
    # Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
    max_workers: 10
    # User-specified custom resources for use by Ray. Object with string keys and integer values.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-worker-
      spec:
        restartPolicy: Never
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: projectcodeflare/codeflare:latest
          command: ["/bin/bash", "-c", "--"]
          args: ["trap : TERM INT; sleep infinity & wait;"]
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp which cause slowdowns if is not a shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: 1
              memory: 2G
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              cpu: 1
              memory: 2G
  head_node:
    # The minimum number of worker nodes of this type to launch.
    # This number should be >= 0.
    min_workers: 0
    # The maximum number of worker nodes of this type to launch.
    # This takes precedence over min_workers.
    max_workers: 0
    node_config:
      apiVersion: v1
      kind: Pod
      metadata:
        # Automatically generates a name for the pod with this prefix.
        generateName: example-cluster-ray-head-
        # Must match the head node service selector above if a head node
        # service is required.
        labels:
            component: example-cluster-ray-head
      spec:
        # Change this if you altered the autoscaler_service_account above
        # or want to provide your own.
        serviceAccountName: ap9fjwkf04j-writer

        restartPolicy: Never

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp which cause slowdowns if is not a shared memory volume.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: projectcodeflare/codeflare:latest
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ['trap : TERM INT; sleep infinity & wait;']
          ports:
          - containerPort: 6379  # Redis port
          - containerPort: 10001  # Used by Ray Client
          - containerPort: 8265  # Used by Ray Dashboard

          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp which cause slowdowns if is not a shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: 1
              memory: 2G
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              cpu: 1
              memory: 2G


# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379