Skip to content

Commit

Permalink
added yaml file to launch workers + sim simultaneously
Browse files Browse the repository at this point in the history
  • Loading branch information
flimdejong committed Nov 18, 2024
1 parent 3d33d3a commit a5df29c
Showing 1 changed file with 247 additions and 0 deletions.
247 changes: 247 additions & 0 deletions docker/runner/ray-cluster-combined.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
# RayCluster for the RoboTeam setup: one Ray head pod plus one worker pod that
# also runs the complete simulation stack (game controller, AI, observer,
# robothub, autoref, simulator, Tigers Sumatra) as sidecar containers.
#
# NOTE(review): ray.io/v1alpha1 is the legacy KubeRay API version; newer
# operators serve ray.io/v1. Confirm the installed operator before bumping.
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  name: roboteam-ray-cluster
spec:
  rayVersion: "2.38.0"

  # Head node configuration
  headGroupSpec:
    # KubeRay builds the `ray start --head ... --block` command from these
    # parameters. The Ray container therefore must NOT carry its own
    # `ray start` in command/args: KubeRay runs user command/args *before* the
    # generated command, so a manual blocking `ray start` is redundant at best
    # and prevents the generated one from running at worst.
    rayStartParams:
      dashboard-host: "0.0.0.0"
      # Expanded by Kubernetes from the POD_IP env var declared below.
      node-ip-address: "$(POD_IP)"
    template:
      metadata:
        labels:
          app: ray-head
      spec:
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: kubernetes.io/hostname
                      operator: In
                      values:
                        - multinode-demo  # Schedule on control-plane
        containers:
          - name: ray-head
            image: roboteamtwente/ray:development
            imagePullPolicy: Always
            ports:
              - name: dashboard
                containerPort: 8265  # Ray dashboard
              - name: gcs-server
                containerPort: 6379  # GCS (the `ray start --port` default)
              - name: client
                containerPort: 10001  # Ray Client server
              - name: serve
                containerPort: 8000  # Ray Serve
            resources:
              requests:
                cpu: "500m"
                memory: "1Gi"
              limits:
                cpu: "1"
                memory: "2Gi"
            env:
              # Downward-API pod IP, consumed by rayStartParams above.
              - name: POD_IP
                valueFrom:
                  fieldRef:
                    fieldPath: status.podIP
            livenessProbe:
              exec:
                command:
                  - bash
                  - -c
                  - "wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success"
              initialDelaySeconds: 15
              timeoutSeconds: 10
              periodSeconds: 30
              failureThreshold: 5
            readinessProbe:
              exec:
                command:
                  - bash
                  - -c
                  - "wget -T 10 -q -O- http://localhost:8265/api/gcs_healthz | grep success"
              initialDelaySeconds: 15
              timeoutSeconds: 10
              periodSeconds: 30
              failureThreshold: 5

  # Worker node configuration with integrated simulator
  workerGroupSpecs:
    - groupName: worker-group
      replicas: 1
      # KubeRay generates the worker's `ray start --address=<head-svc>:6379
      # --block` from these parameters, so the head address (including the
      # namespace) no longer needs to be hard-coded in the container args.
      rayStartParams:
        num-cpus: "1"
      template:
        metadata:
          labels:
            app: ray-worker
        spec:
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: kubernetes.io/hostname
                        operator: In
                        values:
                          - multinode-demo-m02  # Schedule on worker node
          volumes:
            # Scratch space for Gradle, mounted into the tigers-sumatra
            # container; emptyDir contents are lost when the pod is deleted.
            - name: gradle-cache
              emptyDir: {}
          containers:
            # Ray worker container. Keep it first in the list: KubeRay treats
            # the first container of the pod template as the Ray container.
            - name: ray-worker
              image: roboteamtwente/ray:development
              imagePullPolicy: Always
              resources:
                requests:
                  cpu: 500m
                  memory: 1Gi
                limits:
                  cpu: 1000m
                  memory: 2Gi
              env:
                - name: LD_LIBRARY_PATH
                  value: /home/roboteam/build/release/lib

            # Game Controller
            # NOTE(review): no resource requests/limits here, unlike the
            # sibling sidecars — add them once real usage has been measured.
            - name: ssl-game-controller
              image: robocupssl/ssl-game-controller:latest
              # Bind explicitly to all interfaces rather than the bare ":8081".
              args: ["-address", "0.0.0.0:8081"]
              ports:
                - containerPort: 8081
                  protocol: TCP

            # Primary AI
            - name: roboteam-primary-ai
              image: roboteamtwente/roboteam:kubernetes
              command: ["/bin/sh"]
              args:
                - "-c"
                - "/home/roboteam/build/release/bin/roboteam_ai --primary-ai"
              workingDir: /home/roboteam
              env:
                - name: LD_LIBRARY_PATH
                  value: /home/roboteam/build/release/lib
              resources:
                requests:
                  cpu: 60m
                  memory: 10Mi
                limits:
                  cpu: 100m
                  memory: 50Mi

            # Observer
            - name: roboteam-observer-sim
              image: roboteamtwente/roboteam:kubernetes
              command: ["/bin/sh"]
              args:
                - "-c"
                - "/home/roboteam/build/release/bin/roboteam_observer --vision-ip 224.5.23.2 --referee-ip 224.5.23.1 --vision-port 10020 --referee-port 10003 --log"
              workingDir: /home/roboteam
              env:
                - name: LD_LIBRARY_PATH
                  value: /home/roboteam/build/release/lib
              resources:
                requests:
                  cpu: 60m
                  memory: 60Mi
                limits:
                  cpu: 100m
                  memory: 90Mi

            # Robothub
            - name: roboteam-robothub-sim
              image: roboteamtwente/roboteam:kubernetes
              command: ["/bin/sh"]
              args:
                - "-c"
                - "/home/roboteam/build/release/bin/roboteam_robothub"
              workingDir: /home/roboteam
              env:
                - name: LD_LIBRARY_PATH
                  value: /home/roboteam/build/release/lib
              resources:
                requests:
                  cpu: 30m
                  memory: 20Mi
                limits:
                  cpu: 60m
                  memory: 50Mi

            # Autoref
            - name: erforce-autoref-sim
              image: roboteamtwente/roboteam:kubernetes
              command: ["/bin/sh"]
              args:
                - "-c"
                - "cd /home/roboteam/external/autoref/build/bin && ./autoref-cli --vision-port 10020 --tracker-port 10010 --gc-port 10003"
              env:
                - name: LD_LIBRARY_PATH
                  value: /home/roboteam/build/release/lib
              resources:
                requests:
                  cpu: 120m
                  memory: 20Mi
                limits:
                  cpu: 150m
                  memory: 50Mi

            # Simulator
            # NOTE(review): no resource requests/limits here, unlike the
            # sibling sidecars — add them once real usage has been measured.
            - name: erforce-simulator
              image: roboteamtwente/roboteam:kubernetes
              command: ["/bin/sh"]
              args:
                - "-c"
                - "/home/roboteam/external/framework/build/bin/simulator-cli"
              ports:
                - containerPort: 10300
                  protocol: UDP  # Simulator control port
                - containerPort: 10301
                  protocol: TCP  # Presumably TCP — TODO confirm
                - containerPort: 5558
                  protocol: TCP  # ZMQ port
              env:
                - name: LD_LIBRARY_PATH
                  value: /home/roboteam/build/release/lib

            # Tigers Sumatra (opponent AI, started headless through Gradle)
            - name: tigers-sumatra
              image: roboteamtwente/roboteam:kubernetes
              imagePullPolicy: Always
              command: ["/bin/sh"]
              args:
                - "-c"
                - "cd /home/roboteam/tigers_sumatra && ./gradlew :run --args=\"--headless --moduli roboteamtwente --aiBlue --visionAddress 224.5.23.2:10020 --refereeAddress 224.5.23.1:10003\""
              env:
                - name: LD_LIBRARY_PATH
                  value: /home/roboteam/build/release/lib
              resources:
                requests:
                  cpu: 500m
                  memory: 1400Mi
                limits:
                  cpu: 700m
                  memory: 1800Mi
              volumeMounts:
                - name: gradle-cache
                  mountPath: /home/roboteam/.gradle

---
# NodePort service for external access to the Ray head pod.
apiVersion: v1
kind: Service
metadata:
  name: roboteam-ray-cluster-head-nodeport
spec:
  type: NodePort
  selector:
    app: ray-head
  ports:
    - name: dashboard
      port: 8265
      targetPort: 8265
      nodePort: 30265  # Ray dashboard
    # Port name kept as "gcs" for compatibility; in Ray 2.x port 10001 is the
    # Ray Client server (the GCS itself listens on 6379).
    - name: gcs
      port: 10001
      targetPort: 10001
      nodePort: 31001  # Ray Client server
---
# NodePort service for external access to the game controller interface.
# The ssl-game-controller container runs in the worker pod (label
# app: ray-worker), not in the head pod, so it needs its own selector: routing
# 8081 through the head service would hit a pod with nothing listening there.
# With more than one worker replica, traffic is balanced across all of them.
apiVersion: v1
kind: Service
metadata:
  name: roboteam-game-controller-nodeport
spec:
  type: NodePort
  selector:
    app: ray-worker
  ports:
    - name: gc-interface
      port: 8081
      targetPort: 8081
      nodePort: 30081  # Game controller interface

0 comments on commit a5df29c

Please sign in to comment.