# leaderboard-cluster.yaml
# eksctl cluster configuration; repository forked from carla-simulator/leaderboard-cloud.
---
# eksctl ClusterConfig: identifies the cluster this file describes.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: leaderboard-10
  region: us-west-2
  # Quoted so the Kubernetes version stays a string rather than a float.
  version: "1.24"
# Maps IAM roles onto Kubernetes identities. Both roles below are placed in the
# system:masters group under the username "admin".
iamIdentityMappings:
  - arn: arn:aws:iam::342236305043:role/LB2-eks-admin
    groups:
      - system:masters
    username: admin
    noDuplicateARNs: true  # prevents shadowing of ARNs
  - arn: arn:aws:iam::342236305043:role/LeaderboardStepFunctionRole
    groups:
      - system:masters
    username: admin
    noDuplicateARNs: true  # prevents shadowing of ARNs
# IAM settings: enables the cluster OIDC provider and declares three service
# accounts with IAM permissions attached (fluentd, cluster-autoscaler,
# submission-worker).
iam:
  withOIDC: true
  serviceAccounts:
    # fluentd (kube-system): may create CloudWatch Logs groups/streams, describe
    # them, and put log events.
    - metadata:
        name: fluentd
        namespace: kube-system
      attachPolicy:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              - "logs:CreateLogStream"
              - "logs:CreateLogGroup"
              - "logs:PutLogEvents"
              - "logs:DescribeLogGroups"
              - "logs:DescribeLogStreams"
            Resource: "arn:aws:logs:*:*:*"
    # cluster-autoscaler (kube-system): uses eksctl's predefined autoscaler policy.
    - metadata:
        name: cluster-autoscaler
        namespace: kube-system
      wellKnownPolicies:
        autoScaler: true
    # submission-worker: no namespace is set here.
    # NOTE(review): presumably eksctl falls back to its default namespace - confirm.
    - metadata:
        name: submission-worker
      attachPolicy:
        Version: "2012-10-17"
        Statement:
          - Effect: Allow
            Action:
              # Based on AWSAppRunnerServicePolicyForECRAccess, grants access to
              # download the ECR docker images
              - "ecr:GetDownloadUrlForLayer"
              - "ecr:BatchGetImage"
              - "ecr:DescribeImages"
              - "ecr:BatchCheckLayerAvailability"
              # Based on AmazonS3FullAccess, grants read + write access to S3
              - "s3:*"
              - "s3-object-lambda:*"
              # DynamoDB
              - "dynamodb:GetItem"
              - "dynamodb:DeleteItem"
              - "dynamodb:PutItem"
              - "dynamodb:UpdateItem"
            Resource:
              # All ECR repositories, a specific S3 bucket, and the database
              - "arn:aws:ecr:*:342236305043:repository/*"
              - "arn:aws:s3:::leaderboard-10*"
              - "arn:aws:dynamodb:*:342236305043:table/leaderboard-10"
          # ecr:GetAuthorizationToken does not support resource-level
          # permissions, so it is only granted when Resource is "*". It is kept
          # in its own statement so the wildcard does not widen the other actions.
          - Effect: Allow
            Action:
              - "ecr:GetAuthorizationToken"
            Resource: "*"
# Node groups: one small always-on group and one GPU group that starts empty
# and can grow to two nodes.
nodeGroups:
  # General-purpose group: a single t3.large Amazon Linux 2 node.
  - name: basic-worker
    amiFamily: AmazonLinux2
    instanceType: t3.large
    desiredCapacity: 1
    volumeSize: 100
    labels:
      role: basic-worker
  # GPU group on a pinned Ubuntu 20.04 AMI; sized 0..2 and starting at 0 nodes.
  - name: submission-worker
    instanceType: g5.12xlarge
    amiFamily: Ubuntu2004
    ami: ami-05c54e41645c675fe
    desiredCapacity: 0
    minSize: 0
    maxSize: 2
    volumeSize: 400
    # Tags conventionally used by the Kubernetes Cluster Autoscaler to
    # auto-discover the node group for cluster "leaderboard-10".
    tags:
      k8s.io/cluster-autoscaler/enabled: "true"
      k8s.io/cluster-autoscaler/leaderboard-10: "owned"
    # NOTE(review): the bootstrap override below calls bootstrap.sh without
    # passing node labels via kubelet arguments; confirm that this label is
    # actually applied to the nodes.
    labels:
      role: submission-generic-worker
    ssh:
      publicKey: "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDkL6oBOlOqWp4BgOIsQnHQkaPCEGQjdqwWPy1WXLPEnjMLQ3iFGK+zMJ3VNhYujhemn2Yxja8Yw+a0MWv0OfV9TTcW6gsjsBuZyBA0g7OkaFFrAiEi42gajqqnBCEpbEL8/+MYnOHSYCqIXi7yyzHwDGuUzBsyTTsbAmdvuQ8o7sh7QH0Ncw5Z7605RTQI1MxP2zAQdl/UdZipFH9Q3pCidwWLJ3WFYTvKkhpEjiUyrf2sfPya89yFQdfLytpX4mW/YRsvLIoBElJYDkcAkyGPU6N0o+CoXyFg1ezvB9rXFsW1XgRf4ZR3nKxiM9yi1N1Z0/rf5hUWseNRt6/Xl0pn"
    # Replaces the default bootstrap: joins the node to the cluster with
    # bootstrap.sh, then overwrites /etc/containerd/config.toml so that the
    # default containerd runtime is "nvidia" (backed by
    # /usr/bin/nvidia-container-runtime), and restarts containerd.
    overrideBootstrapCommand: |
      #!/bin/bash
      /etc/eks/bootstrap.sh leaderboard-10
      sudo bash -c "echo 'version = 2
      [plugins]
      [plugins.\"io.containerd.grpc.v1.cri\"]
      [plugins.\"io.containerd.grpc.v1.cri\".containerd]
      default_runtime_name = \"nvidia\"
      [plugins.\"io.containerd.grpc.v1.cri\".containerd.runtimes]
      [plugins.\"io.containerd.grpc.v1.cri\".containerd.runtimes.nvidia]
      privileged_without_host_devices = false
      runtime_engine = \"\"
      runtime_root = \"\"
      runtime_type = \"io.containerd.runc.v2\"
      [plugins.\"io.containerd.grpc.v1.cri\".containerd.runtimes.nvidia.options]
      BinaryName = \"/usr/bin/nvidia-container-runtime\"' \
      > /etc/containerd/config.toml"
      sudo systemctl restart containerd
    # Generates an X configuration with a 1280x1024 virtual screen for all
    # GPUs, then launches an X server on display :0 in the background.
    # NOTE(review): presumably needed for GPU rendering inside submissions - confirm.
    preBootstrapCommands:
      - "sudo nvidia-xconfig --preserve-busid -a --virtual=1280x1024"
      - "sudo X :0&"