# collect-data-self-hosted.yml
# Forked from sustainable-computing-io/kepler-model-server.
# Reusable workflow (triggered only through `workflow_call`) that:
#   1. creates a self-hosted runner with the aws_ec2_self_hosted_runner action,
#   2. deploys a kind cluster with Kepler, Prometheus and Tekton on that runner
#      and runs the `collect-data-pipeline` Tekton pipeline with the AWS
#      object-storage secret,
#   3. always unregisters the runner and terminates the instance afterwards.
name: Self-hosted Collect Data Workflow

on:
  workflow_call:
    # Secrets must be passed explicitly by the calling workflow.
    secrets:
      self_hosted_github_token:
        description: 'The GitHub token to use'
        required: true
      aws_access_key_id:
        description: 'The AWS access key id to use'
        required: true
      aws_secret_access_key:
        description: 'The AWS secret access key to use'
        required: true
      security_group_id:
        description: 'The AWS security group id to use'
        required: true
      aws_region:
        description: 'The AWS region to use'
        required: true
    inputs:
      github_repo:
        description: 'The GitHub repo to use'
        required: true
        type: string
      ami_id:
        description: 'The AMI ID to use for the EC2 instance'
        required: true
        type: string
      instance_type:
        description: 'The instance type to use for the EC2 instance'
        required: true
        type: string
      model_server_image:
        description: 'Kepler Model Server image'
        required: true
        type: string

# Workflow-level environment: every `run` step sees this KUBECONFIG path.
env:
  KUBECONFIG: /tmp/kubeconfig

jobs:
  # Creates the runner and exposes its identifiers to the downstream jobs.
  setup-runner:
    name: Setup Self Hosted Runner
    runs-on: ubuntu-latest
    outputs:
      # Consumed by `destroy-runner` to clean up exactly what was created here.
      instance_id: ${{ steps.create-runner.outputs.instance_id }}
      runner_name: ${{ steps.create-runner.outputs.runner_name }}
    steps:
      - name: Create Runner
        uses: sustainable-computing-io/aws_ec2_self_hosted_runner@v4
        id: create-runner
        with:
          action: "create"
          aws_region: ${{ secrets.aws_region }}
          github_token: ${{ secrets.self_hosted_github_token }}
          aws_access_key_id: ${{ secrets.aws_access_key_id }}
          aws_secret_access_key: ${{ secrets.aws_secret_access_key }}
          security_group_id: ${{ secrets.security_group_id }}
          github_repo: ${{ inputs.github_repo }}
          ami_id: ${{ inputs.ami_id }}
          instance_type: ${{ inputs.instance_type }}
          create_s3_bucket: "false"
          spot_instance_only: "true"
          root_volume_size: "100"

      # Diagnostic only: echoes the values reported by the create step.
      - name: Print Output
        id: output
        run: |
          echo "instance_id ${{ steps.create-runner.outputs.instance_id }}"
          echo "instance_ip ${{ steps.create-runner.outputs.instance_ip }}"
          echo "runner_name ${{ steps.create-runner.outputs.runner_name }}"
          whoami

  # Runs on a self-hosted runner once `setup-runner` has finished.
  collect-data:
    name: Collect Data
    needs: setup-runner
    runs-on: [self-hosted, linux, x64]
    steps:
      - name: Checkout
        id: checkout
        uses: actions/checkout@v4

      - name: use Kepler action to deploy cluster
        uses: sustainable-computing-io/[email protected]
        with:
          ebpfprovider: libbpf
          cluster_provider: kind
          install_containerruntime: true
          prometheus_enable: true
          tekton_enable: true
          kernel_module_names: intel_rapl_common

      - name: Install Kepler
        working-directory: model_training
        run: |
          ./script.sh deploy_kepler
          ./script.sh deploy_prom_dependency
          kubectl logs $(kubectl get pods -oname -nkepler) -n kepler|grep "obtain power"

      - name: Prepare PVC
        working-directory: model_training/tekton
        run: |
          kubectl apply -f pvc/hostpath.yaml

      # Creates the `aws-cos-secret` Secret that the PipelineRun below
      # references through its COS_SECRET_NAME parameter.
      - name: Deploy S3 Secret
        run: |
          cat <<EOF | kubectl apply -f -
          apiVersion: v1
          kind: Secret
          metadata:
            name: aws-cos-secret
          type: Opaque
          stringData:
            accessKeyID: ${{ secrets.aws_access_key_id }}
            accessSecret: ${{ secrets.aws_secret_access_key }}
            regionName: ${{ secrets.aws_region }}
            bucketName: kepler-power-model
          EOF

      - name: Deploy Tasks and Pipelines
        working-directory: model_training/tekton
        run: |
          kubectl apply -f tasks
          kubectl apply -f tasks/s3
          kubectl apply -f pipelines

      # MACHINE_ID is built from the instance type and AMI ID inputs.
      - name: Run Tekton Pipeline with S3Push
        run: |
          cat <<EOF | kubectl apply -f -
          apiVersion: tekton.dev/v1
          kind: PipelineRun
          metadata:
            name: self-hosted-aws-collect
          spec:
            timeouts:
              pipeline: "6h"
              tasks: "5h50m"
            workspaces:
            - name: mnt
              persistentVolumeClaim:
                claimName: task-pvc
            params:
            - name: MODEL_SERVER_IMAGE
              value: ${{ inputs.model_server_image }}
            - name: COS_PROVIDER
              value: aws
            - name: COS_SECRET_NAME
              value: aws-cos-secret
            - name: MACHINE_ID
              value: ${{ inputs.instance_type }}-${{ inputs.ami_id }}
            pipelineRef:
              name: collect-data-pipeline
          EOF

      # No working-directory here: the helper path is relative to the
      # checked-out repository root.
      - name: Wait for PipelineRun
        run: |
          ./hack/k8s_helper.sh wait_for_pipelinerun self-hosted-aws-collect

  # Cleanup job. `if: always()` makes it run even when `setup-runner` or
  # `collect-data` failed or was cancelled, so the runner is not left behind.
  destroy-runner:
    if: always()
    needs: [setup-runner, collect-data]
    name: Destroy Self Hosted Runner
    runs-on: ubuntu-latest
    steps:
      - name: unregister runner
        id: unregister
        uses: sustainable-computing-io/aws_ec2_self_hosted_runner@v4
        with:
          action: "unregister"
          runner_name: ${{ needs.setup-runner.outputs.runner_name }}
          github_token: ${{ secrets.self_hosted_github_token }}
          github_repo: ${{ inputs.github_repo }}

      # Step-level `if: always()`: without it a failed unregister step would
      # skip termination and leave the instance running.
      - name: terminate instance
        id: terminate
        if: always()
        uses: sustainable-computing-io/aws_ec2_self_hosted_runner@v4
        with:
          action: "terminate"
          # Same region the create step was given, so termination does not
          # depend on the action's default region.
          # NOTE(review): confirm the action honors aws_region for "terminate".
          aws_region: ${{ secrets.aws_region }}
          aws_access_key_id: ${{ secrets.aws_access_key_id }}
          aws_secret_access_key: ${{ secrets.aws_secret_access_key }}
          instance_id: ${{ needs.setup-runner.outputs.instance_id }}