-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
k8s: Terraform deployment for Azure clusters
This provides a Terraform configuration for deploying our Kubernetes clusters to Azure. We deploy an identical cluster to each of a list of regions, with one small node for admin purposes (due to a requirement to not use spot instances for the main node group) and two autoscaling groups: one with small 8 core nodes for most jobs and one with bigger nodes for the more resource intensive ones. This is different to our current scheme where each cluster has a single node group and we direct jobs in Jenkins. With this scheme we allow the Kubernetes scheduler to place jobs, or we can still direct them to specific node sizes using nodeSelector in the jobs and the labels that are assigned to the nodegroups. This is a more Kubernetes way of doing things and decouples further from Jenkins. This needs updates for authentication, and for the storage space for the Terraform state. Signed-off-by: Mark Brown <[email protected]>
- Loading branch information
Showing
5 changed files
with
172 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
This needs to be run on a machine with Terraform and the | ||
Azure CLI installed. Personal login for the Azure CLI can be done with: | ||
|
||
az login | ||
|
||
The actual account used is a service principal account though: | ||
|
||
az ad sp create-for-rbac -n kernelci-k8s | ||
|
||
which outputs an appId and password; these should be distributed via the | ||
credential store and set in terraform variables, see variables.tf. | ||
|
||
When the clusters are created a logged in user can set up the client | ||
credentials like this: | ||
|
||
for c in $(az aks list --query '[].name' -o tsv) ; do | ||
az aks get-credentials --resource-group kernelci-workers --name ${c} | ||
done | ||
|
||
(TBD: also put this in outputs.tf, need to figure out syntax for arrays) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# FIXME: For real deployment we should store the terraform state | ||
# in cloud storage rather than just the current directory, terraform | ||
# supports Azure blob storage directly. This means configuration | ||
# doesn't need to be on a single machine somewhere. | ||
# | ||
# See https://www.terraform.io/language/settings/backends/azurerm | ||
# | ||
#terraform { | ||
# backend "azurerm" { | ||
# resource_group_name = "kernelci-tf-storage" | ||
# storage_account_name = "kernelcitf" | ||
# container_name = "tfstate" | ||
# key = "workers.terraform.tfstate" | ||
# } | ||
#} | ||
|
||
# Configure the azurerm provider; the features block is left empty, so no | ||
# per-feature overrides are set here. | ||
provider "azurerm" { | ||
features {} | ||
} | ||
|
||
# We assign all clusters to the same resource group, this is purely for | ||
# accounting purposes so it doesn't matter where the resource group is | ||
resource "azurerm_resource_group" "workers" { | ||
  # Single shared group, named "kernelci-workers" and placed in East US. | ||
  location = "East US" | ||
  name     = "kernelci-workers" | ||
|
||
  tags = { environment = "kernelci-workers" } | ||
} | ||
|
||
locals { | ||
  # Set of Azure region names; used as the for_each keys for the clusters. | ||
  zones = toset(["uksouth", "eastus"]) | ||
} | ||
|
||
resource "azurerm_kubernetes_cluster" "workers" { | ||
  # One cluster per entry in local.zones; the region name is the key. | ||
  for_each = local.zones | ||
|
||
  resource_group_name = azurerm_resource_group.workers.name | ||
  location            = each.key | ||
  name                = "${each.key}-workers-aks" | ||
  dns_prefix          = "${each.key}-workers-k8s" | ||
|
||
  tags = { environment = "kernelci" } | ||
|
||
  # Follow the "stable" channel so AKS upgrades roll out automatically. | ||
  automatic_channel_upgrade = "stable" | ||
|
||
  role_based_access_control { | ||
    enabled = true | ||
  } | ||
|
||
  # Credentials are taken from the appId/password input variables. | ||
  service_principal { | ||
    client_secret = var.password | ||
    client_id     = var.appId | ||
  } | ||
|
||
  # AKS requires a default node pool, and Terraform and/or AKS don't let | ||
  # us tag it as a spot instance. Keep it to one small, always present | ||
  # node that is labelled for management rather than for work, since | ||
  # ideally the builder pools can scale down to 0. | ||
  default_node_pool { | ||
    name            = "default" | ||
    vm_size         = "Standard_DS2_v2" | ||
    os_disk_size_gb = 30 | ||
    node_count      = 1 | ||
|
||
    node_labels = { "kernelci/management" = "management" } | ||
  } | ||
} | ||
|
||
# Smaller nodes for most jobs | ||
resource "azurerm_kubernetes_cluster_node_pool" "small_workers" { | ||
  # One pool of small spot workers in every azurerm_kubernetes_cluster.workers | ||
  # cluster. | ||
  for_each = azurerm_kubernetes_cluster.workers | ||
|
||
  name                  = "smallworkers" | ||
  kubernetes_cluster_id = each.value.id | ||
|
||
  # 3rd gen Xeon 8 cores, 32G RAM - general purpose | ||
  vm_size = "Standard_D8s_v5" | ||
|
||
  # Currently things struggle with scale to 0 so require a node | ||
  enable_auto_scaling = true | ||
  min_count           = 1 | ||
  node_count          = 1 | ||
  max_count           = 10 | ||
|
||
  priority = "Spot" | ||
  # Evicted spot nodes are deleted rather than left deallocated. | ||
  eviction_policy = "Delete" | ||
  # We could set this lower to control costs, -1 means up to on demand | ||
  # price | ||
  spot_max_price = -1 | ||
|
||
  # The azurerm provider documentation asks for the spot label and taint | ||
  # to be declared explicitly on Spot pools so the configuration matches | ||
  # what AKS applies to the nodes; jobs need a matching toleration. | ||
  node_labels = { | ||
    "kernelci/worker"                       = "worker" | ||
    "kernelci/worker-size"                  = "small" | ||
    "kubernetes.azure.com/scalesetpriority" = "spot" | ||
  } | ||
|
||
  node_taints = [ | ||
    "kubernetes.azure.com/scalesetpriority=spot:NoSchedule", | ||
  ] | ||
|
||
  # node_count is only the initial size; the autoscaler changes it at | ||
  # runtime, so do not treat that as drift to be reverted. | ||
  lifecycle { | ||
    ignore_changes = [node_count] | ||
  } | ||
} | ||
|
||
# Big nodes for more intensive jobs (and large numbers of small jobs) | ||
resource "azurerm_kubernetes_cluster_node_pool" "big_workers" { | ||
  # One pool of big spot workers in every azurerm_kubernetes_cluster.workers | ||
  # cluster. | ||
  for_each = azurerm_kubernetes_cluster.workers | ||
|
||
  name                  = "bigworkers" | ||
  kubernetes_cluster_id = each.value.id | ||
|
||
  # 3rd gen Xeon, 32 core, 64G RAM - compute optimised | ||
  vm_size = "Standard_F32s_v2" | ||
|
||
  # Currently things struggle with scale to 0 so require a node | ||
  enable_auto_scaling = true | ||
  min_count           = 1 | ||
  node_count          = 1 | ||
  max_count           = 10 | ||
|
||
  priority = "Spot" | ||
  # Evicted spot nodes are deleted rather than left deallocated. | ||
  eviction_policy = "Delete" | ||
  # We could set this lower to control costs, -1 means up to on demand | ||
  # price | ||
  spot_max_price = -1 | ||
|
||
  # The azurerm provider documentation asks for the spot label and taint | ||
  # to be declared explicitly on Spot pools so the configuration matches | ||
  # what AKS applies to the nodes; jobs need a matching toleration. | ||
  node_labels = { | ||
    "kernelci/worker"                       = "worker" | ||
    "kernelci/worker-size"                  = "big" | ||
    "kubernetes.azure.com/scalesetpriority" = "spot" | ||
  } | ||
|
||
  node_taints = [ | ||
    "kubernetes.azure.com/scalesetpriority=spot:NoSchedule", | ||
  ] | ||
|
||
  # node_count is only the initial size; the autoscaler changes it at | ||
  # runtime, so do not treat that as drift to be reverted. | ||
  lifecycle { | ||
    ignore_changes = [node_count] | ||
  } | ||
} | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Expose the name of the azurerm_resource_group.workers resource group. | ||
output "resource_group_name" { | ||
value = azurerm_resource_group.workers.name | ||
} | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Input variable with no default, so a value has to be supplied. | ||
# NOTE(review): presumably the appId printed by `az ad sp create-for-rbac` - confirm. | ||
variable "appId" { | ||
description = "Azure Kubernetes Service Cluster service principal" | ||
} | ||
|
||
# Secret input variable with no default, so a value has to be supplied. | ||
variable "password" { | ||
  description = "Azure Kubernetes Service Cluster password" | ||
  type        = string | ||
  # Marked sensitive so Terraform redacts the value in plan/apply output. | ||
  sensitive = true | ||
} | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
terraform { | ||
  # Minimum Terraform CLI version accepted by this configuration. | ||
  required_version = ">= 0.14" | ||
|
||
  required_providers { | ||
    # azurerm is pinned to one exact release. | ||
    azurerm = { | ||
      version = "2.66.0" | ||
      source  = "hashicorp/azurerm" | ||
    } | ||
  } | ||
} | ||
|