From 0f2b6fde4decf31d5ecb0fd9ba1554460b198f8d Mon Sep 17 00:00:00 2001
From: Josh Wolf
Date: Tue, 3 Dec 2024 04:38:09 -0500
Subject: [PATCH] add gke module (#540)

adds a reusable GKE module

---------

Signed-off-by: Pris Nasrat
Co-authored-by: Pris Nasrat
---
 modules/gke/main.tf      | 330 +++++++++++++++++++++++++++++++++++++++
 modules/gke/outputs.tf   |  26 +++
 modules/gke/variables.tf | 117 +++++++++++++++
 3 files changed, 473 insertions(+)
 create mode 100644 modules/gke/main.tf
 create mode 100644 modules/gke/outputs.tf
 create mode 100644 modules/gke/variables.tf

diff --git a/modules/gke/main.tf b/modules/gke/main.tf
new file mode 100644
index 00000000..9279b22e
--- /dev/null
+++ b/modules/gke/main.tf
@@ -0,0 +1,330 @@
+terraform {
+  required_providers {
+    google = {
+      source = "hashicorp/google"
+    }
+    # Declared explicitly because the node pools below select `provider = google-beta`.
+    google-beta = {
+      source = "hashicorp/google-beta"
+    }
+  }
+}
+
+# Service account used by every node pool and by auto-provisioned nodes.
+resource "google_service_account" "cluster_default" {
+  account_id   = "${var.name}-gke-default"
+  display_name = "${var.name} GKE Default"
+  project      = var.project
+}
+
+# Project-level roles granted to the node service account. The map keys only
+# serve as stable for_each addresses; var.extra_roles is merged on top.
+resource "google_project_iam_member" "cluster" {
+  for_each = merge({
+    # Read access to the project GCR
+    "gcr" = "roles/storage.objectViewer"
+    # Read access to the project Artifact Registry
+    "acr" = "roles/artifactregistry.reader"
+    # Log writer access
+    "log-writer" = "roles/logging.logWriter"
+    # Metrics writer access
+    "metrics-writer" = "roles/monitoring.metricWriter"
+    # Monitoring viewer access
+    "monitoring-viewer" = "roles/monitoring.viewer"
+  }, var.extra_roles)
+
+  project = var.project
+  role    = each.value
+  member  = "serviceAccount:${google_service_account.cluster_default.email}"
+}
+
+locals {
+  # Label recording which cluster a node resource belongs to.
+  default_labels = {
+    "gke" = var.name
+  }
+
+  # Label recording the owning squad (may be empty when var.require_squad is false).
+  squad_label = {
+    "squad" = var.squad
+  }
+
+  # Network tag applied to every node so the webhook firewall rule can target them.
+  node_network_tag = "gke-${var.name}"
+}
+
+# Regional, VPC-native cluster with private nodes and a public control-plane
+# endpoint. Nodes are defined by the google_container_node_pool resources below.
+resource "google_container_cluster" "this" {
+  name    = var.name
+  project = var.project
+
+  network    = var.network
+  subnetwork = var.subnetwork
+
+  location       = var.region
+  node_locations = var.zones
+
+  enable_intranode_visibility = true
+
+  # Drop the default node pool; nodes come from the node pool resources below.
+  remove_default_node_pool = true
+  initial_node_count       = 1
+
+  # Use Dataplane V2 (eBPF based networking)
+  datapath_provider = "ADVANCED_DATAPATH"
+
+  networking_mode = "VPC_NATIVE"
+  # Keeping this empty means GKE handles the secondary pod/service CIDR creation
+  ip_allocation_policy {}
+
+  workload_identity_config {
+    workload_pool = "${var.project}.svc.id.goog"
+  }
+
+  release_channel {
+    # NOTE: Toggle to "RAPID" when we want to start playing with things like gcsfuse
+    channel = var.release_channel
+  }
+
+  # Configured with separate node_pool resources
+  # node_config {}
+
+  # Node auto-provisioning; the block is omitted when var.cluster_autoscaling is false.
+  dynamic "cluster_autoscaling" {
+    for_each = var.cluster_autoscaling == false ? [] : ["placeholder"]
+
+    content {
+      enabled = var.cluster_autoscaling
+      resource_limits {
+        resource_type = var.cluster_autoscaling_cpu_limits.resource_type
+        minimum       = var.cluster_autoscaling_cpu_limits.minimum
+        maximum       = var.cluster_autoscaling_cpu_limits.maximum
+      }
+      resource_limits {
+        resource_type = var.cluster_autoscaling_memory_limits.resource_type
+        minimum       = var.cluster_autoscaling_memory_limits.minimum
+        maximum       = var.cluster_autoscaling_memory_limits.maximum
+      }
+      dynamic "auto_provisioning_defaults" {
+        for_each = var.cluster_autoscaling_provisioning_defaults == null ? [] : ["placeholder"]
+
+        content {
+          service_account = google_service_account.cluster_default.email
+          disk_size       = var.cluster_autoscaling_provisioning_defaults.disk_size
+          disk_type       = var.cluster_autoscaling_provisioning_defaults.disk_type
+
+          dynamic "shielded_instance_config" {
+            for_each = var.cluster_autoscaling_provisioning_defaults.shielded_instance_config == null ? [] : ["placeholder"]
+
+            content {
+              enable_secure_boot          = var.cluster_autoscaling_provisioning_defaults.shielded_instance_config.enable_secure_boot
+              enable_integrity_monitoring = var.cluster_autoscaling_provisioning_defaults.shielded_instance_config.enable_integrity_monitoring
+            }
+          }
+          dynamic "management" {
+            for_each = var.cluster_autoscaling_provisioning_defaults.management == null ? [] : ["placeholder"]
+
+            content {
+              auto_upgrade = var.cluster_autoscaling_provisioning_defaults.management.auto_upgrade
+              auto_repair  = var.cluster_autoscaling_provisioning_defaults.management.auto_repair
+            }
+          }
+        }
+      }
+      autoscaling_profile = var.cluster_autoscaling_profile
+    }
+  }
+
+  master_authorized_networks_config {
+    # NOTE: the single 0.0.0.0/0 entry below authorizes every source address.
+    # gcp_public_cidrs_access_enabled = true
+    cidr_blocks {
+      display_name = "Everywhere"
+      cidr_block   = "0.0.0.0/0"
+    }
+
+    # TODO: Pin this to https://api.github.com/meta
+    # GitHub recommends against doing this, so maybe there's a more effective way, perhaps a certain scale with a tail?
+    # cidr_blocks {}
+  }
+
+  private_cluster_config {
+    enable_private_nodes    = true
+    enable_private_endpoint = false
+    master_ipv4_cidr_block  = var.master_ipv4_cidr_block
+    master_global_access_config {
+      enabled = true
+    }
+    # This doesn't do what you think it does
+    # private_endpoint_subnetwork = var.subnetwork
+  }
+
+  dns_config {
+    # Enable more efficient DNS resolution by leveraging the GCP backplane (instead of kube-dns)
+    # Technically this adds cloud DNS billing, but the cost is negligible
+    # https://cloud.google.com/kubernetes-engine/docs/how-to/cloud-dns
+    cluster_dns       = "CLOUD_DNS"
+    cluster_dns_scope = "CLUSTER_SCOPE"
+  }
+
+  # TODO: These probably could be configurable
+  addons_config {
+    http_load_balancing {
+      disabled = false
+    }
+    gke_backup_agent_config {
+      enabled = false
+    }
+    config_connector_config {
+      enabled = false
+    }
+    gcs_fuse_csi_driver_config {
+      enabled = true
+    }
+  }
+
+  monitoring_config {
+    enable_components = ["SYSTEM_COMPONENTS", "APISERVER", "SCHEDULER", "CONTROLLER_MANAGER", "STORAGE", "POD"]
+    managed_prometheus {
+      enabled = true
+    }
+  }
+
+  # This can't hurt... right?
+  cost_management_config {
+    enabled = true
+  }
+
+  timeouts {
+    create = "30m"
+    update = "30m"
+    delete = "30m"
+  }
+
+  lifecycle {
+    ignore_changes = [initial_node_count]
+  }
+
+  depends_on = [google_service_account.cluster_default]
+}
+
+# One node pool per entry of var.pools; the map key becomes the pool name.
+resource "google_container_node_pool" "pools" {
+  for_each = var.pools
+  provider = google-beta
+
+  name     = each.key
+  cluster  = google_container_cluster.this.name
+  project  = var.project
+  location = google_container_cluster.this.location
+
+  network_config {
+    enable_private_nodes = false
+    create_pod_range     = true
+    pod_ipv4_cidr_block  = null
+  }
+
+  node_config {
+    service_account = google_service_account.cluster_default.email
+    image_type      = "COS_CONTAINERD"
+    machine_type    = each.value.machine_type
+    workload_metadata_config {
+      # Run the GKE metadata server on these nodes (required for workload identity)
+      mode = "GKE_METADATA"
+    }
+    metadata = {
+      disable-legacy-endpoints = true
+      block-project-ssh-keys   = true
+    }
+
+    # Tag matched by target_tags of google_compute_firewall.master_webhook.
+    tags = [local.node_network_tag]
+
+    disk_type    = each.value.disk_type
+    disk_size_gb = each.value.disk_size
+
+    dynamic "ephemeral_storage_local_ssd_config" {
+      for_each = each.value.ephemeral_storage_local_ssd_count > 0 ? [1] : []
+      content {
+        local_ssd_count = each.value.ephemeral_storage_local_ssd_count
+      }
+    }
+
+    # Don't set legacy scopes
+    # oauth_scopes = []
+
+    # Enable google vNIC driver
+    gvnic {
+      enabled = true
+    }
+
+    # Enable google container filesystem (required for image streaming)
+    gcfs_config {
+      enabled = true
+    }
+
+    dynamic "sandbox_config" {
+      for_each = each.value.gvisor ? [1] : []
+      content {
+        sandbox_type = "gvisor"
+      }
+    }
+
+    spot            = each.value.spot
+    labels          = each.value.labels
+    resource_labels = merge(local.default_labels, local.squad_label)
+
+    dynamic "taint" {
+      for_each = each.value.taints
+      content {
+        key    = taint.value.key
+        value  = taint.value.value
+        effect = taint.value.effect
+      }
+    }
+  }
+
+  autoscaling {
+    min_node_count = each.value.min_node_count
+    max_node_count = each.value.max_node_count
+  }
+
+  management {
+    auto_repair  = true
+    auto_upgrade = true
+  }
+}
+
+# Allow GKE master to hit non 443 ports for webhook/admission controllers
+#
+# https://github.com/kubernetes/kubernetes/issues/79739
+resource "google_compute_firewall" "master_webhook" {
+  project = var.project
+  network = var.network
+
+  name        = "${var.name}-master-webhook"
+  description = "Allow GKE master to hit non 443 ports for webhook/admission controllers"
+  direction   = "INGRESS"
+
+  # The control plane of a private cluster reaches nodes from
+  # master_ipv4_cidr_block, so allow that range; the public endpoint /32 is
+  # kept so the rule stays a superset of the previous one.
+  source_ranges = compact([
+    google_container_cluster.this.private_cluster_config[0].master_ipv4_cidr_block,
+    "${google_container_cluster.this.endpoint}/32",
+  ])
+  source_tags = []
+  target_tags = [local.node_network_tag]
+
+  allow {
+    protocol = "tcp"
+    ports = [
+      "8443",
+      "9443",
+      "15017",
+    ]
+  }
+
+  depends_on = [google_container_cluster.this]
+}
diff --git a/modules/gke/outputs.tf b/modules/gke/outputs.tf
new file mode 100644
index 00000000..b605a9a2
--- /dev/null
+++ b/modules/gke/outputs.tf
@@ -0,0 +1,26 @@
+output "cluster_name" {
+  description = "Name of the GKE cluster."
+  value       = google_container_cluster.this.name
+}
+
+output "cluster_id" {
+  description = "Provider-assigned ID of the GKE cluster."
+  value       = google_container_cluster.this.id
+}
+
+output "service_account_email" {
+  description = "Email of the service account used by the cluster's nodes."
+  value       = google_service_account.cluster_default.email
+}
+
+output "cluster_endpoint" {
+  description = "Endpoint of the cluster's control plane."
+  value       = google_container_cluster.this.endpoint
+  sensitive   = true
+}
+
+output "cluster_ca_certificate" {
+  description = "Cluster CA certificate as exposed by the cluster's master_auth block."
+  value       = google_container_cluster.this.master_auth[0].cluster_ca_certificate
+  sensitive   = true
+}
diff --git a/modules/gke/variables.tf b/modules/gke/variables.tf
new file mode 100644
index 00000000..65259576
--- /dev/null
+++ b/modules/gke/variables.tf
@@
-0,0 +1,128 @@
+variable "name" {
+  description = "Name of the cluster; also used as a prefix for the service account and firewall rule names."
+}
+
+variable "project" {
+  description = "ID of the GCP project that all resources are created in."
+}
+
+variable "network" {
+  description = "VPC network used by the cluster and by the webhook firewall rule."
+}
+
+variable "region" {
+  description = "Always create a regional cluster since GKE doesn't charge differently for regional/zonal clusters. Rather, we configure the node locations using `var.zones`"
+}
+
+variable "require_squad" {
+  description = "Whether to require squad variable to be specified"
+  type        = bool
+  default     = true
+}
+
+variable "squad" {
+  description = "squad label to apply to the service."
+  type        = string
+  default     = ""
+
+  validation {
+    condition     = !var.require_squad || var.squad != ""
+    error_message = "The squad variable must be specified, or the check disabled by setting require_squad = false."
+  }
+}
+
+variable "zones" {
+  default     = null
+  description = "If specified, will spread nodes across these zones"
+}
+
+variable "subnetwork" {
+  description = "Subnetwork of var.network that the cluster is attached to."
+}
+
+variable "master_ipv4_cidr_block" {
+  description = "If specified, will use this CIDR block for the master's IP address"
+}
+
+# Every attribute is optional; omitted attributes take the defaults declared below.
+variable "pools" {
+  description = "Node pools to create, keyed by node pool name."
+  type = map(object({
+    min_node_count                    = optional(number, 1)
+    max_node_count                    = optional(number, 1)
+    machine_type                      = optional(string, "c3-standard-4")
+    disk_type                         = optional(string, "pd-balanced")
+    disk_size                         = optional(number, 100)
+    ephemeral_storage_local_ssd_count = optional(number, 0)
+    spot                              = optional(bool, false)
+    gvisor                            = optional(bool, false)
+    labels                            = optional(map(string), {})
+    taints = optional(list(object({
+      key    = string
+      value  = string
+      effect = string
+    })), [])
+  }))
+}
+
+variable "extra_roles" {
+  type        = map(string)
+  default     = {}
+  description = "Extra roles to add to the cluster's default service account"
+}
+
+variable "release_channel" {
+  type        = string
+  default     = "REGULAR"
+  description = "GKE release channel"
+}
+
+variable "cluster_autoscaling" {
+  type        = bool
+  default     = false
+  description = "Enabling of node auto-provisioning"
+}
+
+variable "cluster_autoscaling_cpu_limits" {
+  type = object({
+    resource_type = optional(string, "cpu")
+    minimum       = optional(number, 4)
+    maximum       = optional(number, 10)
+  })
+  default     = {}
+  description = "cluster autoscaling cpu limits"
+}
+
+variable "cluster_autoscaling_memory_limits" {
+  type = object({
+    resource_type = optional(string, "memory")
+    minimum       = optional(number, 8)
+    maximum       = optional(number, 80)
+  })
+  # Must default to an object, not null: the cluster reads these attributes whenever var.cluster_autoscaling is true.
+  default     = {}
+  description = "cluster autoscaling memory limits"
+}
+
+variable "cluster_autoscaling_provisioning_defaults" {
+  type = object({
+    disk_size = optional(number, null)
+    disk_type = optional(string, null)
+    shielded_instance_config = optional(object({
+      enable_secure_boot          = optional(bool, null)
+      enable_integrity_monitoring = optional(bool, null)
+    }), null)
+    management = optional(object({
+      auto_upgrade = optional(bool, null)
+      auto_repair  = optional(bool, null)
+    }), null)
+  })
+  default     = null
+  description = "cluster autoscaling provisioning defaults"
+}
+
+variable "cluster_autoscaling_profile" {
+  type        = string
+  default     = null
+  description = "cluster autoscaling profile"
+}