From 2f56d49726c6a1e1982ec8952145ff20bd809481 Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Fri, 25 Jan 2019 10:58:03 -0500 Subject: [PATCH 01/13] Set metrics namespace and app versions --- infrastructure/terraform/keep-dev/variables.tf | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/infrastructure/terraform/keep-dev/variables.tf b/infrastructure/terraform/keep-dev/variables.tf index 978644deae..db24c350fc 100644 --- a/infrastructure/terraform/keep-dev/variables.tf +++ b/infrastructure/terraform/keep-dev/variables.tf @@ -161,3 +161,20 @@ variable "atlantis_ip_address_type" { description = "Internet facing or not. internal or external" default = "external" } + +# gke_metrics +variable "gke_metrics_namespace" { + default = "metrics" +} + +variable "kube_state_metrics" { + default { + version = "0.13.0" + } +} + +variable "prometheus_to_sd" { + default { + version = "0.1.1" + } +} From 42a109933644db90f8565b224d09d832925775cd Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Fri, 25 Jan 2019 10:59:25 -0500 Subject: [PATCH 02/13] Install gke_metrics for keep-dev --- infrastructure/terraform/keep-dev/main.tf | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index 0eced7220f..c79abcb8f0 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -149,3 +149,16 @@ resource "google_compute_global_address" "atlantis_external_ip" { address_type = "${upper(var.atlantis_ip_address_type)}" labels = "${local.labels}" } + +module "demo_dev_gke_cluster_metrics" { + source = "git@github.com:thesis/infrastructure.git//terraform/modules/gke_metrics" + namespace = "${var.gke_metrics_namespace}" + + kube_state_metrics { + version = "${var.kube_state_metrics["version"]}" + } + + prometheus_to_sd { + version = "${var.prometheus_to_sd["version"]}" + } +} From d569c000b6f71177714431f13d12e3700cc0da2a Mon Sep 17 00:00:00 
2001 From: sthompson22 Date: Fri, 25 Jan 2019 11:13:17 -0500 Subject: [PATCH 03/13] Standalone tf config for openvpn I'm going to wrap this in a module eventually. For now I've configured openvpn in a separate config file to keep it from the perm configs. --- .../terraform/keep-dev/helm-release.tf | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 infrastructure/terraform/keep-dev/helm-release.tf diff --git a/infrastructure/terraform/keep-dev/helm-release.tf b/infrastructure/terraform/keep-dev/helm-release.tf new file mode 100644 index 0000000000..bc3ea3bf8f --- /dev/null +++ b/infrastructure/terraform/keep-dev/helm-release.tf @@ -0,0 +1,18 @@ +# .tf file for configuring helm releases + +resource "helm_release" "openvpn" { + name = "helm-openvpn" + namespace = "default" + chart = "stable/openvpn" + version = "3.10.0" + + set { + name = "openvpn.redirectGateway" + value = "false" + } + + set { + name = "openvpn.conf" + value = "push \"route 172.16.0.0 255.255.255.240\"" + } +} From 041434708741470222c3dcb6a8eae36f44c1560d Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Mon, 11 Feb 2019 14:38:49 -0500 Subject: [PATCH 04/13] Explicit provider version definitions Providers are our interface into the various APIs Terraform needs to interact with to provision our infrastructure. They're all managed and versioned independently. Without versioning our providers we run the risk of breaking our currently deployed infra by a provider update. Here we version all providers, and roll back the template provider from version 2.0.0 to 1.0.0 to prevent some unsavory behavior in the Google managed NAT module. 
--- infrastructure/terraform/keep-dev/provider.tf | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/keep-dev/provider.tf b/infrastructure/terraform/keep-dev/provider.tf index 2d248b3c11..978cb2eaa9 100644 --- a/infrastructure/terraform/keep-dev/provider.tf +++ b/infrastructure/terraform/keep-dev/provider.tf @@ -2,16 +2,17 @@ data "google_client_config" "default" {} # Configure the Google Cloud provider provider "google" { - version = "~> 1.19" + version = "<= 1.19.0" region = "${var.region_data["region"]}" } provider "google-beta" { - version = "~> 1.19" + version = "<= 1.19.0" region = "${var.region_data["region"]}" } provider "kubernetes" { + version = "<= 1.5.0" load_config_file = false host = "https://${var.gke_cluster["master_private_endpoint"]}" token = "${data.google_client_config.default.access_token}" @@ -24,6 +25,8 @@ module "helm_provider_helper" { } provider "helm" { + version = "<= 0.7.0" + kubernetes { host = "https://${var.gke_cluster["master_private_endpoint"]}" token = "${data.google_client_config.default.access_token}" @@ -36,3 +39,15 @@ provider "helm" { namespace = "${module.helm_provider_helper.tiller_namespace}" install_tiller = true } + +provider "null" { + version = "<= 2.0.0" +} + +provider "random" { + version = "<= 2.0.0" +} + +provider "template" { + version = "<= 1.0.0" +} From f6da6a48102082d6720a2767bc54f37f12b86b1d Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Tue, 12 Feb 2019 09:01:41 -0500 Subject: [PATCH 05/13] Sourcing from local as a temp measure to observe some terraform plan changes on the flyyyyy --- infrastructure/terraform/keep-dev/main.tf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index c79abcb8f0..479a2b5a1a 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -150,8 +150,10 @@ resource 
"google_compute_global_address" "atlantis_external_ip" { labels = "${local.labels}" } -module "demo_dev_gke_cluster_metrics" { - source = "git@github.com:thesis/infrastructure.git//terraform/modules/gke_metrics" +module "gke_cluster_metrics" { + source = "../../../../thesis/infrastructure/terraform/modules/gke_metrics" + + #source = "git@github.com:thesis/infrastructure.git//terraform/modules/gke_metrics" namespace = "${var.gke_metrics_namespace}" kube_state_metrics { From 30df0cf2112d5cc922058fd7e59b6c47a1c58a53 Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Tue, 12 Feb 2019 09:49:13 -0500 Subject: [PATCH 06/13] Revert "Sourcing from local as a temp measure to observe some terraform plan changes on the flyyyyy" This reverts commit f6da6a48102082d6720a2767bc54f37f12b86b1d. --- infrastructure/terraform/keep-dev/main.tf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index 479a2b5a1a..c79abcb8f0 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -150,10 +150,8 @@ resource "google_compute_global_address" "atlantis_external_ip" { labels = "${local.labels}" } -module "gke_cluster_metrics" { - source = "../../../../thesis/infrastructure/terraform/modules/gke_metrics" - - #source = "git@github.com:thesis/infrastructure.git//terraform/modules/gke_metrics" +module "demo_dev_gke_cluster_metrics" { + source = "git@github.com:thesis/infrastructure.git//terraform/modules/gke_metrics" namespace = "${var.gke_metrics_namespace}" kube_state_metrics { From 2140ca9e6231f7b9f132f23765e562e95f8f3842 Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Tue, 12 Feb 2019 09:51:03 -0500 Subject: [PATCH 07/13] Set GKE logging beta service For our current implementation of metrics to work we need to enable the GKE "beta monitoring and logging experience". While prototyping in keep-dev I did this by hand. 
The default logging as configured by Terraform does not use the same logging service. While trying to reapply metrics configs to `keep-dev` via the new Terraform module, Terraform kept wanting to revert the logging service (this is bad). Here we set the proper logging service so that we can continue to configure metrics without Terraform wanting to revert the logger on every plan/apply run. --- infrastructure/terraform/keep-dev/main.tf | 3 ++- infrastructure/terraform/keep-dev/variables.tf | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index c79abcb8f0..3777b1fcd4 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -126,6 +126,7 @@ module "gke_cluster" { daily_maintenance_window_start_time = "${var.gke_cluster["daily_maintenance_window_start_time"]}" network_policy_enabled = "${var.gke_cluster["network_policy_enabled"]}" network_policy_provider = "${var.gke_cluster["network_policy_provider"]}" + logging_service = "${var.gke_cluster["logging_service"]}" } gke_node_pool { @@ -150,7 +151,7 @@ resource "google_compute_global_address" "atlantis_external_ip" { labels = "${local.labels}" } -module "demo_dev_gke_cluster_metrics" { +module "gke_cluster_metrics" { source = "git@github.com:thesis/infrastructure.git//terraform/modules/gke_metrics" namespace = "${var.gke_metrics_namespace}" diff --git a/infrastructure/terraform/keep-dev/variables.tf b/infrastructure/terraform/keep-dev/variables.tf index db24c350fc..47ceb8f230 100644 --- a/infrastructure/terraform/keep-dev/variables.tf +++ b/infrastructure/terraform/keep-dev/variables.tf @@ -120,6 +120,7 @@ variable "gke_cluster" { daily_maintenance_window_start_time = "00:00" network_policy_enabled = true network_policy_provider = "CALICO" + logging_service = "logging.googleapis.com/kubernetes" } } From 8895c73c9412e5774e43f061043dca99f57d17dd Mon Sep 17 00:00:00 2001 
From: sthompson22 Date: Tue, 12 Feb 2019 15:00:56 -0500 Subject: [PATCH 08/13] Remove stand alone helm-release file, will sort proper resource segmentation later --- .../terraform/keep-dev/helm-release.tf | 18 ------------------ infrastructure/terraform/keep-dev/main.tf | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 18 deletions(-) delete mode 100644 infrastructure/terraform/keep-dev/helm-release.tf diff --git a/infrastructure/terraform/keep-dev/helm-release.tf b/infrastructure/terraform/keep-dev/helm-release.tf deleted file mode 100644 index bc3ea3bf8f..0000000000 --- a/infrastructure/terraform/keep-dev/helm-release.tf +++ /dev/null @@ -1,18 +0,0 @@ -# .tf file for configuring helm releases - -resource "helm_release" "openvpn" { - name = "helm-openvpn" - namespace = "default" - chart = "stable/openvpn" - version = "3.10.0" - - set { - name = "openvpn.redirectGateway" - value = "false" - } - - set { - name = "openvpn.conf" - value = "push \"route 172.16.0.0 255.255.255.240\"" - } -} diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index 3777b1fcd4..4d0c123557 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -163,3 +163,21 @@ module "gke_cluster_metrics" { version = "${var.prometheus_to_sd["version"]}" } } + +# OpenVPN +resource "helm_release" "openvpn" { + name = "helm-openvpn" + namespace = "default" + chart = "stable/openvpn" + version = "3.10.0" + + set { + name = "openvpn.redirectGateway" + value = "false" + } + + set { + name = "openvpn.conf" + value = "push \"route 172.16.0.0 255.255.255.240\"" + } +} From 4ef1691fbc02f3181e2864b18826e1ab70376b5b Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Wed, 13 Feb 2019 10:08:25 -0500 Subject: [PATCH 09/13] Metrics currently creates a data read / change on each TF plan/apply Using this module will create a data read and an update for the prometheus-to-sd resource on each Terraform plan and apply run. 
These updates will do nothing and are an artifact of the depends_on in the modules data resource. Terraform team is aware and have a proposed fix in the works. --- infrastructure/terraform/keep-dev/main.tf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index 4d0c123557..1813b43747 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -151,6 +151,12 @@ resource "google_compute_global_address" "atlantis_external_ip" { labels = "${local.labels}" } +/* Using this module will create a data read and an update for the + * prometheus-to-sd resource on each Terraform planand apply run. These + * updates will do nothing and are an artifact of the depends_on in the + * modules data resource. Terraform team is aware and have a proposed fix + * in the works. +*/ module "gke_cluster_metrics" { source = "git@github.com:thesis/infrastructure.git//terraform/modules/gke_metrics" namespace = "${var.gke_metrics_namespace}" From a95b7ae11d7a157b4f5aac125ece47def5c05404 Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Wed, 13 Feb 2019 10:51:16 -0500 Subject: [PATCH 10/13] Docu section for gke-metrics --- .../terraform/keep-dev/tf-setup.org | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/infrastructure/terraform/keep-dev/tf-setup.org b/infrastructure/terraform/keep-dev/tf-setup.org index f03ad834b7..7f0e0e09c1 100644 --- a/infrastructure/terraform/keep-dev/tf-setup.org +++ b/infrastructure/terraform/keep-dev/tf-setup.org @@ -1853,3 +1853,148 @@ terraform output \ #+RESULTS: +** DONE gke-metrics +Configures monitoring and metrics collection for a GKE cluster. + +NOTE: Using this module will create a data read and an update for the +prometheus-to-sd resource on each Terraform planand apply run. These +updates will do nothing and are an artifact of the depends_on in the +modules data resource. 
Terraform team is aware and have a proposed fix +in the works. + +*** Describe +#+BEGIN_SRC sh :results pp +date -u +echo `whoami` "\n" + +kubectl describe \ +service helm-kube-state-metrics \ +--namespace metrics + +echo "-------" + +kubectl describe \ +deployment helm-kube-state-metrics \ +--namespace metrics + +echo "-------" + +kubectl describe \ +deployment helm-prometheus-to-sd \ +--namespace metrics +#+END_SRC + +#+RESULTS: +#+begin_example +Wed Feb 13 15:50:08 UTC 2019 +sthompson22 + +Name: helm-kube-state-metrics +Namespace: metrics +Labels: app=kube-state-metrics + chart=kube-state-metrics-0.13.0 + heritage=Tiller + release=helm-kube-state-metrics +Annotations: prometheus.io/scrape: true +Selector: app=kube-state-metrics,release=helm-kube-state-metrics +Type: ClusterIP +IP: 10.102.100.169 +Port: http 8080/TCP +TargetPort: 8080/TCP +Endpoints: 10.102.3.15:8080 +Session Affinity: None +Events: +------- +Name: helm-kube-state-metrics +Namespace: metrics +CreationTimestamp: Tue, 12 Feb 2019 11:18:51 -0500 +Labels: app=kube-state-metrics + chart=kube-state-metrics-0.13.0 + heritage=Tiller + release=helm-kube-state-metrics +Annotations: deployment.kubernetes.io/revision: 1 +Selector: app=kube-state-metrics,release=helm-kube-state-metrics +Replicas: 1 desired | 1 updated | 1 total | 1 available | 0 unavailable +StrategyType: RollingUpdate +MinReadySeconds: 0 +RollingUpdateStrategy: 1 max unavailable, 1 max surge +Pod Template: + Labels: app=kube-state-metrics + release=helm-kube-state-metrics + Service Account: helm-kube-state-metrics + Containers: + kube-state-metrics: + Image: quay.io/coreos/kube-state-metrics:v1.4.0 + Port: 8080/TCP + Host Port: 0/TCP + Args: + --collectors=configmaps + --collectors=cronjobs + --collectors=daemonsets + --collectors=deployments + --collectors=endpoints + --collectors=horizontalpodautoscalers + --collectors=jobs + --collectors=limitranges + --collectors=namespaces + --collectors=nodes + --collectors=persistentvolumeclaims + 
--collectors=persistentvolumes + --collectors=pods + --collectors=replicasets + --collectors=replicationcontrollers + --collectors=resourcequotas + --collectors=secrets + --collectors=services + --collectors=statefulsets + Readiness: http-get http://:8080/healthz delay=5s timeout=5s period=10s #success=1 #failure=3 + Environment: + Mounts: + Volumes: +Conditions: + Type Status Reason + ---- ------ ------ + Available True MinimumReplicasAvailable + Progressing True NewReplicaSetAvailable +OldReplicaSets: +NewReplicaSet: helm-kube-state-metrics-898cb4bf7 (1/1 replicas created) +Events: +------- +Name: helm-prometheus-to-sd +Namespace: metrics +CreationTimestamp: Tue, 12 Feb 2019 11:18:57 -0500 +Labels: app=prometheus-to-sd + chart=prometheus-to-sd-0.1.1 + heritage=Tiller + release=helm-prometheus-to-sd +Annotations: deployment.kubernetes.io/revision: 1 +Selector: app=prometheus-to-sd,release=helm-prometheus-to-sd +Replicas: 1 desired | 1 updated | 1 total | 1 available | 0 unavailable +StrategyType: RollingUpdate +MinReadySeconds: 0 +RollingUpdateStrategy: 25% max unavailable, 25% max surge +Pod Template: + Labels: app=prometheus-to-sd + release=helm-prometheus-to-sd + Containers: + prometheus-to-sd: + Image: gcr.io/google-containers/prometheus-to-sd:v0.2.2 + Port: 6060/TCP + Host Port: 0/TCP + Command: + /monitor + --stackdriver-prefix=custom.googleapis.com + --source=kube-state-metrics:http://10.102.100.169:8080 + Environment: + Mounts: + Volumes: +Conditions: + Type Status Reason + ---- ------ ------ + Available True MinimumReplicasAvailable + Progressing True NewReplicaSetAvailable +OldReplicaSets: +NewReplicaSet: helm-prometheus-to-sd-7447796c5 (1/1 replicas created) +Events: +#+end_example + From e9a9aa3d60291f2c8c27797f6a562b49a8bf1d34 Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Wed, 13 Feb 2019 11:46:39 -0500 Subject: [PATCH 11/13] Set helm keyring to empty string to prevent constant updates --- infrastructure/terraform/keep-dev/main.tf | 1 + 1 file 
changed, 1 insertion(+) diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index 1813b43747..0e74f55e12 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -176,6 +176,7 @@ resource "helm_release" "openvpn" { namespace = "default" chart = "stable/openvpn" version = "3.10.0" + keyring = "" set { name = "openvpn.redirectGateway" From a707c2498a27b5f33cefab1e28db2aff07630a82 Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Thu, 21 Feb 2019 21:01:13 -0500 Subject: [PATCH 12/13] Move to openvpn install via module --- infrastructure/terraform/keep-dev/main.tf | 24 +++++++++-------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/infrastructure/terraform/keep-dev/main.tf b/infrastructure/terraform/keep-dev/main.tf index 0e74f55e12..46cb8d1fae 100644 --- a/infrastructure/terraform/keep-dev/main.tf +++ b/infrastructure/terraform/keep-dev/main.tf @@ -121,7 +121,6 @@ module "gke_cluster" { gke_cluster { name = "${var.gke_cluster["name"]}" private_cluster = "${var.gke_cluster["private_cluster"]}" - subnetwork = "${module.vpc.vpc_private_subnet_self_link}" master_ipv4_cidr_block = "${var.gke_cluster["master_ipv4_cidr_block"]}" daily_maintenance_window_start_time = "${var.gke_cluster["daily_maintenance_window_start_time"]}" network_policy_enabled = "${var.gke_cluster["network_policy_enabled"]}" @@ -170,21 +169,16 @@ module "gke_cluster_metrics" { } } -# OpenVPN -resource "helm_release" "openvpn" { - name = "helm-openvpn" - namespace = "default" - chart = "stable/openvpn" - version = "3.10.0" - keyring = "" - - set { - name = "openvpn.redirectGateway" - value = "false" +module "openvpn" { + source = "git@github.com:thesis/infrastructure.git//terraform/modules/helm_openvpn" + + openvpn { + name = "${var.openvpn["name"]}" + version = "${var.openvpn["version"]}" } - set { - name = "openvpn.conf" - value = "push \"route 172.16.0.0 255.255.255.240\"" + 
openvpn_parameters { + route_all_traffic_through_vpn = "${var.openvpn_parameters["route_all_traffic_through_vpn"]}" + gke_master_ipv4_cidr_address = "${var.openvpn_parameters["gke_master_ipv4_cidr_address"]}" } } From 0d98659fc10d9178f64194367ef494af23caf9b7 Mon Sep 17 00:00:00 2001 From: sthompson22 Date: Thu, 21 Feb 2019 21:02:23 -0500 Subject: [PATCH 13/13] Disable network policies A recent GKE upgrade resulted in openvpn connections not working. Specifically, a client can connect to the vpn but requests to the tun0 interface are not routed. After exhaustive research we could not find a misconfiguration between server and client. Because the demo-dev environment continued to work post upgrade and does not have network policies enabled we rebuilt the cluster with policies disabled. This corrected the openvpn connection. For now we're disabling network policies so that we can use the environment. We need to look into Calico routing. --- infrastructure/terraform/keep-dev/variables.tf | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/keep-dev/variables.tf b/infrastructure/terraform/keep-dev/variables.tf index 47ceb8f230..51c063db6a 100644 --- a/infrastructure/terraform/keep-dev/variables.tf +++ b/infrastructure/terraform/keep-dev/variables.tf @@ -118,8 +118,8 @@ variable "gke_cluster" { master_ipv4_cidr_block = "172.16.0.0/28" master_private_endpoint = "172.16.0.2" daily_maintenance_window_start_time = "00:00" - network_policy_enabled = true - network_policy_provider = "CALICO" + network_policy_enabled = false + network_policy_provider = "PROVIDER_UNSPECIFIED" logging_service = "logging.googleapis.com/kubernetes" } } @@ -179,3 +179,17 @@ variable "prometheus_to_sd" { version = "0.1.1" } } + +variable "openvpn" { + default { + name = "helm-openvpn" + version = "3.10.0" + } +} + +variable "openvpn_parameters" { + default { + route_all_traffic_through_vpn = "false" + gke_master_ipv4_cidr_address = "172.16.0.0" 
+ } +}