From 906e2d3e8d84230d055fc1610ced904c24dff3c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Wed, 5 Jun 2024 14:22:24 +0200 Subject: [PATCH 1/7] Add agentTLSMode option Fleet now supports two distinct TLS mode for its agent when registering against an upstream cluster: * `system-store`, the default, does not change its current behaviour: the Fleet agent trusts any certificate signed by a CA found in its system store * `strict`, to bypass the system store when validating a certificate. --- charts/fleet-agent/templates/configmap.yaml | 3 +- charts/fleet-agent/values.yaml | 4 + charts/fleet/templates/configmap.yaml | 1 + charts/fleet/values.yaml | 4 + dev/setup-fleet-downstream | 34 +++-- e2e/multi-cluster/installation/agent_test.go | 116 ++++++++++++++++++ e2e/multi-cluster/installation/suite_test.go | 28 +++++ internal/cmd/agent/register/register.go | 31 +++-- .../agentmanagement/agent/config.go | 14 ++- .../controllers/cluster/import.go | 29 +++-- internal/config/config.go | 7 ++ 11 files changed, 234 insertions(+), 37 deletions(-) create mode 100644 e2e/multi-cluster/installation/agent_test.go create mode 100644 e2e/multi-cluster/installation/suite_test.go diff --git a/charts/fleet-agent/templates/configmap.yaml b/charts/fleet-agent/templates/configmap.yaml index ce61a87568..f3e83a89cc 100644 --- a/charts/fleet-agent/templates/configmap.yaml +++ b/charts/fleet-agent/templates/configmap.yaml @@ -8,5 +8,6 @@ data: {{ if .Values.labels }} "labels":{{toJson .Values.labels}}, {{ end }} - "clientID":"{{.Values.clientID}}" + "clientID":"{{.Values.clientID}}", + "agentTLSMode": "{{.Values.agentTLSMode}}" } diff --git a/charts/fleet-agent/values.yaml b/charts/fleet-agent/values.yaml index df293e1862..47cb0be226 100644 --- a/charts/fleet-agent/values.yaml +++ b/charts/fleet-agent/values.yaml @@ -11,6 +11,10 @@ apiServerURL: "" # If left empty it is assumed this Kubernetes API TLS is signed by a well known CA. apiServerCA: "" +# Determines whether the agent should trust CA bundles from the operating system's trust store when connecting to a +# management cluster. True in `system-store` mode, false in `strict` mode. +agentTLSMode: "system-store" + # The cluster registration value token: "" diff --git a/charts/fleet/templates/configmap.yaml b/charts/fleet/templates/configmap.yaml index 4596405cf5..a801cb6024 100644 --- a/charts/fleet/templates/configmap.yaml +++ b/charts/fleet/templates/configmap.yaml @@ -11,6 +11,7 @@ data: "apiServerURL": "{{.Values.apiServerURL}}", "apiServerCA": "{{b64enc .Values.apiServerCA}}", "agentCheckinInterval": "{{.Values.agentCheckinInterval}}", + "agentTLSMode": "{{.Values.agentTLSMode}}", "ignoreClusterRegistrationLabels": {{.Values.ignoreClusterRegistrationLabels}}, "bootstrap": { "paths": "{{.Values.bootstrap.paths}}", diff --git a/charts/fleet/values.yaml b/charts/fleet/values.yaml index a9a17bc060..9bedc76091 100644 --- a/charts/fleet/values.yaml +++ b/charts/fleet/values.yaml @@ -16,6 +16,10 @@ apiServerURL: "" # If left empty it is assumed this Kubernetes API TLS is signed by a well known CA. apiServerCA: "" +# Determines whether the agent should trust CA bundles from the operating system's trust store when connecting to a +# management cluster. True in `system-store` mode, false in `strict` mode. 
+agentTLSMode: "system-store" + # A duration string for how often agents should report a heartbeat agentCheckinInterval: "15m" diff --git a/dev/setup-fleet-downstream b/dev/setup-fleet-downstream index 9a15759e31..c7c29f3437 100755 --- a/dev/setup-fleet-downstream +++ b/dev/setup-fleet-downstream @@ -3,14 +3,15 @@ set -euxo pipefail -if [ ! -d ./charts/fleet ]; then - echo "please change the current directory to the fleet repo checkout" - exit 1 -fi +root_dir=$(git rev-parse --show-toplevel) +cd "$root_dir" upstream_ctx="${FLEET_E2E_CLUSTER-k3d-upstream}" downstream_ctx="${FLEET_E2E_CLUSTER_DOWNSTREAM-k3d-downstream}" ns="${FLEET_E2E_NS_DOWNSTREAM-fleet-local}" +force_empty_ca="${FORCE_EMPTY_AGENT_CA-''}" +api_server_url="${FORCE_API_SERVER_URL-''}" +agent_tls_mode="${AGENT_TLS_MODE-system-store}" kubectl create ns "$ns"|| true @@ -45,27 +46,34 @@ kubectl wait clusterregistrationtoken -n "$ns" --for=jsonpath='{.status.secretNa token=$(kubectl get secret -n "$ns" second-token -o go-template='{{index .data "values" | base64decode}}' | yq .token -) ca="" -serverver=$(kubectl version -ojson 2> /dev/null | jq '.serverVersion.minor' | sed 's/"//g') -if [ "$serverver" -gt 23 ]; then - ca=$(kubectl get secret -n cattle-fleet-system fleet-controller-bootstrap-token -o go-template='{{index .data "ca.crt" | base64decode}}') -else - name=$(kubectl get -n default sa default -o=jsonpath='{.secrets[0].name}') - ca=$(kubectl get -n default secret "$name" -o go-template='{{index .data "ca.crt" | base64decode}}') +if [ -z $force_empty_ca ]; then + serverver=$(kubectl version -ojson 2> /dev/null | jq '.serverVersion.minor' | sed 's/"//g') + if [ "$serverver" -gt 23 ]; then + ca=$(kubectl get secret -n cattle-fleet-system fleet-controller-bootstrap-token -o go-template='{{index .data "ca.crt" | base64decode}}') + else + name=$(kubectl get -n default sa default -o=jsonpath='{.secrets[0].name}') + ca=$(kubectl get -n default secret "$name" -o go-template='{{index .data "ca.crt" | base64decode}}') + fi fi # docker network inspect bridge -f '{{(index .IPAM.Config 0).Gateway}}' # public_hostname="${public_hostname-172.17.0.1.sslip.io}" -# works due to same network of k3d clustres and patched SAN cert +# works due to same network of k3d clusters and patched SAN cert public_hostname="${public_hostname-k3d-upstream-server-0}" +if [ -z $api_server_url ]; then + api_server_url="https://$public_hostname:6443" +fi + kubectl config use-context "$downstream_ctx" helm -n cattle-fleet-system upgrade --install --create-namespace --wait fleet-agent charts/fleet-agent \ --set-string labels.env=test \ --set apiServerCA="$ca" \ - --set apiServerURL="https://$public_hostname:6443" \ + --set apiServerURL="$api_server_url" \ --set clusterNamespace="$ns" \ - --set token="$token" + --set token="$token" \ + --set agentTLSMode="$agent_tls_mode" #--set systemRegistrationNamespace="fleet-clusters-system" \ #--set clientID="fake-random" \ # --set global.cattle.systemDefaultRegistry=public.ecr.aws/b3e3i8k2 \ diff --git a/e2e/multi-cluster/installation/agent_test.go b/e2e/multi-cluster/installation/agent_test.go new file mode 100644 index 0000000000..d2b18baa11 --- /dev/null +++ b/e2e/multi-cluster/installation/agent_test.go @@ -0,0 +1,116 @@ +package installation_test + +import ( + "os" + "os/exec" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/onsi/gomega/matchers" + "github.com/rancher/fleet/e2e/testenv/kubectl" +) + +var ( + agentMode string + kd kubectl.Command + setupCmd *exec.Cmd +) + +var _ = Describe("Fleet installation with TLS agent modes", func() { + BeforeEach(func() { + kd = env.Kubectl.Context(env.Downstream) + }) + + JustBeforeEach(func() { + cmd := exec.Command( + "helm", + "--kube-context", + "k3d-downstream", + "uninstall", + "fleet-agent", + "-n", + "cattle-fleet-system", + "--wait", + ) + _ = cmd.Run() // Ignore errors, Fleet might not be installed + + err := os.Setenv("FORCE_EMPTY_AGENT_CA", "yes") + Expect(err).ToNot(HaveOccurred()) + err = os.Setenv("FORCE_API_SERVER_URL", "https://google.com") + Expect(err).ToNot(HaveOccurred()) + + err = os.Setenv("AGENT_TLS_MODE", agentMode) + Expect(err).ToNot(HaveOccurred()) + + go func() { + setupCmd = exec.Command("../../../dev/setup-fleet-downstream") + _ = setupCmd.Run() + }() + }) + + Context("with non-strict agent TLS mode", func() { + When("fetching fleet-agent-register logs", func() { + BeforeEach(func() { + agentMode = "system-store" + }) + + It("reaches the server without cert issues", func() { + Eventually(func() bool { + logs, err := kd.Namespace("cattle-fleet-system").Logs( + "-l", + "app=fleet-agent", + "-c", + "fleet-agent-register", + "--tail=-1", + ) + if err != nil { + return false + } + + regexMatcher := matchers.MatchRegexpMatcher{ + Regexp: "Failed to register agent.*could not find the requested resource", + } + reachesServerWithoutCertIssue, err := regexMatcher.Match(logs) + if err != nil { + return false + } + + return reachesServerWithoutCertIssue + }).Should(BeTrue()) + }) + }) + }) + + Context("with strict agent TLS mode", func() { + When("fetching fleet-agent-register logs", func() { + BeforeEach(func() { + agentMode = "strict" + }) + + It("cannot reach the server because the cert is signed by an unknown authority", func() { + Eventually(func() bool { + logs, err := kd.Namespace("cattle-fleet-system").Logs( + "-l", + "app=fleet-agent", + "-c", + "fleet-agent-register", + "--tail=-1", + ) + if err != nil { + return false + } + + regexMatcher := matchers.MatchRegexpMatcher{ + Regexp: "Failed to register agent.*signed by unknown authority", + } + reachesServerWithoutCertIssue, err := regexMatcher.Match(logs) + if err != nil { + return false + } + + return reachesServerWithoutCertIssue + }).Should(BeTrue()) + }) + }) + }) +}) diff --git a/e2e/multi-cluster/installation/suite_test.go b/e2e/multi-cluster/installation/suite_test.go new file mode 100644 index 0000000000..2e8d083fd5 --- /dev/null +++ b/e2e/multi-cluster/installation/suite_test.go @@ -0,0 +1,28 @@ +// Package installation contains e2e tests deploying Fleet to multiple clusters. The tests use kubectl to apply +// manifests. Expectations are verified by checking cluster resources. +package installation_test + +import ( + "testing" + + "github.com/rancher/fleet/e2e/testenv" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func TestE2E(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "E2E Installation Suite for Multi-Cluster") +} + +var ( + env *testenv.Env +) + +var _ = BeforeSuite(func() { + SetDefaultEventuallyTimeout(testenv.Timeout) + testenv.SetRoot("../..") + + env = testenv.New() +}) diff --git a/internal/cmd/agent/register/register.go b/internal/cmd/agent/register/register.go index 95da54617f..dd23ddeb5e 100644 --- a/internal/cmd/agent/register/register.go +++ b/internal/cmd/agent/register/register.go @@ -167,7 +167,12 @@ func runRegistration(ctx context.Context, k8s coreInterface, namespace string) ( return nil, fmt.Errorf("looking up secret %s/%s: %w", namespace, config.AgentBootstrapConfigName, err) } - clientConfig := createClientConfigFromSecret(secret) + cfg, err := config.Lookup(ctx, secret.Namespace, config.AgentConfigName, k8s.ConfigMap()) + if err != nil { + return nil, fmt.Errorf("failed to look up client config %s/%s: %w", secret.Namespace, config.AgentConfigName, err) + } + + clientConfig := createClientConfigFromSecret(secret, cfg.AgentTLSMode == config.AgentTLSModeSystemStore) ns, _, err := clientConfig.Namespace() if err != nil { @@ -179,11 +184,6 @@ func runRegistration(ctx context.Context, k8s coreInterface, namespace string) ( return nil, err } - cfg, err := config.Lookup(ctx, secret.Namespace, config.AgentConfigName, k8s.ConfigMap()) - if err != nil { - return nil, fmt.Errorf("failed to look up client config %s/%s: %w", secret.Namespace, config.AgentConfigName, err) - } - fleetK8s, err := kubernetes.NewForConfig(kc) if err != nil { return nil, err @@ -318,15 +318,28 @@ func values(data map[string][]byte) map[string][]byte { // createClientConfigFromSecret reads the fleet-agent-bootstrap secret and // creates a clientConfig to access the upstream cluster -func createClientConfigFromSecret(secret *corev1.Secret) clientcmd.ClientConfig { +func createClientConfigFromSecret(secret *corev1.Secret, trustSystemStoreCAs bool) clientcmd.ClientConfig { data := values(secret.Data) apiServerURL := string(data[config.APIServerURLKey]) apiServerCA := data[config.APIServerCAKey] namespace := string(data[ClusterNamespace]) token := string(data[Token]) - if _, err := http.Get(apiServerURL); err == nil { - apiServerCA = nil + if trustSystemStoreCAs { // Save a request to the API server URL if system CAs are not to be trusted. + if _, err := http.Get(apiServerURL); err == nil { + apiServerCA = nil + } + } else { + // Bypass the OS trust store. 
+ err := os.Setenv("SSL_CERT_FILE", "/var/does-not-exist.pem") + if err != nil { + logrus.Errorf("failed to set env var SSL_CERT_FILE: %s", err.Error()) + } + + err = os.Setenv("SSL_CERT_DIR", "/var/does-not-exist-either") + if err != nil { + logrus.Errorf("failed to set env var SSL_CERT_DIR: %s", err.Error()) + } } cfg := clientcmdapi.Config{ diff --git a/internal/cmd/controller/agentmanagement/agent/config.go b/internal/cmd/controller/agentmanagement/agent/config.go index b8ad7b26cc..c88983a8f5 100644 --- a/internal/cmd/controller/agentmanagement/agent/config.go +++ b/internal/cmd/controller/agentmanagement/agent/config.go @@ -12,8 +12,9 @@ import ( ) type ConfigOptions struct { - Labels map[string]string - ClientID string + Labels map[string]string + ClientID string + AgentTLSMode string } func agentConfig(ctx context.Context, agentNamespace, controllerNamespace string, cg *client.Getter, opts *ConfigOptions) ([]runtime.Object, error) { @@ -32,13 +33,14 @@ func agentConfig(ctx context.Context, agentNamespace, controllerNamespace string return nil, err } - return configObjects(agentNamespace, opts.Labels, opts.ClientID) + return configObjects(agentNamespace, opts) } -func configObjects(controllerNamespace string, clusterLabels map[string]string, clientID string) ([]runtime.Object, error) { +func configObjects(controllerNamespace string, co *ConfigOptions) ([]runtime.Object, error) { cm, err := config.ToConfigMap(controllerNamespace, config.AgentConfigName, &config.Config{ - Labels: clusterLabels, - ClientID: clientID, + Labels: co.Labels, + ClientID: co.ClientID, + AgentTLSMode: co.AgentTLSMode, }) if err != nil { return nil, err diff --git a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go index bc49cf2403..b907c2fc44 100644 --- a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go +++ b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go @@ -240,7 +240,19 @@ func (i *importHandler) importCluster(cluster *fleet.Cluster, status fleet.Clust apiServerCA = cfg.APIServerCA } - restConfig, err := i.restConfigFromKubeConfig(secret.Data[config.KubeConfigSecretValueKey]) + if cfg.AgentTLSMode != config.AgentTLSModeStrict && cfg.AgentTLSMode != config.AgentTLSModeSystemStore { + return status, + fmt.Errorf( + "provided config value for agentTLSMode is none of [%q,%q]", + config.AgentTLSModeStrict, + config.AgentTLSModeSystemStore, + ) + } + + restConfig, err := i.restConfigFromKubeConfig( + secret.Data[config.KubeConfigSecretValueKey], + cfg.AgentTLSMode == config.AgentTLSModeSystemStore, + ) if err != nil { return status, err } @@ -307,8 +319,9 @@ func (i *importHandler) importCluster(cluster *fleet.Cluster, status fleet.Clust APIServerCA: apiServerCA, APIServerURL: apiServerURL, ConfigOptions: agent.ConfigOptions{ - ClientID: cluster.Spec.ClientID, - Labels: clusterLabels, + ClientID: cluster.Spec.ClientID, + Labels: clusterLabels, + AgentTLSMode: cfg.AgentTLSMode, }, ManifestOptions: agent.ManifestOptions{ AgentEnvVars: cluster.Spec.AgentEnvVars, @@ -389,8 +402,9 @@ func isLegacyAgentNamespaceSelectedByUser() bool { cfg.Bootstrap.AgentNamespace == config.LegacyDefaultNamespace } -// restConfigFromKubeConfig checks kubeconfig data and tries to connect to server. If server is behind public CA, remove CertificateAuthorityData in kubeconfig file. 
-func (i *importHandler) restConfigFromKubeConfig(data []byte) (*rest.Config, error) { +// restConfigFromKubeConfig checks kubeconfig data and tries to connect to server. If server is behind public CA, remove +// CertificateAuthorityData in kubeconfig file unless strict TLS mode is enabled. +func (i *importHandler) restConfigFromKubeConfig(data []byte, trustSystemStoreCAs bool) (*rest.Config, error) { clientConfig, err := clientcmd.NewClientConfigFromBytes(data) if err != nil { return nil, err @@ -401,11 +415,10 @@ func (i *importHandler) restConfigFromKubeConfig(data []byte) (*rest.Config, err return nil, err } - if raw.Contexts[raw.CurrentContext] != nil { + if trustSystemStoreCAs && raw.Contexts[raw.CurrentContext] != nil { cluster := raw.Contexts[raw.CurrentContext].Cluster if raw.Clusters[cluster] != nil { - _, err := http.Get(raw.Clusters[cluster].Server) - if err == nil { + if _, err := http.Get(raw.Clusters[cluster].Server); err == nil { raw.Clusters[cluster].CertificateAuthorityData = nil } } diff --git a/internal/config/config.go b/internal/config/config.go index e7ac629d2b..a0928ccaa8 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -20,6 +20,8 @@ const ( ManagerConfigName = "fleet-controller" AgentConfigName = "fleet-agent" AgentBootstrapConfigName = "fleet-agent-bootstrap" + AgentTLSModeStrict = "strict" + AgentTLSModeSystemStore = "system-store" Key = "config" // DefaultNamespace is the default for the system namespace, which // contains the manager and agent @@ -101,6 +103,11 @@ type Config struct { // IgnoreClusterRegistrationLabels if set to true, the labels on the cluster registration resource will not be copied to the cluster resource. IgnoreClusterRegistrationLabels bool `json:"ignoreClusterRegistrationLabels,omitempty"` + + // AgentTLSMode supports two values: `system-store` and `strict`. If set to `system-store`, instructs the agent + // to trust CA bundles from the operating system's store. If set to `strict`, then the agent shall only connect + // to a server which uses the exact CA configured when creating/updating the agent. + AgentTLSMode string `json:"agentTLSMode,omitempty"` } type Bootstrap struct { From bd77be82d48f417a6c1e941ec126fcbf6742151e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Tue, 11 Jun 2024 17:56:37 +0200 Subject: [PATCH 2/7] Add agent tests to multi-cluster e2e runs This adds agent TLS mode tests to the multi-cluster e2e workflow. --- .github/workflows/e2e-multicluster-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/e2e-multicluster-ci.yml b/.github/workflows/e2e-multicluster-ci.yml index f7b48c02da..ee4ded3427 100644 --- a/.github/workflows/e2e-multicluster-ci.yml +++ b/.github/workflows/e2e-multicluster-ci.yml @@ -135,6 +135,8 @@ jobs: run: | kubectl config use-context k3d-upstream ginkgo --github-output e2e/multi-cluster + + ginkgo --github-output e2e/multi-cluster/installation - name: Acceptance Tests for Examples if: > From 1dcfdfd0d5d89a30f8675d0ddebbc729c1e00616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Wed, 12 Jun 2024 17:16:34 +0200 Subject: [PATCH 3/7] Fix multi-cluster e2e tests for strict agent TLS mode This uses the Helm CLI to install the Fleet agent on the downstream cluster, instead of relying on an external script which is subject to changes. Logic used to delete previous Fleet agent installs now deletes the `cattle-fleet-system` namespace as well, which leaves a clean slate for further test cases. 
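For reviewers, the installation performed by each test case is roughly equivalent to the
following Helm invocation run from the repository root (the API server URL, CA and token
are deliberately dummy values: these tests only exercise certificate validation, not a
full registration):

    helm --kube-context k3d-downstream -n cattle-fleet-system \
      upgrade --install --create-namespace --wait \
      fleet-agent charts/fleet-agent \
      --set-string labels.env=test \
      --set apiServerCA= \
      --set apiServerURL=https://google.com \
      --set clusterNamespace=fleet-default \
      --set token=foo \
      --set agentTLSMode=strict   # or system-store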
--- dev/setup-fleet-downstream | 4 +- e2e/multi-cluster/installation/agent_test.go | 39 +++++++++++++++++--- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/dev/setup-fleet-downstream b/dev/setup-fleet-downstream index c7c29f3437..db91292965 100755 --- a/dev/setup-fleet-downstream +++ b/dev/setup-fleet-downstream @@ -9,8 +9,8 @@ cd "$root_dir" upstream_ctx="${FLEET_E2E_CLUSTER-k3d-upstream}" downstream_ctx="${FLEET_E2E_CLUSTER_DOWNSTREAM-k3d-downstream}" ns="${FLEET_E2E_NS_DOWNSTREAM-fleet-local}" -force_empty_ca="${FORCE_EMPTY_AGENT_CA-''}" -api_server_url="${FORCE_API_SERVER_URL-''}" +force_empty_ca="${FORCE_EMPTY_AGENT_CA-}" +api_server_url="${FORCE_API_SERVER_URL-}" agent_tls_mode="${AGENT_TLS_MODE-system-store}" kubectl create ns "$ns"|| true diff --git a/e2e/multi-cluster/installation/agent_test.go b/e2e/multi-cluster/installation/agent_test.go index d2b18baa11..a342145966 100644 --- a/e2e/multi-cluster/installation/agent_test.go +++ b/e2e/multi-cluster/installation/agent_test.go @@ -1,6 +1,7 @@ package installation_test import ( + "fmt" "os" "os/exec" @@ -13,7 +14,6 @@ import ( var ( agentMode string kd kubectl.Command - setupCmd *exec.Cmd ) var _ = Describe("Fleet installation with TLS agent modes", func() { @@ -30,11 +30,14 @@ var _ = Describe("Fleet installation with TLS agent modes", func() { "fleet-agent", "-n", "cattle-fleet-system", - "--wait", ) - _ = cmd.Run() // Ignore errors, Fleet might not be installed + out, err := cmd.CombinedOutput() + Expect(err).ToNot(HaveOccurred(), string(out)) - err := os.Setenv("FORCE_EMPTY_AGENT_CA", "yes") + deleteOut, err := kd.Delete("ns", "cattle-fleet-system", "--now") + Expect(err).ToNot(HaveOccurred(), deleteOut) + + err = os.Setenv("FORCE_EMPTY_AGENT_CA", "yes") Expect(err).ToNot(HaveOccurred()) err = os.Setenv("FORCE_API_SERVER_URL", "https://google.com") Expect(err).ToNot(HaveOccurred()) @@ -43,8 +46,32 @@ var _ = Describe("Fleet installation with TLS agent modes", func() { Expect(err).ToNot(HaveOccurred()) go func() { - setupCmd = exec.Command("../../../dev/setup-fleet-downstream") - _ = setupCmd.Run() + cmd := exec.Command( + "helm", + "--kube-context", + "k3d-downstream", + "-n", + "cattle-fleet-system", + "upgrade", + "--install", + "--create-namespace", + "--wait", + "fleet-agent", + "../../../charts/fleet-agent", + "--set-string", + "labels.env=test", + "--set", + `apiServerCA=`, + "--set", + `apiServerURL=https://google.com`, + "--set", + "clusterNamespace=fleet-default", + "--set", + "token=foo", // we don't need a correct token for this. + "--set", + fmt.Sprintf("agentTLSMode=%s", agentMode), + ) + _ = cmd.Run() }() }) From bcae247f23de737d8bb369d07cf4d14e39b09fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Wed, 12 Jun 2024 17:23:29 +0200 Subject: [PATCH 4/7] Undo changes to dev script Script `./dev/setup-fleet-downstream` is no longer needed by multi-cluster end-to-end test cases for the agent's strict TLS mode. --- dev/setup-fleet-downstream | 34 ++++++++------------ e2e/multi-cluster/installation/agent_test.go | 9 ------ 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/dev/setup-fleet-downstream b/dev/setup-fleet-downstream index db91292965..9a15759e31 100755 --- a/dev/setup-fleet-downstream +++ b/dev/setup-fleet-downstream @@ -3,15 +3,14 @@ set -euxo pipefail -root_dir=$(git rev-parse --show-toplevel) -cd "$root_dir" +if [ ! 
-d ./charts/fleet ]; then + echo "please change the current directory to the fleet repo checkout" + exit 1 +fi upstream_ctx="${FLEET_E2E_CLUSTER-k3d-upstream}" downstream_ctx="${FLEET_E2E_CLUSTER_DOWNSTREAM-k3d-downstream}" ns="${FLEET_E2E_NS_DOWNSTREAM-fleet-local}" -force_empty_ca="${FORCE_EMPTY_AGENT_CA-}" -api_server_url="${FORCE_API_SERVER_URL-}" -agent_tls_mode="${AGENT_TLS_MODE-system-store}" kubectl create ns "$ns"|| true @@ -46,34 +45,27 @@ kubectl wait clusterregistrationtoken -n "$ns" --for=jsonpath='{.status.secretNa token=$(kubectl get secret -n "$ns" second-token -o go-template='{{index .data "values" | base64decode}}' | yq .token -) ca="" -if [ -z $force_empty_ca ]; then - serverver=$(kubectl version -ojson 2> /dev/null | jq '.serverVersion.minor' | sed 's/"//g') - if [ "$serverver" -gt 23 ]; then - ca=$(kubectl get secret -n cattle-fleet-system fleet-controller-bootstrap-token -o go-template='{{index .data "ca.crt" | base64decode}}') - else - name=$(kubectl get -n default sa default -o=jsonpath='{.secrets[0].name}') - ca=$(kubectl get -n default secret "$name" -o go-template='{{index .data "ca.crt" | base64decode}}') - fi +serverver=$(kubectl version -ojson 2> /dev/null | jq '.serverVersion.minor' | sed 's/"//g') +if [ "$serverver" -gt 23 ]; then + ca=$(kubectl get secret -n cattle-fleet-system fleet-controller-bootstrap-token -o go-template='{{index .data "ca.crt" | base64decode}}') +else + name=$(kubectl get -n default sa default -o=jsonpath='{.secrets[0].name}') + ca=$(kubectl get -n default secret "$name" -o go-template='{{index .data "ca.crt" | base64decode}}') fi # docker network inspect bridge -f '{{(index .IPAM.Config 0).Gateway}}' # public_hostname="${public_hostname-172.17.0.1.sslip.io}" -# works due to same network of k3d clusters and patched SAN cert +# works due to same network of k3d clustres and patched SAN cert public_hostname="${public_hostname-k3d-upstream-server-0}" -if [ -z $api_server_url ]; then - api_server_url="https://$public_hostname:6443" -fi - kubectl config use-context "$downstream_ctx" helm -n cattle-fleet-system upgrade --install --create-namespace --wait fleet-agent charts/fleet-agent \ --set-string labels.env=test \ --set apiServerCA="$ca" \ - --set apiServerURL="$api_server_url" \ + --set apiServerURL="https://$public_hostname:6443" \ --set clusterNamespace="$ns" \ - --set token="$token" \ - --set agentTLSMode="$agent_tls_mode" + --set token="$token" #--set systemRegistrationNamespace="fleet-clusters-system" \ #--set clientID="fake-random" \ # --set global.cattle.systemDefaultRegistry=public.ecr.aws/b3e3i8k2 \ diff --git a/e2e/multi-cluster/installation/agent_test.go b/e2e/multi-cluster/installation/agent_test.go index a342145966..547a16ec0b 100644 --- a/e2e/multi-cluster/installation/agent_test.go +++ b/e2e/multi-cluster/installation/agent_test.go @@ -2,7 +2,6 @@ package installation_test import ( "fmt" - "os" "os/exec" . 
"github.com/onsi/ginkgo/v2" @@ -37,14 +36,6 @@ var _ = Describe("Fleet installation with TLS agent modes", func() { deleteOut, err := kd.Delete("ns", "cattle-fleet-system", "--now") Expect(err).ToNot(HaveOccurred(), deleteOut) - err = os.Setenv("FORCE_EMPTY_AGENT_CA", "yes") - Expect(err).ToNot(HaveOccurred()) - err = os.Setenv("FORCE_API_SERVER_URL", "https://google.com") - Expect(err).ToNot(HaveOccurred()) - - err = os.Setenv("AGENT_TLS_MODE", agentMode) - Expect(err).ToNot(HaveOccurred()) - go func() { cmd := exec.Command( "helm", From a6213928936767bfe6f17b9f3fea490383a87b5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Fri, 14 Jun 2024 13:50:12 +0200 Subject: [PATCH 5/7] Redeploy Fleet agent when TLS mode setting changes This commit takes care of watching the agent TLS mode setting in the `fleet-controller` config map, and of redeploying the Fleet agent to upstream and downstream clusters when that setting changes. Note that this only works for downstream clusters registered through a manager-initiated process [1]. Testing this is done by reusing existing agent TLS mode test cases, and triggering new deployments of the Fleet agent by patching the `fleet-controller` config map. Requirements for this include a cluster registered in manager-initiated mode, while existing multi-cluster end-to-end tests need a downstream cluster registered in agent-initiated mode. Therefore, this commit also adds a new downstream cluster to the multi-cluster CI workflow, which is so far only used for agent TLS mode tests. [1]: https://fleet.rancher.io/cluster-registration#manager-initiated --- .github/scripts/deploy-fleet.sh | 6 ++ .github/workflows/e2e-ci.yml | 4 +- .github/workflows/e2e-multicluster-ci.yml | 79 ++++++++++++++++++- .github/workflows/nightly-ci.yml | 4 +- charts/fleet-crd/templates/crds.yaml | 14 ++++ e2e/multi-cluster/installation/agent_test.go | 50 +++--------- .../controllers/cluster/import.go | 9 ++- .../fleet.cattle.io/v1alpha1/cluster_types.go | 7 ++ 8 files changed, 127 insertions(+), 46 deletions(-) diff --git a/.github/scripts/deploy-fleet.sh b/.github/scripts/deploy-fleet.sh index 22cb640592..7e8b77052a 100755 --- a/.github/scripts/deploy-fleet.sh +++ b/.github/scripts/deploy-fleet.sh @@ -24,6 +24,10 @@ else agentTag="dev" fi +host=$(kubectl get node k3d-upstream-server-0 -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}') +ca=$( kubectl config view --flatten -o jsonpath='{.clusters[?(@.name == "k3d-upstream")].cluster.certificate-authority-data}' | base64 -d ) +server="https://$host:6443" + eventually helm upgrade --install fleet-crd charts/fleet-crd \ --atomic \ -n cattle-fleet-system \ @@ -37,6 +41,8 @@ eventually helm upgrade --install fleet charts/fleet \ --set agentImage.repository="$agentRepo" \ --set agentImage.tag="$agentTag" \ --set agentImage.imagePullPolicy=IfNotPresent \ + --set apiServerCA="$ca" \ + --set apiServerURL="$server" \ --set shards="{$shards}" \ --set debug=true --set debugLevel=1 diff --git a/.github/workflows/e2e-ci.yml b/.github/workflows/e2e-ci.yml index 3d0b18cf56..9c9b992f0b 100644 --- a/.github/workflows/e2e-ci.yml +++ b/.github/workflows/e2e-ci.yml @@ -63,7 +63,7 @@ jobs: # k3d will automatically create a network named k3d-test-cluster-1 with the range 172.18.0.0/16 with: k3d-version: ${{ env.SETUP_K3D_VERSION }} - cluster-name: "k3s-default" + cluster-name: "upstream" args: >- --agents 1 --network "nw01" @@ -71,7 +71,7 @@ jobs: - name: Import Images Into k3d run: | - ./.github/scripts/k3d-import-retry.sh 
rancher/fleet:dev rancher/fleet-agent:dev nginx-git:test + ./.github/scripts/k3d-import-retry.sh rancher/fleet:dev rancher/fleet-agent:dev nginx-git:test -c upstream - name: Set Up Tmate Debug Session if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.enable_tmate == 'true' }} diff --git a/.github/workflows/e2e-multicluster-ci.yml b/.github/workflows/e2e-multicluster-ci.yml index ee4ded3427..b56357f82a 100644 --- a/.github/workflows/e2e-multicluster-ci.yml +++ b/.github/workflows/e2e-multicluster-ci.yml @@ -58,7 +58,7 @@ jobs: --agents 1 --network "nw01" - - name: Provision k3d Downstream Cluster + name: Provision k3d Downstream Cluster for agent-initiated registration uses: AbsaOSS/k3d-action@v2 with: k3d-version: ${{ env.SETUP_K3D_VERSION }} @@ -69,11 +69,24 @@ jobs: --api-port 6644 --agents 1 --network "nw01" + - + name: Provision k3d Downstream Cluster for manager-initiated registration + uses: AbsaOSS/k3d-action@v2 + with: + k3d-version: ${{ env.SETUP_K3D_VERSION }} + cluster-name: "managed-downstream" + args: >- + -p "82:80@agent:0:direct" + -p "445:443@agent:0:direct" + --api-port 6645 + --agents 1 + --network "nw01" - name: Import Images Into k3d run: | ./.github/scripts/k3d-import-retry.sh rancher/fleet:dev rancher/fleet-agent:dev -c upstream ./.github/scripts/k3d-import-retry.sh rancher/fleet-agent:dev -c downstream + ./.github/scripts/k3d-import-retry.sh rancher/fleet-agent:dev -c managed-downstream - name: Set Up Tmate Debug Session if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.enable_tmate == 'true' }} @@ -127,6 +140,61 @@ jobs: while [ $(kubectl -n fleet-default get cluster -o jsonpath='{.items[0].status.summary.ready}') -ne 1 ]; do sleep 1 done + + - + name: Deploy and Register Managed Downstream Fleet + run: | + kubectl config use-context k3d-managed-downstream + host=$(kubectl get node k3d-managed-downstream-server-0 -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}') + ca=$( kubectl config view --flatten -o jsonpath='{.clusters[?(@.name == "k3d-managed-downstream")].cluster.certificate-authority-data}' ) + client_cert=$( kubectl config view --flatten -o jsonpath='{.users[?(@.name == "admin@k3d-managed-downstream")].user.client-certificate-data}' ) + token=$( kubectl config view --flatten -o jsonpath='{.users[?(@.name == "admin@k3d-managed-downstream")].user.client-key-data}' ) + server="https://$host:6443" + + kubectl config use-context k3d-upstream + + value=$(cat <- --agents 1 --network "nw01" @@ -72,7 +72,7 @@ jobs: - name: Import Images Into k3d run: | - ./.github/scripts/k3d-import-retry.sh rancher/fleet:dev rancher/fleet-agent:dev nginx-git:test + ./.github/scripts/k3d-import-retry.sh rancher/fleet:dev rancher/fleet-agent:dev nginx-git:test -c upstream - name: Set Up Tmate Debug Session if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.enable_tmate == 'true' }} diff --git a/charts/fleet-crd/templates/crds.yaml b/charts/fleet-crd/templates/crds.yaml index b01c81f424..95210ebc52 100644 --- a/charts/fleet-crd/templates/crds.yaml +++ b/charts/fleet-crd/templates/crds.yaml @@ -5173,6 +5173,20 @@ spec: used to detect changes.' nullable: true type: string + agentTLSMode: + description: 'AgentTLSMode supports two values: `system-store` and + `strict`. If set to + + `system-store`, instructs the agent to trust CA bundles from the + operating + + system''s store. 
If set to `strict`, then the agent shall only + connect to a + + server which uses the exact CA configured when creating/updating + the agent.' + nullable: true + type: string agentTolerationsHash: description: 'AgentTolerationsHash is a hash of the agent''s tolerations diff --git a/e2e/multi-cluster/installation/agent_test.go b/e2e/multi-cluster/installation/agent_test.go index 547a16ec0b..831690b9a2 100644 --- a/e2e/multi-cluster/installation/agent_test.go +++ b/e2e/multi-cluster/installation/agent_test.go @@ -2,7 +2,6 @@ package installation_test import ( "fmt" - "os/exec" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -13,57 +12,30 @@ import ( var ( agentMode string kd kubectl.Command + ku kubectl.Command ) var _ = Describe("Fleet installation with TLS agent modes", func() { BeforeEach(func() { kd = env.Kubectl.Context(env.Downstream) + ku = env.Kubectl.Context(env.Upstream) }) JustBeforeEach(func() { - cmd := exec.Command( - "helm", - "--kube-context", - "k3d-downstream", - "uninstall", - "fleet-agent", + out, err := ku.Patch( + "configmap", + "fleet-controller", "-n", "cattle-fleet-system", + "--type=merge", + "-p", + fmt.Sprintf( + `{"data":{"config":"{\"apiServerURL\": \"https://google.com\", \"apiServerCA\": \"\", \"agentTLSMode\": \"%s\"}"}}`, + agentMode, + ), ) - out, err := cmd.CombinedOutput() Expect(err).ToNot(HaveOccurred(), string(out)) - deleteOut, err := kd.Delete("ns", "cattle-fleet-system", "--now") - Expect(err).ToNot(HaveOccurred(), deleteOut) - - go func() { - cmd := exec.Command( - "helm", - "--kube-context", - "k3d-downstream", - "-n", - "cattle-fleet-system", - "upgrade", - "--install", - "--create-namespace", - "--wait", - "fleet-agent", - "../../../charts/fleet-agent", - "--set-string", - "labels.env=test", - "--set", - `apiServerCA=`, - "--set", - `apiServerURL=https://google.com`, - "--set", - "clusterNamespace=fleet-default", - "--set", - "token=foo", // we don't need a correct token for this. 
- "--set", - fmt.Sprintf("agentTLSMode=%s", agentMode), - ) - _ = cmd.Run() - }() }) Context("with non-strict agent TLS mode", func() { diff --git a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go index b907c2fc44..68a030e1f6 100644 --- a/internal/cmd/controller/agentmanagement/controllers/cluster/import.go +++ b/internal/cmd/controller/agentmanagement/controllers/cluster/import.go @@ -92,7 +92,12 @@ func (i *importHandler) onConfig(config *config.Config) error { if cluster.Spec.KubeConfigSecret == "" { continue } - if config.APIServerURL != cluster.Status.APIServerURL || hashStatusField(config.APIServerCA) != cluster.Status.APIServerCAHash { + + hasConfigChanged := config.APIServerURL != cluster.Status.APIServerURL || + hashStatusField(config.APIServerCA) != cluster.Status.APIServerCAHash || + config.AgentTLSMode != cluster.Status.AgentTLSMode + + if hasConfigChanged { logrus.Infof("API server config changed, trigger cluster import for cluster %s/%s", cluster.Namespace, cluster.Name) c := cluster.DeepCopy() c.Status.AgentConfigChanged = true @@ -388,6 +393,8 @@ func (i *importHandler) importCluster(cluster *fleet.Cluster, status fleet.Clust status.AgentConfigChanged = false status.APIServerURL = apiServerURL status.APIServerCAHash = hashStatusField(apiServerCA) + status.AgentTLSMode = cfg.AgentTLSMode + return status, nil } diff --git a/pkg/apis/fleet.cattle.io/v1alpha1/cluster_types.go b/pkg/apis/fleet.cattle.io/v1alpha1/cluster_types.go index 46bfde6ea4..1806c533dd 100644 --- a/pkg/apis/fleet.cattle.io/v1alpha1/cluster_types.go +++ b/pkg/apis/fleet.cattle.io/v1alpha1/cluster_types.go @@ -190,6 +190,13 @@ type ClusterStatus struct { // +nullable APIServerCAHash string `json:"apiServerCAHash,omitempty"` + // AgentTLSMode supports two values: `system-store` and `strict`. If set to + // `system-store`, instructs the agent to trust CA bundles from the operating + // system's store. If set to `strict`, then the agent shall only connect to a + // server which uses the exact CA configured when creating/updating the agent. + // +nullable + AgentTLSMode string `json:"agentTLSMode,omitempty"` + // Display contains the number of ready bundles, nodes and a summary state. Display ClusterDisplay `json:"display,omitempty"` // AgentStatus contains information about the agent. From 4242fd973a99aac77350ff8b14d1fd288733453a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Fri, 14 Jun 2024 17:50:09 +0200 Subject: [PATCH 6/7] Restore initial configmap contents after install tests This implements cleanup, enabling agent TLS mode tests to be run multiple times and not necessarily after all other tests. 
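Conceptually, the suite now performs the equivalent of the commands below against the
upstream cluster; the Go implementation escapes quotes and newlines in the saved value
itself, whereas this sketch leans on jq for the JSON encoding:

    # BeforeSuite: save the current fleet-controller configuration
    config=$(kubectl -n cattle-fleet-system get configmap fleet-controller \
      -o jsonpath='{.data.config}')

    # AfterSuite: restore it once the agent TLS mode tests are done
    kubectl -n cattle-fleet-system patch configmap fleet-controller --type=merge \
      -p "$(jq -n --arg c "$config" '{data: {config: $c}}')"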
--- e2e/multi-cluster/installation/agent_test.go | 2 -- e2e/multi-cluster/installation/suite_test.go | 35 +++++++++++++++++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/e2e/multi-cluster/installation/agent_test.go b/e2e/multi-cluster/installation/agent_test.go index 831690b9a2..5bea51fdd2 100644 --- a/e2e/multi-cluster/installation/agent_test.go +++ b/e2e/multi-cluster/installation/agent_test.go @@ -12,13 +12,11 @@ import ( var ( agentMode string kd kubectl.Command - ku kubectl.Command ) var _ = Describe("Fleet installation with TLS agent modes", func() { BeforeEach(func() { kd = env.Kubectl.Context(env.Downstream) - ku = env.Kubectl.Context(env.Upstream) }) JustBeforeEach(func() { diff --git a/e2e/multi-cluster/installation/suite_test.go b/e2e/multi-cluster/installation/suite_test.go index 2e8d083fd5..003a08fec5 100644 --- a/e2e/multi-cluster/installation/suite_test.go +++ b/e2e/multi-cluster/installation/suite_test.go @@ -3,9 +3,12 @@ package installation_test import ( + "fmt" + "strings" "testing" "github.com/rancher/fleet/e2e/testenv" + "github.com/rancher/fleet/e2e/testenv/kubectl" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -17,7 +20,9 @@ func TestE2E(t *testing.T) { } var ( - env *testenv.Env + env *testenv.Env + ku kubectl.Command + config string ) var _ = BeforeSuite(func() { @@ -25,4 +30,32 @@ var _ = BeforeSuite(func() { testenv.SetRoot("../..") env = testenv.New() + ku = env.Kubectl.Context(env.Upstream) + + // Save initial state of `fleet-controller` config map + cfg, err := ku.Get( + "configmap", + "fleet-controller", + "-n", + "cattle-fleet-system", + "-o", + "jsonpath={.data.config}") + Expect(err).ToNot(HaveOccurred(), cfg) + + cfg = strings.ReplaceAll(cfg, `"`, `\"`) + config = strings.ReplaceAll(cfg, "\n", "") +}) + +var _ = AfterSuite(func() { + // Restore initial state of config map + out, err := ku.Patch( + "configmap", + "fleet-controller", + "-n", + "cattle-fleet-system", + "--type=merge", + "-p", + fmt.Sprintf(`{"data":{"config":"%s"}}`, config), + ) + Expect(err).ToNot(HaveOccurred(), string(out)) }) From ee12a8b513fdd4f30eace85eeb37f7bb19949e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Corentin=20N=C3=A9au?= Date: Wed, 26 Jun 2024 09:31:16 +0200 Subject: [PATCH 7/7] Improve docs and dummy paths for bypassing system cert store This documents where environment variables used to bypass the store come from, and sets them to `/dev/null` to make the absence of usable values/cert files more explicit. --- internal/cmd/agent/register/register.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/internal/cmd/agent/register/register.go b/internal/cmd/agent/register/register.go index dd23ddeb5e..666c15465c 100644 --- a/internal/cmd/agent/register/register.go +++ b/internal/cmd/agent/register/register.go @@ -330,13 +330,15 @@ func createClientConfigFromSecret(secret *corev1.Secret, trustSystemStoreCAs boo apiServerCA = nil } } else { - // Bypass the OS trust store. - err := os.Setenv("SSL_CERT_FILE", "/var/does-not-exist.pem") + // Bypass the OS trust store through env vars, see https://pkg.go.dev/crypto/x509#SystemCertPool + // We set values to paths belonging to the root filesystem, which is read-only, to prevent tampering. + // Note: this will not work on Windows nor Mac OS. Agent are expected to run on Linux nodes. 
+ err := os.Setenv("SSL_CERT_FILE", "/dev/null") if err != nil { logrus.Errorf("failed to set env var SSL_CERT_FILE: %s", err.Error()) } - err = os.Setenv("SSL_CERT_DIR", "/var/does-not-exist-either") + err = os.Setenv("SSL_CERT_DIR", "/dev/null") if err != nil { logrus.Errorf("failed to set env var SSL_CERT_DIR: %s", err.Error()) }
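A quick manual check of either mode on a downstream cluster mirrors what the e2e
assertions above grep for in the registration logs (label, container name and log
messages are taken from the tests; adjust the kube context to your environment):

    # strict: registration must fail during certificate validation
    kubectl --context k3d-downstream -n cattle-fleet-system logs -l app=fleet-agent \
      -c fleet-agent-register --tail=-1 | grep 'signed by unknown authority'

    # system-store: the server is reached, and registration fails later for other reasons
    kubectl --context k3d-downstream -n cattle-fleet-system logs -l app=fleet-agent \
      -c fleet-agent-register --tail=-1 | grep 'could not find the requested resource'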