From dbead63320008174135a9e87258330cf058ec89d Mon Sep 17 00:00:00 2001 From: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:59:11 +0200 Subject: [PATCH] Metrics collector refactor (#1504) * Add resources for etcd Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * init metrics collector unit tests Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * fix addon tests and mc cronjob Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * add status update Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * set privileged to cronjob, add retry on update Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * remove prom var Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * cleaning Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * clean diffs Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * post rebase fix Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * post rebase fixes Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> --------- Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> --- .../observabilityendpoint/match_evaluator.go | 36 - .../metrics_collector.go | 903 --------------- .../metrics_collector_test.go | 245 ---- .../observabilityaddon_controller.go | 246 ++-- ...bilityaddon_controller_integration_test.go | 206 +++- .../observabilityaddon_controller_test.go | 175 +-- .../ocp_monitoring_config.go | 83 +- .../ocp_monitoring_config_test.go | 21 +- .../predicate_func_test.go | 6 +- .../controllers/status/status_controller.go | 21 +- operators/endpointmetrics/main.go | 29 +- .../pkg/collector/match_evaluator.go | 26 + .../collector}/match_evaluator_test.go | 22 +- .../pkg/collector/metrics_collector.go | 1032 +++++++++++++++++ .../pkg/collector/metrics_collector_test.go | 356 ++++++ .../pkg/hypershift/hypershift.go | 16 + .../endpointmetrics/pkg/rendering/renderer.go | 33 +- .../pkg/rendering/renderer_test.go | 17 +- .../pkg/{util => status}/status.go | 4 +- .../pkg/{util => status}/status_test.go | 42 +- operators/endpointmetrics/pkg/util/error.go | 28 + .../controllers/placementrule/manifestwork.go | 2 + operators/pkg/config/config.go | 6 + 23 files changed, 1989 insertions(+), 1566 deletions(-) delete mode 100644 operators/endpointmetrics/controllers/observabilityendpoint/match_evaluator.go delete mode 100644 operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector.go delete mode 100644 operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector_test.go create mode 100644 operators/endpointmetrics/pkg/collector/match_evaluator.go rename operators/endpointmetrics/{controllers/observabilityendpoint => pkg/collector}/match_evaluator_test.go (79%) create mode 100644 operators/endpointmetrics/pkg/collector/metrics_collector.go create mode 100644 operators/endpointmetrics/pkg/collector/metrics_collector_test.go rename operators/endpointmetrics/pkg/{util => status}/status.go (98%) rename operators/endpointmetrics/pkg/{util => status}/status_test.go (88%) create mode 100644 operators/endpointmetrics/pkg/util/error.go diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/match_evaluator.go b/operators/endpointmetrics/controllers/observabilityendpoint/match_evaluator.go deleted file mode 100644 index 
ff0ab5e00..000000000 --- a/operators/endpointmetrics/controllers/observabilityendpoint/match_evaluator.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Red Hat, Inc. -// Copyright Contributors to the Open Cluster Management project -// Licensed under the Apache License 2.0 - -package observabilityendpoint - -import ( - "golang.org/x/exp/slices" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -type evaluateFn func(metav1.LabelSelectorRequirement, ...interface{}) bool - -var evaluateFns = map[string]evaluateFn{ - "clusterType": evaluateClusterType, -} - -func evluateMatchExpression(expr metav1.LabelSelectorRequirement, params ...interface{}) bool { - if _, ok := evaluateFns[expr.Key]; !ok { - // return false if expr.key not defined - return false - } - return evaluateFns[expr.Key](expr, params...) -} - -func evaluateClusterType(expr metav1.LabelSelectorRequirement, params ...interface{}) bool { - switch expr.Operator { - case metav1.LabelSelectorOpIn: - return slices.Contains(expr.Values, params[1].(string)) - case metav1.LabelSelectorOpNotIn: - return !slices.Contains(expr.Values, params[1].(string)) - default: - // return false for unsupported/invalid operator - return false - } -} diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector.go b/operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector.go deleted file mode 100644 index f5156975a..000000000 --- a/operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector.go +++ /dev/null @@ -1,903 +0,0 @@ -// Copyright (c) Red Hat, Inc. -// Copyright Contributors to the Open Cluster Management project -// Licensed under the Apache License 2.0 - -package observabilityendpoint - -import ( - "context" - "fmt" - "regexp" - "sort" - "strconv" - "strings" - "time" - - monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "gopkg.in/yaml.v2" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/util/retry" - "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/openshift" - "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/rendering" - oashared "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/shared" - operatorconfig "github.com/stolostron/multicluster-observability-operator/operators/pkg/config" - "github.com/stolostron/multicluster-observability-operator/operators/pkg/util" -) - -const ( - metricsCollectorName = "metrics-collector-deployment" - uwlMetricsCollectorName = "uwl-metrics-collector-deployment" - metricsCollector = "metrics-collector" - uwlMetricsCollector = "uwl-metrics-collector" - selectorKey = "component" - selectorValue = metricsCollector - caMounthPath = "/etc/serving-certs-ca-bundle" - caVolName = "serving-certs-ca-bundle" - mtlsCertName = "observability-controller-open-cluster-management.io-observability-signer-client-cert" - mtlsCaName = "observability-managed-cluster-certs" - mtlsServerCaName = "observability-server-ca-certs" - limitBytes = 1073741824 - defaultInterval = "30s" - uwlNamespace = "openshift-user-workload-monitoring" - uwlSts = "prometheus-user-workload" -) - -const ( - restartLabel = "cert/time-restarted" -) 
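For readers following the move of match_evaluator.go into pkg/collector: the sketch below is a minimal, standalone rendition of the selector semantics being relocated. It is illustrative, not part of the patch; by the callers' convention, params[0] carries the cluster ID and params[1] the cluster type, and only the "clusterType" key is recognized.

package main

import (
	"fmt"

	"golang.org/x/exp/slices"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// evaluateClusterType mirrors the relocated evaluator: In/NotIn test the
// cluster type carried in params[1]; any other operator evaluates to false.
func evaluateClusterType(expr metav1.LabelSelectorRequirement, params ...interface{}) bool {
	switch expr.Operator {
	case metav1.LabelSelectorOpIn:
		return slices.Contains(expr.Values, params[1].(string))
	case metav1.LabelSelectorOpNotIn:
		return !slices.Contains(expr.Values, params[1].(string))
	default:
		return false
	}
}

func main() {
	// Collect-rule groups guarded by this selector are skipped on SNO clusters.
	expr := metav1.LabelSelectorRequirement{
		Key:      "clusterType",
		Operator: metav1.LabelSelectorOpNotIn,
		Values:   []string{"SNO"},
	}
	fmt.Println(evaluateClusterType(expr, "some-cluster-id", "SNO")) // false: SNO is excluded
	fmt.Println(evaluateClusterType(expr, "some-cluster-id", ""))    // true: default cluster type passes
}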
- -var ( - ocpPromURL = "https://prometheus-k8s.openshift-monitoring.svc:9091" - uwlPromURL = "https://prometheus-user-workload.openshift-user-workload-monitoring.svc:9092" - uwlQueryURL = "https://thanos-querier.openshift-monitoring.svc:9091" - promURL = "https://prometheus-k8s:9091" -) - -type CollectorParams struct { - isUWL bool - clusterID string - clusterType string - obsAddonSpec oashared.ObservabilityAddonSpec - hubInfo operatorconfig.HubInfo - allowlist operatorconfig.MetricsAllowlist - nodeSelector map[string]string - tolerations []corev1.Toleration - httpProxy string - httpsProxy string - noProxy string - CABundle string - replicaCount int32 -} - -func getCommands(params CollectorParams) []string { - interval := fmt.Sprint(params.obsAddonSpec.Interval) + "s" - if fmt.Sprint(params.obsAddonSpec.Interval) == "" { - interval = defaultInterval - } - evaluateInterval := "30s" - if params.obsAddonSpec.Interval < 30 { - evaluateInterval = interval - } - caFile := caMounthPath + "/service-ca.crt" - clusterID := params.clusterID - if params.clusterID == "" { - clusterID = params.hubInfo.ClusterName - // deprecated ca bundle, only used for ocp 3.11 env - caFile = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" - } - commands := []string{ - "/usr/bin/metrics-collector", - "--listen=:8080", - "--from=$(FROM)", - "--from-query=$(FROM_QUERY)", - "--to-upload=$(TO)", - "--to-upload-ca=/tlscerts/ca/ca.crt", - "--to-upload-cert=/tlscerts/certs/tls.crt", - "--to-upload-key=/tlscerts/certs/tls.key", - "--interval=" + interval, - "--evaluate-interval=" + evaluateInterval, - "--limit-bytes=" + strconv.Itoa(limitBytes), - fmt.Sprintf("--label=\"cluster=%s\"", params.hubInfo.ClusterName), - fmt.Sprintf("--label=\"clusterID=%s\"", clusterID), - } - commands = append(commands, "--from-token-file=/var/run/secrets/kubernetes.io/serviceaccount/token") - if !installPrometheus { - commands = append(commands, "--from-ca-file="+caFile) - } - if params.clusterType != defaultClusterType { - commands = append(commands, fmt.Sprintf("--label=\"clusterType=%s\"", params.clusterType)) - } - - dynamicMetricList := map[string]bool{} - for _, group := range params.allowlist.CollectRuleGroupList { - if group.Selector.MatchExpression != nil { - for _, expr := range group.Selector.MatchExpression { - if isHubMetricsCollector { - if !evluateMatchExpression(expr, clusterID, params.clusterType, params.hubInfo, - params.allowlist, params.nodeSelector, params.tolerations, params.replicaCount) { - continue - } - } else if !evluateMatchExpression(expr, clusterID, params.clusterType, params.obsAddonSpec, params.hubInfo, - params.allowlist, params.nodeSelector, params.tolerations, params.replicaCount) { - continue - } - for _, rule := range group.CollectRuleList { - matchList := []string{} - for _, match := range rule.Metrics.MatchList { - matchList = append(matchList, `"`+strings.ReplaceAll(match, `"`, `\"`)+`"`) - if name := getNameInMatch(match); name != "" { - dynamicMetricList[name] = false - } - } - for _, name := range rule.Metrics.NameList { - dynamicMetricList[name] = false - } - matchListStr := "[" + strings.Join(matchList, ",") + "]" - nameListStr := `["` + strings.Join(rule.Metrics.NameList, `","`) + `"]` - commands = append( - commands, - fmt.Sprintf("--collectrule={\"name\":\"%s\",\"expr\":\"%s\",\"for\":\"%s\",\"names\":%v,\"matches\":%v}", - rule.Collect, rule.Expr, rule.For, nameListStr, matchListStr), - ) - } - } - } - } - - for _, metrics := range params.allowlist.NameList { - if _, ok := 
dynamicMetricList[metrics]; !ok { - commands = append(commands, fmt.Sprintf("--match={__name__=\"%s\"}", metrics)) - } - } - for _, match := range params.allowlist.MatchList { - if name := getNameInMatch(match); name != "" { - if _, ok := dynamicMetricList[name]; ok { - continue - } - } - commands = append(commands, fmt.Sprintf("--match={%s}", match)) - } - - renamekeys := make([]string, 0, len(params.allowlist.RenameMap)) - for k := range params.allowlist.RenameMap { - renamekeys = append(renamekeys, k) - } - sort.Strings(renamekeys) - for _, k := range renamekeys { - commands = append(commands, fmt.Sprintf("--rename=\"%s=%s\"", k, params.allowlist.RenameMap[k])) - } - for _, rule := range params.allowlist.RecordingRuleList { - commands = append( - commands, - fmt.Sprintf("--recordingrule={\"name\":\"%s\",\"query\":\"%s\"}", rule.Record, rule.Expr), - ) - } - return commands -} - -func createDeployment(params CollectorParams) *appsv1.Deployment { - secretName := metricsCollector - if params.isUWL { - secretName = uwlMetricsCollector - } - volumes := []corev1.Volume{ - { - Name: "mtlscerts", - VolumeSource: corev1.VolumeSource{ - Secret: &corev1.SecretVolumeSource{ - SecretName: mtlsCertName, - }, - }, - }, - { - Name: "mtlsca", - VolumeSource: corev1.VolumeSource{ - Secret: &corev1.SecretVolumeSource{ - SecretName: mtlsCaName, - }, - }, - }, - } - - if params.clusterType != ocpThreeClusterType { - serviceCAOperatorGenerated := []corev1.Volume{ - { - Name: "secret-kube-rbac-proxy-tls", - VolumeSource: corev1.VolumeSource{ - Secret: &corev1.SecretVolumeSource{ - SecretName: secretName + "-kube-rbac-tls", - }, - }, - }, - { - Name: "secret-kube-rbac-proxy-metric", - VolumeSource: corev1.VolumeSource{ - Secret: &corev1.SecretVolumeSource{ - SecretName: secretName + "-kube-rbac-proxy-metric", - }, - }, - }, - { - Name: "metrics-client-ca", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: secretName + "-clientca-metric", - }, - }, - }, - }, - } - - volumes = append(volumes, serviceCAOperatorGenerated...) 
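// Note on the volumes appended above: the "*-kube-rbac-tls" secret is generated
// by OpenShift's service CA, via the service.beta.openshift.io/serving-cert-secret-name
// annotation that createService sets on the collector Service below; the ocp3
// cluster type skips these kube-rbac-proxy mounts entirely.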
- } - mounts := []corev1.VolumeMount{ - { - Name: "mtlscerts", - MountPath: "/tlscerts/certs", - }, - { - Name: "mtlsca", - MountPath: "/tlscerts/ca", - }, - } - if params.clusterID != "" { - volumes = append(volumes, corev1.Volume{ - Name: caVolName, - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: openshift.CaConfigmapName, - }, - }, - }, - }) - mounts = append(mounts, corev1.VolumeMount{ - Name: caVolName, - MountPath: caMounthPath, - }) - } - - commands := getCommands(params) - - from := promURL - if !installPrometheus { - from = ocpPromURL - if params.isUWL { - from = uwlPromURL - } - } - fromQuery := from - if params.isUWL { - fromQuery = uwlQueryURL - } - name := metricsCollectorName - if params.isUWL { - name = uwlMetricsCollectorName - } - metricsCollectorDep := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Annotations: map[string]string{ - ownerLabelKey: ownerLabelValue, - }, - }, - Spec: appsv1.DeploymentSpec{ - Replicas: int32Ptr(params.replicaCount), - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - selectorKey: secretName, - }, - }, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - ownerLabelKey: ownerLabelValue, - operatorconfig.WorkloadPartitioningPodAnnotationKey: operatorconfig.WorkloadPodExpectedValueJSON, - }, - Labels: map[string]string{ - selectorKey: secretName, - }, - }, - Spec: corev1.PodSpec{ - ServiceAccountName: serviceAccountName, - Containers: []corev1.Container{ - { - Name: metricsCollector, - Image: rendering.Images[operatorconfig.MetricsCollectorKey], - Command: commands, - Env: []corev1.EnvVar{ - { - Name: "FROM", - Value: from, - }, - { - Name: "FROM_QUERY", - Value: fromQuery, - }, - { - Name: "TO", - Value: params.hubInfo.ObservatoriumAPIEndpoint, - }, - }, - VolumeMounts: mounts, - ImagePullPolicy: corev1.PullIfNotPresent, - Ports: []corev1.ContainerPort{ - { - ContainerPort: 8080, - Name: "metrics", - }, - }, - }, - }, - Volumes: volumes, - NodeSelector: params.nodeSelector, - Tolerations: params.tolerations, - }, - }, - }, - } - - if params.httpProxy != "" || params.httpsProxy != "" || params.noProxy != "" { - metricsCollectorDep.Spec.Template.Spec.Containers[0].Env = append(metricsCollectorDep.Spec.Template.Spec.Containers[0].Env, - corev1.EnvVar{ - Name: "HTTP_PROXY", - Value: params.httpProxy, - }, - corev1.EnvVar{ - Name: "HTTPS_PROXY", - Value: params.httpsProxy, - }, - corev1.EnvVar{ - Name: "NO_PROXY", - Value: params.noProxy, - }) - } - if params.httpsProxy != "" && params.CABundle != "" { - metricsCollectorDep.Spec.Template.Spec.Containers[0].Env = append(metricsCollectorDep.Spec.Template.Spec.Containers[0].Env, - corev1.EnvVar{ - Name: "HTTPS_PROXY_CA_BUNDLE", - Value: params.CABundle, - }) - } - - if isHubMetricsCollector { - //to avoid hub metrics collector from sending status - metricsCollectorDep.Spec.Template.Spec.Containers[0].Env = append(metricsCollectorDep.Spec.Template.Spec.Containers[0].Env, - corev1.EnvVar{ - Name: "STANDALONE", - Value: "true", - }) - } - - privileged := false - readOnlyRootFilesystem := true - - metricsCollectorDep.Spec.Template.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{ - Privileged: &privileged, - ReadOnlyRootFilesystem: &readOnlyRootFilesystem, - } - - if params.obsAddonSpec.Resources != nil { - metricsCollectorDep.Spec.Template.Spec.Containers[0].Resources = 
*params.obsAddonSpec.Resources - } - return metricsCollectorDep -} - -func createService(params CollectorParams) *corev1.Service { - name := metricsCollector - if params.isUWL { - name = uwlMetricsCollector - } - return &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Labels: map[string]string{ - selectorKey: name, - }, - Annotations: map[string]string{ - ownerLabelKey: ownerLabelValue, - "service.beta.openshift.io/serving-cert-secret-name": name + "-kube-rbac-tls", - }, - }, - Spec: corev1.ServiceSpec{ - Selector: map[string]string{ - selectorKey: name, - }, - Ports: []corev1.ServicePort{ - { - Name: "metrics", - Port: 8080, - TargetPort: intstr.FromString("metrics"), - }, - }, - Type: corev1.ServiceTypeClusterIP, - }, - } -} - -func createAlertingRule(params CollectorParams) *monitoringv1.PrometheusRule { - name := metricsCollector - alert := "MetricsCollector" - replace := "acm_metrics_collector_" - if params.isUWL { - name = uwlMetricsCollector - alert = "UWLMetricsCollector" - replace = "acm_uwl_metrics_collector_" - } - - return &monitoringv1.PrometheusRule{ - ObjectMeta: metav1.ObjectMeta{ - Name: "acm-" + name + "-alerting-rules", - Namespace: namespace, - }, - Spec: monitoringv1.PrometheusRuleSpec{ - Groups: []monitoringv1.RuleGroup{ - { - Name: name + "-rules", - Rules: []monitoringv1.Rule{ - { - Alert: "ACM" + alert + "FederationError", - Annotations: map[string]string{ - "summary": "Error federating from in-cluster Prometheus.", - "description": "There are errors when federating from platform Prometheus", - }, - Expr: intstr.FromString(`(sum by (status_code, type) (rate(` + replace + `federate_requests_total{status_code!~"2.*"}[10m]))) > 10`), - For: "10m", - Labels: map[string]string{ - "severity": "critical", - }, - }, - { - Alert: "ACM" + alert + "ForwardRemoteWriteError", - Annotations: map[string]string{ - "summary": "Error forwarding to Hub Thanos.", - "description": "There are errors when remote writing to Hub hub Thanos", - }, - Expr: intstr.FromString(`(sum by (status_code, type) (rate(` + replace + `forward_write_requests_total{status_code!~"2.*"}[10m]))) > 10`), - For: "10m", - Labels: map[string]string{ - "severity": "critical", - }, - }, - }, - }, - }, - }, - } -} - -// createServiceMonitor creates a ServiceMonitor for the metrics collector. 
-func createServiceMonitor(params CollectorParams) *monitoringv1.ServiceMonitor { - name := metricsCollector - replace := "acm_metrics_collector_${1}" - if params.isUWL { - name = uwlMetricsCollector - replace = "acm_uwl_metrics_collector_${1}" - } - - return &monitoringv1.ServiceMonitor{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Labels: map[string]string{ - selectorKey: name, - }, - Annotations: map[string]string{ - ownerLabelKey: ownerLabelValue, - }, - }, - Spec: monitoringv1.ServiceMonitorSpec{ - Selector: metav1.LabelSelector{ - MatchLabels: map[string]string{ - selectorKey: name, - }, - }, - NamespaceSelector: monitoringv1.NamespaceSelector{ - MatchNames: []string{namespace}, - }, - Endpoints: []monitoringv1.Endpoint{ - { - Port: "metrics", - Path: "/metrics", - Scheme: "http", - MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ - { - Action: "replace", - Regex: "(.+)", - Replacement: replace, - SourceLabels: []string{"__name__"}, - TargetLabel: "__name__", - }, - }, - }, - }, - }, - } -} - -func updateMetricsCollectors(ctx context.Context, c client.Client, obsAddonSpec oashared.ObservabilityAddonSpec, - hubInfo operatorconfig.HubInfo, clusterID string, clusterType string, - replicaCount int32, forceRestart bool) (bool, error) { - - list, uwlList, err := getMetricsAllowlist(ctx, c, clusterType) - if err != nil { - return false, err - } - endpointDeployment := getEndpointDeployment(ctx, c) - params := CollectorParams{ - isUWL: false, - clusterID: clusterID, - clusterType: clusterType, - obsAddonSpec: obsAddonSpec, - hubInfo: hubInfo, - allowlist: list, - replicaCount: replicaCount, - nodeSelector: endpointDeployment.Spec.Template.Spec.NodeSelector, - tolerations: endpointDeployment.Spec.Template.Spec.Tolerations, - } - - // stash away proxy settings from endpoint deployment - for _, container := range endpointDeployment.Spec.Template.Spec.Containers { - if container.Name == "endpoint-observability-operator" { - for _, env := range container.Env { - if env.Name == "HTTP_PROXY" { - params.httpProxy = env.Value - } else if env.Name == "HTTPS_PROXY" { - params.httpsProxy = env.Value - } else if env.Name == "NO_PROXY" { - params.noProxy = env.Value - } else if env.Name == "HTTPS_PROXY_CA_BUNDLE" { - params.CABundle = env.Value - } - } - } - } - - result, err := updateMetricsCollector(ctx, c, params, forceRestart) - if err != nil || !result { - return result, err - } - isUwl, err := isUWLMonitoringEnabled(ctx, c) - if err != nil { - return result, err - } - if isUwl && len(uwlList.NameList) != 0 { - params.isUWL = true - params.allowlist = uwlList - result, err = updateMetricsCollector(ctx, c, params, forceRestart) - } else { - err = deleteMetricsCollector(ctx, c, uwlMetricsCollectorName) - if err != nil { - return false, err - } - } - return result, err -} - -func updateMetricsCollector(ctx context.Context, c client.Client, params CollectorParams, - forceRestart bool) (bool, error) { - name := metricsCollectorName - resourceName := metricsCollector - if params.isUWL { - resourceName = uwlMetricsCollector - name = uwlMetricsCollectorName - } - - desiredService := createService(params) - retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - foundService := &corev1.Service{} - err := c.Get(ctx, types.NamespacedName{Name: metricsCollector, Namespace: namespace}, foundService) - if err != nil && errors.IsNotFound(err) { - log.Info("Creating Service", "name", metricsCollector, "namespace", namespace) - if err := c.Create(ctx, desiredService); 
err != nil { - return fmt.Errorf("failed to create service %s/%s: %w", namespace, metricsCollector, err) - } - - return nil - } else if err != nil { - return fmt.Errorf("failed to get service %s/%s: %w", namespace, metricsCollector, err) - } - - if !equality.Semantic.DeepDerivative(desiredService.Spec, foundService.Spec) { - log.Info("Updating Service", "name", metricsCollector, "namespace", namespace) - - foundService.Spec = desiredService.Spec - if err := c.Update(ctx, foundService); err != nil { - return fmt.Errorf("failed to update service %s/%s: %w", namespace, metricsCollector, err) - } - } - - return nil - }) - - if retryErr != nil { - return false, retryErr - } - - desiredSm := createServiceMonitor(params) - retryErr = retry.RetryOnConflict(retry.DefaultBackoff, func() error { - foundSm := &monitoringv1.ServiceMonitor{} - err := c.Get(ctx, types.NamespacedName{Name: resourceName, Namespace: namespace}, foundSm) - if err != nil && errors.IsNotFound(err) { - log.Info("Creating ServiceMonitor", "name", resourceName, "namespace", namespace) - if err := c.Create(ctx, desiredSm); err != nil { - return fmt.Errorf("failed to create ServiceMonitor %s/%s: %w", namespace, resourceName, err) - } - - return nil - } else if err != nil { - return fmt.Errorf("failed to get ServiceMonitor %s/%s: %w", namespace, resourceName, err) - } - - if !equality.Semantic.DeepDerivative(desiredSm.Spec, foundSm.Spec) { - log.Info("Updating ServiceMonitor", "name", resourceName, "namespace", namespace) - - foundSm.Spec = desiredSm.Spec - if err := c.Update(ctx, foundSm); err != nil { - return fmt.Errorf("failed to update ServiceMonitor %s/%s: %w", namespace, resourceName, err) - } - } - - return nil - }) - - if retryErr != nil { - return false, retryErr - } - - promRuleName := "acm-" + resourceName + "-alerting-rules" - desiredPromRule := createAlertingRule(params) - retryErr = retry.RetryOnConflict(retry.DefaultBackoff, func() error { - foundPromRule := &monitoringv1.PrometheusRule{} - err := c.Get(ctx, types.NamespacedName{Name: promRuleName, Namespace: namespace}, foundPromRule) - if err != nil && errors.IsNotFound(err) { - log.Info("Creating PrometheusRule", "name", promRuleName, "namespace", namespace) - if err := c.Create(ctx, desiredPromRule); err != nil { - return fmt.Errorf("failed to create PrometheusRule %s/%s: %w", namespace, promRuleName, err) - } - - return nil - } else if err != nil { - return fmt.Errorf("failed to get PrometheusRule %s/%s: %w", namespace, promRuleName, err) - } - - if !equality.Semantic.DeepDerivative(desiredPromRule.Spec, foundPromRule.Spec) { - log.Info("Updating PrometheusRule", "name", promRuleName, "namespace", namespace) - - foundPromRule.Spec = desiredPromRule.Spec - if err := c.Update(ctx, foundPromRule); err != nil { - return fmt.Errorf("failed to update PrometheusRule %s/%s: %w", namespace, promRuleName, err) - } - } - - return nil - }) - - if retryErr != nil { - return false, retryErr - } - - desiredMetricsCollectorDep := createDeployment(params) - retryErr = retry.RetryOnConflict(retry.DefaultBackoff, func() error { - foundMetricsCollectorDep := &appsv1.Deployment{} - err := c.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, foundMetricsCollectorDep) - if err != nil && errors.IsNotFound(err) { - log.Info("Creating Deployment", "name", name, "namespace", namespace) - if err := c.Create(ctx, desiredMetricsCollectorDep); err != nil { - return fmt.Errorf("failed to create Deployment %s/%s: %w", namespace, name, err) - } - } else if err != nil { - return 
fmt.Errorf("failed to get Deployment %s/%s: %w", namespace, name, err) - } - - isDifferentSpec := !equality.Semantic.DeepDerivative(desiredMetricsCollectorDep.Spec.Template.Spec, foundMetricsCollectorDep.Spec.Template.Spec) - isDifferentReplicas := !equality.Semantic.DeepEqual(desiredMetricsCollectorDep.Spec.Replicas, foundMetricsCollectorDep.Spec.Replicas) - if isDifferentSpec || isDifferentReplicas || forceRestart { - log.Info("Updating Deployment", "name", name, "namespace", namespace, "isDifferentSpec", isDifferentSpec, "isDifferentReplicas", isDifferentReplicas, "forceRestart", forceRestart) - if forceRestart && foundMetricsCollectorDep.Status.ReadyReplicas != 0 { - desiredMetricsCollectorDep.Spec.Template.ObjectMeta.Labels[restartLabel] = time.Now().Format("2006-1-2.1504") - } - - desiredMetricsCollectorDep.ResourceVersion = foundMetricsCollectorDep.ResourceVersion - - if err := c.Update(ctx, desiredMetricsCollectorDep); err != nil { - return fmt.Errorf("failed to update Deployment %s/%s: %w", namespace, name, err) - } - - return nil - } - - return nil - }) - - if retryErr != nil { - return false, retryErr - } - - return true, nil -} - -func deleteMetricsCollector(ctx context.Context, c client.Client, name string) error { - found := &appsv1.Deployment{} - err := c.Get(ctx, types.NamespacedName{Name: name, - Namespace: namespace}, found) - if err != nil { - if errors.IsNotFound(err) { - log.Info("The metrics collector deployment does not exist", "name", name) - return nil - } - log.Error(err, "Failed to check the metrics collector deployment", "name", name) - return err - } - err = c.Delete(ctx, found) - if err != nil { - log.Error(err, "Failed to delete the metrics collector deployment", "name", name) - return err - } - log.Info("metrics collector deployment deleted", "name", name) - - foundSM := &monitoringv1.ServiceMonitor{} - if err := c.Get(ctx, types.NamespacedName{Name: strings.TrimSuffix(name, "-deployment"), - Namespace: namespace}, foundSM); err != nil { - if errors.IsNotFound(err) { - log.Info("The metrics collector servicemonitor does not exist", "name", strings.TrimSuffix(name, "-deployment")) - return nil - } - log.Error(err, "Failed to check the metrics collector servicemonitor", "name", strings.TrimSuffix(name, "-deployment")) - return err - } - if err := c.Delete(ctx, foundSM); err != nil { - log.Error(err, "Failed to delete the metrics collector servicemonitor", "name", strings.TrimSuffix(name, "-deployment")) - return err - } - log.Info("metrics collector servicemonitor deleted", "name", strings.TrimSuffix(name, "-deployment")) - - foundAlerts := &monitoringv1.PrometheusRule{} - if err := c.Get(ctx, types.NamespacedName{Name: "acm-" + strings.TrimSuffix(name, "-deployment") + "-alerting-rules", - Namespace: namespace}, foundAlerts); err != nil { - if errors.IsNotFound(err) { - log.Info("The metrics collector alerting rules does not exist", "name", "acm-"+strings.TrimSuffix(name, "-deployment")+"-alerting-rules") - return nil - } - log.Error(err, "Failed to check the metrics collector alerting rules", "name", "acm-"+strings.TrimSuffix(name, "-deployment")+"-alerting-rules") - return err - } - if err := c.Delete(ctx, foundAlerts); err != nil { - log.Error(err, "Failed to delete the metrics collector alerting rules", "name", "acm-"+strings.TrimSuffix(name, "-deployment")+"-alerting-rules") - return err - } - log.Info("metrics collector alerting rules deleted", "name", "acm-"+strings.TrimSuffix(name, "-deployment")+"-alerting-rules") - - foundService := 
&corev1.Service{} - if err := c.Get(ctx, types.NamespacedName{Name: strings.TrimSuffix(name, "-deployment"), - Namespace: namespace}, foundService); err != nil { - if errors.IsNotFound(err) { - log.Info("The metrics collector service does not exist", "name", strings.TrimSuffix(name, "-deployment")) - return nil - } - log.Error(err, "Failed to check the metrics collector service", "name", strings.TrimSuffix(name, "-deployment")) - return err - } - if err := c.Delete(ctx, foundService); err != nil { - log.Error(err, "Failed to delete the metrics collector service", "name", strings.TrimSuffix(name, "-deployment")) - return err - } - log.Info("metrics collector service deleted", "name", strings.TrimSuffix(name, "-deployment")) - - return nil -} - -func int32Ptr(i int32) *int32 { return &i } - -func getMetricsAllowlist(ctx context.Context, c client.Client, - clusterType string) (operatorconfig.MetricsAllowlist, operatorconfig.MetricsAllowlist, error) { - l := &operatorconfig.MetricsAllowlist{} - ul := &operatorconfig.MetricsAllowlist{} - cm := &corev1.ConfigMap{} - err := c.Get(ctx, types.NamespacedName{Name: operatorconfig.AllowlistConfigMapName, - Namespace: namespace}, cm) - if err != nil { - log.Error(err, "Failed to get configmap") - } else { - if cm.Data != nil { - configmapKey := operatorconfig.MetricsConfigMapKey - if clusterType == ocpThreeClusterType { - configmapKey = operatorconfig.MetricsOcp311ConfigMapKey - } - err = yaml.Unmarshal([]byte(cm.Data[configmapKey]), l) - if err != nil { - log.Error(err, "Failed to unmarshal data in configmap") - return *l, *ul, err - } - if uwlData, ok := cm.Data[operatorconfig.UwlMetricsConfigMapKey]; ok { - err = yaml.Unmarshal([]byte(uwlData), ul) - if err != nil { - log.Error(err, "Failed to unmarshal uwl data in configmap") - return *l, *ul, err - } - } - } - } - - cmList := &corev1.ConfigMapList{} - _ = c.List(ctx, cmList, &client.ListOptions{}) - for _, allowlistCM := range cmList.Items { - if allowlistCM.ObjectMeta.Name == operatorconfig.AllowlistCustomConfigMapName { - log.Info("Parse custom allowlist configmap", "namespace", allowlistCM.ObjectMeta.Namespace, - "name", allowlistCM.ObjectMeta.Name) - customAllowlist, _, customUwlAllowlist, err := util.ParseAllowlistConfigMap(allowlistCM) - if err != nil { - log.Error(err, "Failed to parse data in configmap", "namespace", allowlistCM.ObjectMeta.Namespace, - "name", allowlistCM.ObjectMeta.Name) - } - if allowlistCM.ObjectMeta.Namespace != namespace { - customUwlAllowlist = injectNamespaceLabel(customUwlAllowlist, allowlistCM.ObjectMeta.Namespace) - } - l, _, ul = util.MergeAllowlist(l, customAllowlist, nil, ul, customUwlAllowlist) - } - } - - return *l, *ul, nil -} - -func getEndpointDeployment(ctx context.Context, c client.Client) appsv1.Deployment { - d := &appsv1.Deployment{} - err := c.Get(ctx, types.NamespacedName{Name: "endpoint-observability-operator", Namespace: namespace}, d) - if err != nil { - log.Error(err, "Failed to get deployment") - } - return *d -} - -func getNameInMatch(match string) string { - r := regexp.MustCompile(`__name__="([^,]*)"`) - m := r.FindAllStringSubmatch(match, -1) - if m != nil { - return m[0][1] - } - return "" -} - -func isUWLMonitoringEnabled(ctx context.Context, c client.Client) (bool, error) { - sts := &appsv1.StatefulSet{} - err := c.Get(ctx, types.NamespacedName{Namespace: uwlNamespace, Name: uwlSts}, sts) - if err != nil { - if !errors.IsNotFound(err) { - log.Error(err, "Failed to get uwl prometheus statefulset") - return false, err - } else { - return 
false, nil - } - } - return true, nil -} - -// for custom uwl allowlist: -// 1. only support "names" and "matches". -// 2. inject namespace label filter for all entries in the allowlist. -func injectNamespaceLabel(allowlist *operatorconfig.MetricsAllowlist, - namespace string) *operatorconfig.MetricsAllowlist { - updatedList := &operatorconfig.MetricsAllowlist{ - NameList: []string{}, - MatchList: []string{}, - } - for _, name := range allowlist.NameList { - updatedList.MatchList = append(updatedList.MatchList, - fmt.Sprintf("__name__=\"%s\",namespace=\"%s\"", name, namespace)) - } - for _, match := range allowlist.MatchList { - updatedList.MatchList = append(updatedList.MatchList, fmt.Sprintf("%s,namespace=\"%s\"", match, namespace)) - } - return updatedList -} diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector_test.go b/operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector_test.go deleted file mode 100644 index 709282427..000000000 --- a/operators/endpointmetrics/controllers/observabilityendpoint/metrics_collector_test.go +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (c) Red Hat, Inc. -// Copyright Contributors to the Open Cluster Management project -// Licensed under the Apache License 2.0 - -package observabilityendpoint - -import ( - "context" - "testing" - - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "k8s.io/kubectl/pkg/scheme" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - oashared "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/shared" - oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1" - operatorconfig "github.com/stolostron/multicluster-observability-operator/operators/pkg/config" - addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" -) - -func getAllowlistCM() *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: operatorconfig.AllowlistConfigMapName, - Namespace: namespace, - }, - Data: map[string]string{ - operatorconfig.MetricsConfigMapKey: ` -names: - - a - - b -matches: - - __name__="c" -recording_rules: - - record: f - expr: g -collect_rules: - - name: h - selector: - matchExpressions: - - key: clusterType - operator: NotIn - values: ["SNO"] - rules: - - collect: j - expr: k - for: 1m - names: - - c - matches: - - __name__="a" -`, - operatorconfig.UwlMetricsConfigMapKey: ` -names: - - uwl_a - - uwl_b -`}, - } -} - -func getCustomAllowlistCM() *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: operatorconfig.AllowlistCustomConfigMapName, - Namespace: "default", - }, - Data: map[string]string{ - operatorconfig.UwlMetricsConfigMapKey: ` -names: - - custom_c -matches: - - __name__=test -`}, - } -} - -func init() { - s := scheme.Scheme - addonv1alpha1.AddToScheme(s) - oav1beta1.AddToScheme(s) - - namespace = testNamespace - hubNamespace = testHubNamspace -} - -func checkAnnotationsAndProxySettings( - ctx context.Context, - c client.Client, - deploymentName string, - t *testing.T) { - - deployment := &appsv1.Deployment{} - err := c.Get(ctx, types.NamespacedName{Name: deploymentName, - Namespace: namespace}, deployment) - if err != nil { - 
t.Fatalf("Failed to query deployment: %v, err: (%v)", deploymentName, err) - } - - annotations := deployment.Spec.Template.Annotations - v, found := annotations[operatorconfig.WorkloadPartitioningPodAnnotationKey] - if !found || v != operatorconfig.WorkloadPodExpectedValueJSON { - t.Fatalf("Failed to find annotation %v: %v on the pod spec of deployment: %v", - operatorconfig.WorkloadPartitioningPodAnnotationKey, - operatorconfig.WorkloadPodExpectedValueJSON, - deploymentName, - ) - } - - env := deployment.Spec.Template.Spec.Containers[0].Env - foundHTTPProxy := false - foundHTTPSProxy := false - foundNOProxy := false - foundCABundle := false - for _, e := range env { - if e.Name == "HTTP_PROXY" { - foundHTTPProxy = true - if e.Value != "http://foo.com" { - t.Fatalf("HTTP_PROXY is not set correctly: expected %s, got %s", "http://foo.com", e.Value) - } - } else if e.Name == "HTTPS_PROXY" { - foundHTTPSProxy = true - if e.Value != "https://foo.com" { - t.Fatalf("HTTPS_PROXY is not set correctly: expected %s, got %s", "https://foo.com", e.Value) - } - } else if e.Name == "NO_PROXY" { - foundNOProxy = true - if e.Value != "bar.com" { - t.Fatalf("NO_PROXY is not set correctly: expected %s, got %s", "bar.com", e.Value) - } - } else if e.Name == "HTTPS_PROXY_CA_BUNDLE" { - foundCABundle = true - if e.Value != "custom-ca.crt" { - t.Fatalf("HTTPS_PROXY_CA_BUNDLE is not set correctly: expected %s, got %s", "custom-ca.crt", e.Value) - } - } - } - if !foundHTTPProxy { - t.Fatalf("HTTP_PROXY is not present in env") - } - if !foundHTTPSProxy { - t.Fatalf("HTTPS_PROXY is not present in env") - } - if !foundNOProxy { - t.Fatalf("NO_PROXY is not present in env") - } - if !foundCABundle { - t.Fatalf("HTTPS_PROXY_CA_BUNDLE is not present in env") - } -} - -func TestMetricsCollector(t *testing.T) { - hubInfo := &operatorconfig.HubInfo{ - ClusterName: "test-cluster", - ObservatoriumAPIEndpoint: "http://test-endpoint", - } - obsAddon := oashared.ObservabilityAddonSpec{ - EnableMetrics: true, - Interval: 60, - } - - ctx := context.TODO() - objs := []runtime.Object{getAllowlistCM(), getCustomAllowlistCM(), &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "extension-apiserver-authentication", - Namespace: "kube-system", - }, - Data: map[string]string{ - "client-ca-file": "test", - }, - }} - promv1.AddToScheme(scheme.Scheme) - c := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(objs...).Build() - - list, uwlList, err := getMetricsAllowlist(ctx, c, "") - if err != nil { - t.Fatalf("Failed to get allowlist: (%v)", err) - } - // Default deployment with instance count 1 - params := CollectorParams{ - isUWL: false, - clusterID: testClusterID, - clusterType: defaultClusterType, - obsAddonSpec: obsAddon, - hubInfo: *hubInfo, - allowlist: list, - replicaCount: 1, - httpProxy: "http://foo.com", - httpsProxy: "https://foo.com", - noProxy: "bar.com", - CABundle: "custom-ca.crt", - } - - _, err = updateMetricsCollector(ctx, c, params, false) - if err != nil { - t.Fatalf("Failed to create metrics collector deployment: (%v)", err) - } - checkAnnotationsAndProxySettings(ctx, c, metricsCollectorName, t) - - // Update deployment to reduce instance count to zero - params.replicaCount = 0 - _, err = updateMetricsCollector(ctx, c, params, false) - if err != nil { - t.Fatalf("Failed to create metrics collector deployment: (%v)", err) - } - checkAnnotationsAndProxySettings(ctx, c, metricsCollectorName, t) - - params.replicaCount = 1 - params.clusterID = testClusterID + "-update" - params.clusterType = 
snoClusterType - _, err = updateMetricsCollector(ctx, c, params, false) - if err != nil { - t.Fatalf("Failed to create metrics collector deployment: (%v)", err) - } - checkAnnotationsAndProxySettings(ctx, c, metricsCollectorName, t) - - _, err = updateMetricsCollector(ctx, c, params, true) - if err != nil { - t.Fatalf("Failed to update metrics collector deployment: (%v)", err) - } - checkAnnotationsAndProxySettings(ctx, c, metricsCollectorName, t) - - params.isUWL = true - params.allowlist = uwlList - _, err = updateMetricsCollector(ctx, c, params, true) - if err != nil { - t.Fatalf("Failed to create uwl metrics collector deployment: (%v)", err) - } - checkAnnotationsAndProxySettings(ctx, c, uwlMetricsCollectorName, t) - - err = deleteMetricsCollector(ctx, c, metricsCollectorName) - if err != nil { - t.Fatalf("Failed to delete metrics collector deployment: (%v)", err) - } - - err = deleteMetricsCollector(ctx, c, uwlMetricsCollectorName) - if err != nil { - t.Fatalf("Failed to delete uwl metrics collector deployment: (%v)", err) - } -} diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller.go b/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller.go index c4a709a56..4160b85f2 100644 --- a/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller.go +++ b/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller.go @@ -8,9 +8,6 @@ import ( "context" "fmt" "os" - "strconv" - - operatorutil "github.com/stolostron/multicluster-observability-operator/operators/pkg/util" "golang.org/x/exp/slices" "gopkg.in/yaml.v2" @@ -31,9 +28,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/collector" "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/hypershift" "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/openshift" "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/rendering" + "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/status" "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/util" oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1" oav1beta2 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta2" @@ -43,39 +42,33 @@ import ( ) var ( - log = ctrl.Log.WithName("controllers").WithName("ObservabilityAddon") - installPrometheus, _ = strconv.ParseBool(os.Getenv(operatorconfig.InstallPrometheus)) - globalRes = []*unstructured.Unstructured{} + log = ctrl.Log.WithName("controllers").WithName("ObservabilityAddon") + globalRes = []*unstructured.Unstructured{} ) const ( obAddonName = "observability-addon" - ownerLabelKey = "owner" - ownerLabelValue = "observabilityaddon" obsAddonFinalizer = "observability.open-cluster-management.io/addon-cleanup" promSvcName = "prometheus-k8s" promNamespace = "openshift-monitoring" openShiftClusterMonitoringlabel = "openshift.io/cluster-monitoring" -) - -const ( - defaultClusterType = "" - ocpThreeClusterType = "ocp3" - snoClusterType = "SNO" -) - -var ( - namespace = os.Getenv("WATCH_NAMESPACE") - hubNamespace = os.Getenv("HUB_NAMESPACE") - isHubMetricsCollector = 
os.Getenv("HUB_ENDPOINT_OPERATOR") == "true" - serviceAccountName = os.Getenv("SERVICE_ACCOUNT") + mtlsCertName = "observability-controller-open-cluster-management.io-observability-signer-client-cert" + mtlsCaName = "observability-managed-cluster-certs" + metricsCollectorName = "metrics-collector-deployment" + uwlMetricsCollectorName = "uwl-metrics-collector-deployment" + uwlNamespace = "openshift-user-workload-monitoring" ) // ObservabilityAddonReconciler reconciles a ObservabilityAddon object. type ObservabilityAddonReconciler struct { - Client client.Client - Scheme *runtime.Scheme - HubClient *util.ReloadableHubClient + Client client.Client + Scheme *runtime.Scheme + HubClient *util.ReloadableHubClient + IsHubMetricsCollector bool + Namespace string + HubNamespace string + ServiceAccountName string + InstallPrometheus bool } // +kubebuilder:rbac:groups=observability.open-cluster-management.io.open-cluster-management.io,resources=observabilityaddons,verbs=get;list;watch;create;update;patch;delete @@ -87,34 +80,31 @@ type ObservabilityAddonReconciler struct { // The Controller will requeue the Request to be processed again if the returned error is non-nil or // Result.Requeue is true, otherwise upon completion it will remove the work from the queue. func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - log := log.WithValues("Request.Namespace", req.Namespace, "Request.Name", req.Name) - log.Info("Reconciling") + log.Info("Reconciling", "Request", req.String()) isHypershift := true if os.Getenv("UNIT_TEST") != "true" { - crdClient, err := operatorutil.GetOrCreateCRDClient() - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get/create CRD client: %w", err) - } - isHypershift, err = operatorutil.CheckCRDExist(crdClient, "hostedclusters.hypershift.openshift.io") + var err error + isHypershift, err = hypershift.IsHypershiftCluster() if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to check if the CRD hostedclusters.hypershift.openshift.io exists: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to check if the cluster is hypershift: %w", err) } } + hubObsAddon := &oav1beta1.ObservabilityAddon{} obsAddon := &oav1beta1.ObservabilityAddon{} deleteFlag := false // ACM 8509: Special case for hub/local cluster metrics collection // We do not have an ObservabilityAddon instance in the local cluster so skipping the below block - if !isHubMetricsCollector { + if !r.IsHubMetricsCollector { if err := r.ensureOpenShiftMonitoringLabelAndRole(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to ensure OpenShift monitoring label and role: %w", err) } // Fetch the ObservabilityAddon instance in hub cluster fetchAddon := func() error { - return r.HubClient.Get(ctx, types.NamespacedName{Name: obAddonName, Namespace: hubNamespace}, hubObsAddon) + return r.HubClient.Get(ctx, types.NamespacedName{Name: obAddonName, Namespace: r.HubNamespace}, hubObsAddon) } if err := fetchAddon(); err != nil { if r.HubClient, err = r.HubClient.Reload(); err != nil { @@ -128,7 +118,7 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R } // Fetch the ObservabilityAddon instance in local cluster - err := r.Client.Get(ctx, types.NamespacedName{Name: obAddonName, Namespace: namespace}, obsAddon) + err := r.Client.Get(ctx, types.NamespacedName{Name: obAddonName, Namespace: r.Namespace}, obsAddon) if err != nil { if !errors.IsNotFound(err) { return ctrl.Result{}, fmt.Errorf("failed to get observabilityaddon: 
%w", err) @@ -153,11 +143,11 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R hubSecret := &corev1.Secret{} err := r.Client.Get( ctx, - types.NamespacedName{Name: operatorconfig.HubInfoSecretName, Namespace: namespace}, + types.NamespacedName{Name: operatorconfig.HubInfoSecretName, Namespace: r.Namespace}, hubSecret, ) if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get hub info secret: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to get hub info secret %s/%s: %w", r.Namespace, operatorconfig.HubInfoSecretName, err) } hubInfo := &operatorconfig.HubInfo{} err = yaml.Unmarshal(hubSecret.Data[operatorconfig.HubInfoSecretKey], &hubInfo) @@ -166,21 +156,21 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R } hubInfo.ClusterName = string(hubSecret.Data[operatorconfig.ClusterNameKey]) - clusterType := defaultClusterType + clusterType := operatorconfig.DefaultClusterType clusterID := "" // read the image configmap imagesCM := &corev1.ConfigMap{} err = r.Client.Get(ctx, types.NamespacedName{ Name: operatorconfig.ImageConfigMap, - Namespace: namespace, + Namespace: r.Namespace, }, imagesCM) if err != nil { return ctrl.Result{}, fmt.Errorf("failed to get images configmap: %w", err) } rendering.Images = imagesCM.Data - if isHypershift { + if r.IsHubMetricsCollector && isHypershift { updatedHCs, err := hypershift.ReconcileHostedClustersServiceMonitors(ctx, r.Client) if err != nil { log.Error(err, "Failed to create ServiceMonitors for hypershift") @@ -189,7 +179,7 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R } } - if !installPrometheus { + if !r.InstallPrometheus { // If no prometheus service found, set status as NotSupported promSvc := &corev1.Service{} err = r.Client.Get(ctx, types.NamespacedName{ @@ -201,8 +191,8 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R log.Error(err, "OCP prometheus service does not exist") // ACM 8509: Special case for hub/local cluster metrics collection // We do not report status for hub endpoint operator - if !isHubMetricsCollector { - if err := util.ReportStatus(ctx, r.Client, util.NotSupported, obsAddon.Name, obsAddon.Namespace); err != nil { + if !r.IsHubMetricsCollector { + if err := status.ReportStatus(ctx, r.Client, status.NotSupported, obsAddon.Name, obsAddon.Namespace); err != nil { log.Error(err, "Failed to report status") } } @@ -227,38 +217,44 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R // OCP 3.11 has no cluster id, set it as empty string clusterID = "" // to differentiate ocp 3.x - clusterType = ocpThreeClusterType + clusterType = operatorconfig.OcpThreeClusterType } - isSNO, err := openshift.IsSNO(ctx, r.Client) - if err != nil { + if isSNO, err := openshift.IsSNO(ctx, r.Client); err != nil { log.Error(err, "Failed to check if the cluster is SNO") } else if isSNO { - clusterType = snoClusterType + clusterType = operatorconfig.SnoClusterType } - err = openshift.CreateMonitoringClusterRoleBinding(ctx, log, r.Client, namespace, serviceAccountName) + err = openshift.CreateMonitoringClusterRoleBinding(ctx, log, r.Client, r.Namespace, r.ServiceAccountName) if err != nil { return ctrl.Result{}, fmt.Errorf("failed to create monitoring cluster role binding: %w", err) } - err = openshift.CreateCAConfigmap(ctx, r.Client, namespace) + err = openshift.CreateCAConfigmap(ctx, r.Client, r.Namespace) if err != nil { return ctrl.Result{}, fmt.Errorf("failed to create CA 
configmap: %w", err) } } else { // Render the prometheus templates renderer := rendererutil.NewRenderer() - toDeploy, err := rendering.Render(ctx, renderer, r.Client, hubInfo) + toDeploy, err := rendering.Render(ctx, renderer, r.Client, hubInfo, r.Namespace) if err != nil { return ctrl.Result{}, fmt.Errorf("failed to render prometheus templates: %w", err) } + deployer := deploying.NewDeployer(r.Client) + + // Ordering resources to ensure they are applied in the correct order + slices.SortFunc(toDeploy, func(a, b *unstructured.Unstructured) int { + return resourcePriority(a) - resourcePriority(b) + }) + for _, res := range toDeploy { - if res.GetNamespace() != namespace { + if res.GetNamespace() != r.Namespace { globalRes = append(globalRes, res) } - if !isHubMetricsCollector { + if !r.IsHubMetricsCollector { // For kind tests we need to deploy prometheus in hub but cannot set controller // reference as there is no observabilityaddon @@ -273,62 +269,51 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R } if err := deployer.Deploy(ctx, res); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to deploy %s %s/%s: %w", res.GetKind(), namespace, res.GetName(), err) + return ctrl.Result{}, fmt.Errorf("failed to deploy %s %s/%s: %w", res.GetKind(), r.Namespace, res.GetName(), err) } } } // create or update the cluster-monitoring-config configmap and relevant resources - if err := createOrUpdateClusterMonitoringConfig(ctx, hubInfo, clusterID, r.Client, installPrometheus); err != nil { + if err := createOrUpdateClusterMonitoringConfig(ctx, hubInfo, clusterID, r.Client, r.InstallPrometheus, r.Namespace); err != nil { return ctrl.Result{}, fmt.Errorf("failed to create or update cluster monitoring config: %w", err) } - forceRestart := req.Name == mtlsCertName || req.Name == mtlsCaName || req.Name == openshift.CaConfigmapName - - if obsAddon.Spec.EnableMetrics || isHubMetricsCollector { - if isHubMetricsCollector { - mcoList := &oav1beta2.MultiClusterObservabilityList{} - err := r.HubClient.List(ctx, mcoList, client.InNamespace(corev1.NamespaceAll)) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to get multiclusterobservability: %w", err) - } - if len(mcoList.Items) != 1 { - log.Error(nil, fmt.Sprintf("Expected 1 multiclusterobservability, found %d", len(mcoList.Items))) - return ctrl.Result{}, nil - } - obsAddon.Spec = *mcoList.Items[0].Spec.ObservabilityAddonSpec - } - created, err := updateMetricsCollectors( - ctx, - r.Client, - obsAddon.Spec, - *hubInfo, clusterID, - clusterType, - 1, - forceRestart) + if r.IsHubMetricsCollector { + mcoList := &oav1beta2.MultiClusterObservabilityList{} + err := r.HubClient.List(ctx, mcoList, client.InNamespace(corev1.NamespaceAll)) if err != nil { - if !isHubMetricsCollector { - if err := util.ReportStatus(ctx, r.Client, util.Degraded, obsAddon.Name, obsAddon.Namespace); err != nil { - log.Error(err, "Failed to report status") - } - } - return ctrl.Result{}, fmt.Errorf("failed to update metrics collectors: %w", err) + return ctrl.Result{}, fmt.Errorf("failed to get multiclusterobservability: %w", err) } - if created && !isHubMetricsCollector { - if err := util.ReportStatus(ctx, r.Client, util.Deployed, obsAddon.Name, obsAddon.Namespace); err != nil { - log.Error(err, "Failed to report status") - } - } - } else { - deleted, err := updateMetricsCollectors(ctx, r.Client, obsAddon.Spec, *hubInfo, clusterID, clusterType, 0, false) - if err != nil { - return ctrl.Result{}, fmt.Errorf("failed to update metrics collectors: 
%w", err) + if len(mcoList.Items) != 1 { + log.Error(nil, fmt.Sprintf("Expected 1 multiclusterobservability, found %d", len(mcoList.Items))) + return ctrl.Result{}, nil } - if deleted && !isHubMetricsCollector { - if err := util.ReportStatus(ctx, r.Client, util.Disabled, obsAddon.Name, obsAddon.Namespace); err != nil { - log.Error(err, "Failed to report status") - } + obsAddon.Spec = *mcoList.Items[0].Spec.ObservabilityAddonSpec + } + + metricsCollector := collector.MetricsCollector{ + Client: r.Client, + ClusterInfo: collector.ClusterInfo{ + ClusterID: clusterID, + ClusterType: clusterType, + InstallPrometheus: r.InstallPrometheus, + IsHubMetricsCollector: r.IsHubMetricsCollector, + }, + HubInfo: hubInfo, + Log: log.WithName("metrics-collector"), + Namespace: r.Namespace, + ObsAddon: obsAddon, + ServiceAccountName: r.ServiceAccountName, + } + + if err := metricsCollector.Update(ctx, req); err != nil { + wrappedErr := fmt.Errorf("failed to update metrics collector: %w", err) + if errors.IsConflict(err) || util.IsTransientClientErr(err) { + log.Info("Retrying due to conflict or transient client error") + return ctrl.Result{Requeue: true}, wrappedErr } + return ctrl.Result{}, wrappedErr } return ctrl.Result{}, nil @@ -340,16 +325,18 @@ func (r *ObservabilityAddonReconciler) initFinalization( ) (bool, error) { if delete && slices.Contains(hubObsAddon.GetFinalizers(), obsAddonFinalizer) { log.Info("To clean observability components/configurations in the cluster") - err := deleteMetricsCollector(ctx, r.Client, metricsCollectorName) - if err != nil { - return false, err + + metricsCollector := collector.MetricsCollector{ + Client: r.Client, + Log: log.WithName("metrics-collector"), + Namespace: r.Namespace, } - err = deleteMetricsCollector(ctx, r.Client, uwlMetricsCollectorName) - if err != nil { - return false, err + if err := metricsCollector.Delete(ctx); err != nil { + return false, fmt.Errorf("failed to delete metrics collector: %w", err) } + // revert the change to cluster monitoring stack - err = revertClusterMonitoringConfig(ctx, r.Client) + err := revertClusterMonitoringConfig(ctx, r.Client) if err != nil { return false, err } @@ -365,14 +352,14 @@ func (r *ObservabilityAddonReconciler) initFinalization( // SHould we return true if metricscollector is not found as that means // metrics collector is not present? Moved this part up as we need to clean // up cm and crb before we remove the finalizer - is that the right way to do it? 
- if !installPrometheus { + if !r.InstallPrometheus { err = openshift.DeleteMonitoringClusterRoleBinding(ctx, r.Client) if err != nil { log.Error(err, "Failed to delete monitoring cluster role binding") return false, err } log.Info("clusterrolebinding deleted") - err = openshift.DeleteCAConfigmap(ctx, r.Client, namespace) + err = openshift.DeleteCAConfigmap(ctx, r.Client, r.Namespace) if err != nil { log.Error(err, "Failed to delete CA configmap") return false, err @@ -410,7 +397,7 @@ func (r *ObservabilityAddonReconciler) initFinalization( func (r *ObservabilityAddonReconciler) ensureOpenShiftMonitoringLabelAndRole(ctx context.Context) error { existingNs := &corev1.Namespace{} - resNS := namespace + resNS := r.Namespace role := rbacv1.Role{ ObjectMeta: metav1.ObjectMeta{ @@ -462,7 +449,7 @@ func (r *ObservabilityAddonReconciler) ensureOpenShiftMonitoringLabelAndRole(ctx err = r.Client.Update(ctx, existingNs) if err != nil { log.Error(err, fmt.Sprintf("Failed to update namespace for Endpoint Operator: %s with the label: %s", - namespace, openShiftClusterMonitoringlabel)) + r.Namespace, openShiftClusterMonitoringlabel)) return err } } @@ -504,16 +491,12 @@ func (r *ObservabilityAddonReconciler) ensureOpenShiftMonitoringLabelAndRole(ctx // SetupWithManager sets up the controller with the Manager. func (r *ObservabilityAddonReconciler) SetupWithManager(mgr ctrl.Manager) error { - if os.Getenv("NAMESPACE") != "" { - namespace = os.Getenv("NAMESPACE") - } - ctrlBuilder := ctrl.NewControllerManagedBy(mgr).For( &oav1beta1.ObservabilityAddon{}, - builder.WithPredicates(getPred(obAddonName, namespace, true, true, true)), + builder.WithPredicates(getPred(obAddonName, r.Namespace, true, true, true)), ) - if isHubMetricsCollector { + if r.IsHubMetricsCollector { ctrlBuilder = ctrlBuilder.Watches( &oav1beta2.MultiClusterObservability{}, &handler.EnqueueRequestForObject{}, @@ -524,27 +507,27 @@ func (r *ObservabilityAddonReconciler) SetupWithManager(mgr ctrl.Manager) error Watches( &corev1.Secret{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(operatorconfig.HubInfoSecretName, namespace, true, true, false)), + builder.WithPredicates(getPred(operatorconfig.HubInfoSecretName, r.Namespace, true, true, false)), ). Watches( &corev1.Secret{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(mtlsCertName, namespace, true, true, false)), + builder.WithPredicates(getPred(mtlsCertName, r.Namespace, true, true, false)), ). Watches( &corev1.Secret{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(mtlsCaName, namespace, true, true, false)), + builder.WithPredicates(getPred(mtlsCaName, r.Namespace, true, true, false)), ). Watches( &corev1.Secret{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(hubAmAccessorSecretName, namespace, true, true, false)), + builder.WithPredicates(getPred(hubAmAccessorSecretName, r.Namespace, true, true, false)), ). Watches( &corev1.ConfigMap{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(operatorconfig.AllowlistConfigMapName, namespace, true, true, false)), + builder.WithPredicates(getPred(operatorconfig.AllowlistConfigMapName, r.Namespace, true, true, false)), ). 
Watches( &corev1.ConfigMap{}, @@ -554,17 +537,17 @@ func (r *ObservabilityAddonReconciler) SetupWithManager(mgr ctrl.Manager) error Watches( &corev1.ConfigMap{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(openshift.CaConfigmapName, namespace, false, true, true)), + builder.WithPredicates(getPred(openshift.CaConfigmapName, r.Namespace, false, true, true)), ). Watches( &appsv1.Deployment{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(metricsCollectorName, namespace, true, true, true)), + builder.WithPredicates(getPred(metricsCollectorName, r.Namespace, true, true, true)), ). Watches( &appsv1.Deployment{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(uwlMetricsCollectorName, namespace, true, true, true)), + builder.WithPredicates(getPred(uwlMetricsCollectorName, r.Namespace, true, true, true)), ). Watches( &rbacv1.ClusterRoleBinding{}, @@ -574,7 +557,7 @@ func (r *ObservabilityAddonReconciler) SetupWithManager(mgr ctrl.Manager) error Watches( &corev1.ConfigMap{}, &handler.EnqueueRequestForObject{}, - builder.WithPredicates(getPred(operatorconfig.ImageConfigMap, namespace, true, true, false)), + builder.WithPredicates(getPred(operatorconfig.ImageConfigMap, r.Namespace, true, true, false)), ). Watches( &appsv1.StatefulSet{}, @@ -588,11 +571,11 @@ func (r *ObservabilityAddonReconciler) SetupWithManager(mgr ctrl.Manager) error return []reconcile.Request{ {NamespacedName: types.NamespacedName{ Name: "metrics-collector-clientca-metric", - Namespace: namespace, + Namespace: r.Namespace, }}, {NamespacedName: types.NamespacedName{ Name: "uwl-metrics-collector-clientca-metric", - Namespace: namespace, + Namespace: r.Namespace, }}, } } @@ -611,3 +594,18 @@ func remove(list []string, s string) []string { } return result } + +// resourcePriority returns the priority of the resource. +// This is used to order the resources to be created in the correct order. 
+func resourcePriority(resource *unstructured.Unstructured) int { + switch resource.GetKind() { + case "Role", "ClusterRole": + return 1 + case "RoleBinding", "ClusterRoleBinding": + return 2 + case "CustomResourceDefinition": + return 3 + default: + return 4 + } +} diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_integration_test.go b/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_integration_test.go index cee4b84a6..552ac60b3 100644 --- a/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_integration_test.go +++ b/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_integration_test.go @@ -8,6 +8,7 @@ package observabilityendpoint import ( "context" + "fmt" "os" "path/filepath" "testing" @@ -17,6 +18,7 @@ import ( promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/hypershift" "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/util" + observabilityshared "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/shared" oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1" mcov1beta2 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta2" "github.com/stretchr/testify/assert" @@ -30,6 +32,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" kubescheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" @@ -37,28 +40,34 @@ import ( metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" ) +var ( + testEnvSpoke *envtest.Environment + restCfgSpoke *rest.Config + testEnvHub *envtest.Environment + restCfgHub *rest.Config +) + // TestIntegrationReconcileHypershift tests the reconcile function for hypershift CRDs. 
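// Aside (not part of the patch): resourcePriority above yields a sort key so
// that RBAC objects are applied before their bindings and CRDs before the
// resources that instantiate them. A minimal sketch of the intended use;
// sortResources is a hypothetical helper name used only for illustration:

package observabilityendpoint

import (
	"sort"

	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
)

// sortResources orders rendered manifests so that dependencies are created
// before the objects that depend on them.
func sortResources(resources []*unstructured.Unstructured) {
	sort.SliceStable(resources, func(i, j int) bool {
		return resourcePriority(resources[i]) < resourcePriority(resources[j])
	})
}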
func TestIntegrationReconcileHypershift(t *testing.T) { - testNamespace := "open-cluster-management-addon-observability" - namespace = testNamespace - hubNamespace = "local-cluster" - isHubMetricsCollector = true - installPrometheus = false - serviceAccountName = "endpoint-monitoring-operator" + testNamespace := "test-ns" + + scheme := createBaseScheme() + hyperv1.AddToScheme(scheme) + + k8sClient, err := client.New(restCfgHub, client.Options{Scheme: scheme}) + if err != nil { + t.Fatal(err) + } - testEnv, k8sClient := setupTestEnv(t) - defer testEnv.Stop() + setupCommonHubResources(t, k8sClient, testNamespace) + defer tearDownCommonHubResources(t, k8sClient, testNamespace) hostedClusterNs := "hosted-cluster-ns" hostedClusterName := "myhostedcluster" hostedCluster := newHostedCluster(hostedClusterName, hostedClusterNs) + // Create resources required for the hypershift case resourcesDeps := []client.Object{ - // Create resources required for the observability addon controller - makeNamespace(testNamespace), - newHubInfoSecret([]byte{}, testNamespace), - newImagesCM(testNamespace), - // Create resources required for the hypershift case makeNamespace(hostedClusterNs), makeNamespace(hypershift.HostedClusterNamespace(hostedCluster)), hostedCluster, @@ -69,9 +78,9 @@ func TestIntegrationReconcileHypershift(t *testing.T) { t.Fatalf("Failed to create resources: %v", err) } - mgr, err := ctrl.NewManager(testEnv.Config, ctrl.Options{ + mgr, err := ctrl.NewManager(testEnvHub.Config, ctrl.Options{ Scheme: k8sClient.Scheme(), - Metrics: metricsserver.Options{BindAddress: "0"}, + Metrics: metricsserver.Options{BindAddress: "0"}, // Avoids port conflict with the default port 8080 }) assert.NoError(t, err) @@ -80,15 +89,24 @@ func TestIntegrationReconcileHypershift(t *testing.T) { }) assert.NoError(t, err) reconciler := ObservabilityAddonReconciler{ - Client: k8sClient, - HubClient: hubClientWithReload, + Client: k8sClient, + HubClient: hubClientWithReload, + IsHubMetricsCollector: true, + Scheme: scheme, + Namespace: testNamespace, + HubNamespace: "local-cluster", + ServiceAccountName: "endpoint-monitoring-operator", + InstallPrometheus: false, } err = reconciler.SetupWithManager(mgr) assert.NoError(t, err) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { - err = mgr.Start(ctrl.SetupSignalHandler()) + err = mgr.Start(ctx) assert.NoError(t, err) }() @@ -105,58 +123,131 @@ func TestIntegrationReconcileHypershift(t *testing.T) { assert.NoError(t, err) } -// setupTestEnv starts the test environment (etcd and kube api-server). 
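// Aside (not part of the patch): with the manager now started on a
// cancellable context, integration tests like the one above typically poll
// until the reconciler has produced the expected objects. A sketch of such a
// wait; waitForDeployment is a hypothetical helper name used only for
// illustration:

package observabilityendpoint

import (
	"context"
	"testing"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// waitForDeployment blocks until the named deployment exists, giving the
// manager goroutine time to reconcile.
func waitForDeployment(t *testing.T, c client.Client, name, ns string) {
	t.Helper()
	err := wait.PollUntilContextTimeout(context.Background(), time.Second, time.Minute, true,
		func(ctx context.Context) (bool, error) {
			deploy := &appsv1.Deployment{}
			if err := c.Get(ctx, types.NamespacedName{Name: name, Namespace: ns}, deploy); err != nil {
				return false, client.IgnoreNotFound(err)
			}
			return true, nil
		})
	if err != nil {
		t.Fatalf("deployment %s/%s never appeared: %v", ns, name, err)
	}
}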
-func setupTestEnv(t *testing.T) (*envtest.Environment, client.Client) { +func TestMain(m *testing.M) { + opts := zap.Options{ + Development: true, + } + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + rootPath := filepath.Join("..", "..", "..") - crds := readCRDFiles(t, + spokeCrds := readCRDFiles( + filepath.Join(rootPath, "multiclusterobservability", "config", "crd", "bases", "observability.open-cluster-management.io_observabilityaddons.yaml"), + ) + testEnvSpoke = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("testdata", "crd"), filepath.Join("..", "..", "config", "crd", "bases")}, + CRDs: spokeCrds, + ControlPlaneStopTimeout: 5 * time.Minute, + } + + var err error + restCfgSpoke, err = testEnvSpoke.Start() + if err != nil { + panic(fmt.Sprintf("Failed to start spoke test environment: %v", err)) + } + + hubCRDs := readCRDFiles( filepath.Join(rootPath, "multiclusterobservability", "config", "crd", "bases", "observability.open-cluster-management.io_multiclusterobservabilities.yaml"), filepath.Join(rootPath, "endpointmetrics", "manifests", "prometheus", "crd", "servicemonitor_crd_0_53_1.yaml"), - filepath.Join(rootPath, "endpointmetrics", "manifests", "prometheus", "crd", "prometheusrule_crd_0_53_1.yaml"), ) - testEnv := &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("testdata", "crd"), filepath.Join("..", "..", "config", "crd", "bases")}, - CRDs: crds, + hubCRDs = append(hubCRDs, spokeCrds...) + + testEnvHub = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("testdata", "crd"), filepath.Join("..", "..", "..", "config", "crd", "bases")}, + CRDs: hubCRDs, + ControlPlaneStopTimeout: 5 * time.Minute, } - cfg, err := testEnv.Start() + restCfgHub, err = testEnvHub.Start() if err != nil { - t.Fatal(err) + panic(fmt.Sprintf("Failed to start hub test environment: %v", err)) + } + + code := m.Run() + + err = testEnvSpoke.Stop() + if err != nil { + panic(fmt.Sprintf("Failed to stop spoke test environment: %v", err)) } + err = testEnvHub.Stop() + if err != nil { + panic(fmt.Sprintf("Failed to stop hub test environment: %v", err)) + } + + os.Exit(code) +} + +func createBaseScheme() *runtime.Scheme { scheme := runtime.NewScheme() kubescheme.AddToScheme(scheme) - hyperv1.AddToScheme(scheme) promv1.AddToScheme(scheme) oav1beta1.AddToScheme(scheme) mcov1beta2.AddToScheme(scheme) + return scheme +} - k8sClient, err := client.New(cfg, client.Options{Scheme: scheme}) - if err != nil { - t.Fatal(err) +func setupCommonHubResources(t *testing.T, k8sClient client.Client, ns string) { + // Create resources required for the observability addon controller + resourcesDeps := []client.Object{ + makeNamespace(ns), + newHubInfoSecret([]byte{}, ns), + newImagesCM(ns), + } + if err := createResources(k8sClient, resourcesDeps...); err != nil { + t.Fatalf("Failed to create resources: %v", err) + } +} + +func tearDownCommonHubResources(t *testing.T, k8sClient client.Client, ns string) { + // Delete resources required for the observability addon controller + resourcesDeps := []client.Object{ + makeNamespace(ns), + } + for _, resource := range resourcesDeps { + if err := k8sClient.Delete(context.Background(), resource); err != nil { + t.Fatalf("Failed to delete resource: %v", err) + } } +} - opts := zap.Options{ - Development: true, +func setupCommonSpokeResources(t *testing.T, k8sClient client.Client) { + // Create resources required for the observability addon controller + resourcesDeps := []client.Object{ + 
makeNamespace("open-cluster-management-addon-observability"), + newHubInfoSecret([]byte{}, "open-cluster-management-addon-observability"), + newImagesCM("open-cluster-management-addon-observability"), } - ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + if err := createResources(k8sClient, resourcesDeps...); err != nil { + t.Fatalf("Failed to create resources: %v", err) + } +} - return testEnv, k8sClient +func tearDownCommonSpokeResources(t *testing.T, k8sClient client.Client) { + // Delete resources required for the observability addon controller + resourcesDeps := []client.Object{ + makeNamespace("open-cluster-management-addon-observability"), + } + for _, resource := range resourcesDeps { + if err := k8sClient.Delete(context.Background(), resource); err != nil { + t.Fatalf("Failed to delete resource: %v", err) + } + } } -func readCRDFiles(t *testing.T, crdPaths ...string) []*apiextensionsv1.CustomResourceDefinition { +func readCRDFiles(crdPaths ...string) []*apiextensionsv1.CustomResourceDefinition { ret := []*apiextensionsv1.CustomResourceDefinition{} for _, crdPath := range crdPaths { crdYamlData, err := os.ReadFile(crdPath) if err != nil { - t.Fatalf("Failed to read CRD file: %v", err) + panic(fmt.Sprintf("Failed to read CRD file: %v", err)) } dec := yaml.NewDecodingSerializer(unstructured.UnstructuredJSONScheme) var crd apiextensionsv1.CustomResourceDefinition _, _, err = dec.Decode(crdYamlData, nil, &crd) if err != nil { - t.Fatalf("Failed to decode CRD: %v", err) + panic(fmt.Sprintf("Failed to decode CRD: %v", err)) } ret = append(ret, &crd) @@ -183,6 +274,18 @@ func createResources(client client.Client, resources ...client.Object) error { return nil } +func newObservabilityAddonBis(name, ns string) *oav1beta1.ObservabilityAddon { + return &oav1beta1.ObservabilityAddon{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + }, + Spec: observabilityshared.ObservabilityAddonSpec{ + EnableMetrics: true, + }, + } +} + func newHostedCluster(name, ns string) *hyperv1.HostedCluster { return &hyperv1.HostedCluster{ ObjectMeta: metav1.ObjectMeta{ @@ -226,3 +329,30 @@ func newServiceMonitor(name, namespace string) *promv1.ServiceMonitor { }, } } + +func newMicroshiftVersionCM(namespace string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "microshift-version", + Namespace: namespace, + }, + Data: map[string]string{ + "version": "v4.15.15", + }, + } +} + +func newMetricsAllowlistCM(namespace string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "observability-metrics-allowlist", + Namespace: namespace, + }, + Data: map[string]string{ + "metrics_list.yaml": ` +names: + - apiserver_watch_events_sizes_bucket +`, + }, + } +} diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_test.go b/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_test.go index 904b46b8a..28093cf2d 100644 --- a/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_test.go +++ b/operators/endpointmetrics/controllers/observabilityendpoint/observabilityaddon_controller_test.go @@ -23,28 +23,25 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes/scheme" + kubescheme "k8s.io/client-go/kubernetes/scheme" clusterv1 "open-cluster-management.io/api/cluster/v1" ctrl "sigs.k8s.io/controller-runtime" 
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/client/interceptor" - addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" - "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/openshift" "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/util" oashared "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/shared" oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1" mcov1beta2 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta2" operatorconfig "github.com/stolostron/multicluster-observability-operator/operators/pkg/config" + addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" ) const ( - name = "observability-addon" - testNamespace = "test-ns" - testHubNamspace = "test-hub-ns" - testBearerToken = "test-bearer-token" + name = "observability-addon" + restartLabel = "cert/time-restarted" ) var ( @@ -97,14 +94,14 @@ func newHubInfoSecret(data []byte, ns string) *corev1.Secret { } } -func newAMAccessorSecret(ns string) *corev1.Secret { +func newAMAccessorSecret(ns string, val string) *corev1.Secret { return &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: hubAmAccessorSecretName, Namespace: ns, }, Data: map[string][]byte{ - "token": []byte(testBearerToken), + "token": []byte(val), }, } } @@ -134,22 +131,17 @@ func newImagesCM(ns string) *corev1.ConfigMap { Namespace: ns, }, Data: map[string]string{ - operatorconfig.MetricsCollectorKey: "metrics-collector-image", + operatorconfig.MetricsCollectorKey: "metrics-collector-image", + operatorconfig.NodeExporterKey: "node-exporter-image", + operatorconfig.KubeStateMetricsKey: "kube-state-metrics-image", + operatorconfig.KubeRbacProxyKey: "kube-rbac-proxy-image", + operatorconfig.PrometheusOperatorKey: "prometheus-operator-image", }, } } func init() { os.Setenv("UNIT_TEST", "true") - s := scheme.Scheme - addonv1alpha1.AddToScheme(s) - oav1beta1.AddToScheme(s) - ocinfrav1.AddToScheme(s) - hyperv1.AddToScheme(s) - promv1.AddToScheme(s) - - namespace = testNamespace - hubNamespace = testHubNamspace } func TestObservabilityAddonController(t *testing.T) { @@ -162,10 +154,12 @@ alertmanager-router-ca: | -----END CERTIFICATE----- `) + testNamespace := "test-ns" + testHubNamespace := "test-hub-ns" hubObjs := []runtime.Object{} hubInfo := newHubInfoSecret(hubInfoData, testNamespace) - amAccessSrt := newAMAccessorSecret(testNamespace) - allowList := getAllowlistCM() + amAccessSrt := newAMAccessorSecret(testNamespace, "test-token") + allowList := getAllowlistCM(testNamespace) images := newImagesCM(testNamespace) objs := []runtime.Object{hubInfo, amAccessSrt, allowList, images, cv, infra, &corev1.ConfigMap{ @@ -182,34 +176,24 @@ alertmanager-router-ca: | Name: "test-ns", }, }, + &appv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "endpoint-observability-operator", + Namespace: "test-ns", + }, + }, } - scheme := scheme.Scheme - addonv1alpha1.AddToScheme(scheme) - mcov1beta2.AddToScheme(scheme) - oav1beta1.AddToScheme(scheme) - corev1.AddToScheme(scheme) - clusterv1.AddToScheme(scheme) - ocinfrav1.AddToScheme(scheme) + s := runtime.NewScheme() + kubescheme.AddToScheme(s) + addonv1alpha1.AddToScheme(s) + oav1beta1.AddToScheme(s) + ocinfrav1.AddToScheme(s) + hyperv1.AddToScheme(s) + promv1.AddToScheme(s) - hubClient := 
fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(hubObjs...). - WithStatusSubresource( - &addonv1alpha1.ManagedClusterAddOn{}, - &mcov1beta2.MultiClusterObservability{}, - &oav1beta1.ObservabilityAddon{}, - ). - Build() - c := fake.NewClientBuilder(). - WithScheme(scheme). - WithRuntimeObjects(objs...). - WithStatusSubresource( - &addonv1alpha1.ManagedClusterAddOn{}, - &mcov1beta2.MultiClusterObservability{}, - &oav1beta1.ObservabilityAddon{}, - ). - Build() + hubClient := fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(hubObjs...).Build() + c := fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(objs...).Build() hubClientWithReload, err := util.NewReloadableHubClientWithReloadFunc(func() (client.Client, error) { return hubClient, nil @@ -218,8 +202,13 @@ alertmanager-router-ca: | t.Fatalf("Failed to create hub client with reload: %v", err) } r := &ObservabilityAddonReconciler{ - Client: c, - HubClient: hubClientWithReload, + Client: c, + HubClient: hubClientWithReload, + Scheme: s, + IsHubMetricsCollector: false, + Namespace: testNamespace, + HubNamespace: testHubNamespace, + ServiceAccountName: "test-sa", } // test error in reconcile if missing obervabilityaddon @@ -236,7 +225,7 @@ alertmanager-router-ca: | } // test reconcile w/o prometheus-k8s svc - err = hubClient.Create(ctx, newObservabilityAddon(name, testHubNamspace)) + err = hubClient.Create(ctx, newObservabilityAddon(name, testHubNamespace)) if err != nil { t.Fatalf("failed to create hub oba to install: (%v)", err) } @@ -280,19 +269,19 @@ alertmanager-router-ca: | } cm := &corev1.ConfigMap{} err = c.Get(ctx, types.NamespacedName{Name: openshift.CaConfigmapName, - Namespace: namespace}, cm) + Namespace: testNamespace}, cm) if err != nil { t.Fatalf("Required configmap not created: (%v)", err) } deploy := &appv1.Deployment{} err = c.Get(ctx, types.NamespacedName{Name: metricsCollectorName, - Namespace: namespace}, deploy) + Namespace: testNamespace}, deploy) if err != nil { t.Fatalf("Metrics collector deployment not created: (%v)", err) } foundOba := &oav1beta1.ObservabilityAddon{} err = hubClient.Get(ctx, types.NamespacedName{Name: obAddonName, - Namespace: hubNamespace}, foundOba) + Namespace: testHubNamespace}, foundOba) if err != nil { t.Fatalf("Failed to get observabilityAddon: (%v)", err) } @@ -303,7 +292,7 @@ alertmanager-router-ca: | // test reconcile metrics collector deployment updated if cert secret updated found := &appv1.Deployment{} err = c.Get(ctx, types.NamespacedName{Name: metricsCollectorName, - Namespace: namespace}, found) + Namespace: testNamespace}, found) if err != nil { t.Fatalf("Metrics collector deployment not found: (%v)", err) } @@ -323,7 +312,7 @@ alertmanager-router-ca: | t.Fatalf("reconcile for update: (%v)", err) } err = c.Get(ctx, types.NamespacedName{Name: metricsCollectorName, - Namespace: namespace}, deploy) + Namespace: testNamespace}, deploy) if err != nil { t.Fatalf("Metrics collector deployment not found: (%v)", err) } @@ -353,7 +342,7 @@ alertmanager-router-ca: | t.Fatalf("reconcile for disable: (%v)", err) } err = c.Get(ctx, types.NamespacedName{Name: metricsCollectorName, - Namespace: namespace}, deploy) + Namespace: testNamespace}, deploy) if err != nil { t.Fatalf("Metrics collector deployment not created: (%v)", err) } @@ -382,18 +371,18 @@ alertmanager-router-ca: | t.Fatalf("Required clusterrolebinding not deleted") } err = c.Get(ctx, types.NamespacedName{Name: openshift.CaConfigmapName, - Namespace: namespace}, cm) + Namespace: testNamespace}, cm) if 
!errors.IsNotFound(err) { t.Fatalf("Required configmap not deleted") } err = c.Get(ctx, types.NamespacedName{Name: metricsCollectorName, - Namespace: namespace}, deploy) + Namespace: testNamespace}, deploy) if !errors.IsNotFound(err) { t.Fatalf("Metrics collector deployment not deleted") } foundOba1 := &oav1beta1.ObservabilityAddon{} err = hubClient.Get(ctx, types.NamespacedName{Name: obAddonName, - Namespace: hubNamespace}, foundOba1) + Namespace: testHubNamespace}, foundOba1) if err != nil { t.Fatalf("Failed to get observabilityAddon: (%v)", err) } @@ -411,13 +400,15 @@ alertmanager-router-ca: | xxxxxxxxxxxxxxxxxxxxxxxxxxx -----END CERTIFICATE----- `) + testNamespace := "test-ns" + testHubNamespace := "test-hub-ns" hubObjs := []runtime.Object{ - newObservabilityAddon(name, testHubNamspace), + newObservabilityAddon(name, testHubNamespace), } hubInfo := newHubInfoSecret(hubInfoData, testNamespace) - amAccessSrt := newAMAccessorSecret(testNamespace) - allowList := getAllowlistCM() + amAccessSrt := newAMAccessorSecret(testNamespace, "test-token") + allowList := getAllowlistCM(testNamespace) images := newImagesCM(testNamespace) objs := []runtime.Object{hubInfo, amAccessSrt, allowList, images, infra, &corev1.ConfigMap{ @@ -436,15 +427,22 @@ alertmanager-router-ca: | }, newObservabilityAddon(name, testNamespace), newPromSvc(), + &appv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "endpoint-observability-operator", + Namespace: "test-ns", + }, + }, } - scheme := scheme.Scheme + scheme := runtime.NewScheme() + kubescheme.AddToScheme(scheme) addonv1alpha1.AddToScheme(scheme) mcov1beta2.AddToScheme(scheme) oav1beta1.AddToScheme(scheme) - corev1.AddToScheme(scheme) clusterv1.AddToScheme(scheme) ocinfrav1.AddToScheme(scheme) + promv1.AddToScheme(scheme) hubClient := fake.NewClientBuilder(). WithScheme(scheme). 
@@ -483,14 +481,19 @@ alertmanager-router-ca: | t.Fatalf("Failed to create hub client with reload: %v", err) } r := &ObservabilityAddonReconciler{ - Client: c, - HubClient: hubClientWithReload, + Client: c, + HubClient: hubClientWithReload, + Scheme: scheme, + IsHubMetricsCollector: false, + Namespace: testNamespace, + HubNamespace: testHubNamespace, + ServiceAccountName: "test-sa", } checkMetricsCollector := func() { deploy := &appv1.Deployment{} err = c.Get(context.Background(), types.NamespacedName{Name: metricsCollectorName, - Namespace: namespace}, deploy) + Namespace: testNamespace}, deploy) if err != nil { t.Fatalf("Metrics collector deployment not created: (%v)", err) } @@ -530,3 +533,43 @@ alertmanager-router-ca: | } checkMetricsCollector() } + +func getAllowlistCM(ns string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: operatorconfig.AllowlistConfigMapName, + Namespace: ns, + }, + Data: map[string]string{ + operatorconfig.MetricsConfigMapKey: ` +names: + - a + - b +matches: + - __name__="c" +recording_rules: + - record: f + expr: g +collect_rules: + - name: h + selector: + matchExpressions: + - key: clusterType + operator: NotIn + values: ["SNO"] + rules: + - collect: j + expr: k + for: 1m + names: + - c + matches: + - __name__="a" +`, + operatorconfig.UwlMetricsConfigMapKey: ` +names: + - uwl_a + - uwl_b +`}, + } +} diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config.go b/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config.go index 88b058ca0..88fec2719 100644 --- a/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config.go +++ b/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config.go @@ -41,12 +41,12 @@ var ( // initializes clusterMonitoringConfigReverted based on the presence of clusterMonitoringRevertedName // configmap in openshift-monitoring namespace. 
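// Aside (not part of the patch): metrics_list.yaml in the allowlist configmap
// above is what the collector decodes into operatorconfig.MetricsAllowlist. A
// rough sketch of that decoding, assuming yaml tags matching the configmap
// keys and the NameList/MatchList fields referenced in pkg/collector below;
// the real type also carries recording_rules and collect_rules:

package config

import "gopkg.in/yaml.v2"

// metricsAllowlistSketch approximates the shape of the allowlist document.
type metricsAllowlistSketch struct {
	NameList  []string `yaml:"names"`
	MatchList []string `yaml:"matches"`
}

func parseAllowlist(data string) (*metricsAllowlistSketch, error) {
	list := &metricsAllowlistSketch{}
	if err := yaml.Unmarshal([]byte(data), list); err != nil {
		return nil, err
	}
	return list, nil
}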
-func initPersistedRevertState(ctx context.Context, client client.Client) error { +func initPersistedRevertState(ctx context.Context, client client.Client, ns string) error { if !persistedRevertStateRead { // check if reverted configmap is present found := &corev1.ConfigMap{} err := client.Get(ctx, types.NamespacedName{Name: clusterMonitoringRevertedName, - Namespace: namespace}, found) + Namespace: ns}, found) if err != nil { // treat this as non-fatal error if errors.IsNotFound(err) { @@ -68,9 +68,9 @@ func initPersistedRevertState(ctx context.Context, client client.Client) error { return nil } -func isRevertedAlready(ctx context.Context, client client.Client) (bool, error) { +func isRevertedAlready(ctx context.Context, client client.Client, ns string) (bool, error) { log.Info("in isRevertedAlready") - err := initPersistedRevertState(ctx, client) + err := initPersistedRevertState(ctx, client, ns) if err != nil { log.Info("isRevertedAlready: error from initPersistedRevertState", "error:", err.Error()) return clusterMonitoringConfigReverted, err @@ -80,8 +80,8 @@ func isRevertedAlready(ctx context.Context, client client.Client) (bool, error) } } -func setConfigReverted(ctx context.Context, client client.Client) error { - err := initPersistedRevertState(ctx, client) +func setConfigReverted(ctx context.Context, client client.Client, ns string) error { + err := initPersistedRevertState(ctx, client, ns) if err != nil { return err } @@ -90,7 +90,7 @@ func setConfigReverted(ctx context.Context, client client.Client) error { c := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: clusterMonitoringRevertedName, - Namespace: namespace, + Namespace: ns, }, } err = client.Create(ctx, c) @@ -107,8 +107,8 @@ func setConfigReverted(ctx context.Context, client client.Client) error { return nil } -func unsetConfigReverted(ctx context.Context, client client.Client) error { - err := initPersistedRevertState(ctx, client) +func unsetConfigReverted(ctx context.Context, client client.Client, ns string) error { + err := initPersistedRevertState(ctx, client, ns) if err != nil { return err } @@ -116,7 +116,7 @@ func unsetConfigReverted(ctx context.Context, client client.Client) error { // delete any persistent state if present c := &corev1.ConfigMap{} err = client.Get(ctx, types.NamespacedName{Name: clusterMonitoringRevertedName, - Namespace: namespace}, c) + Namespace: ns}, c) if err != nil { if errors.IsNotFound(err) { log.Info("persistent state already set. 
cluster-monitoring-reverted configmap does not exist")
@@ -160,39 +160,36 @@ func createHubAmRouterCASecret(
 		Namespace: targetNamespace}, found)
 	if err != nil {
 		if errors.IsNotFound(err) {
+			log.Info(fmt.Sprintf("creating %s/%s secret", targetNamespace, hubAmRouterCASecretName))
 			err = client.Create(ctx, hubAmRouterCASecret)
 			if err != nil {
-				log.Error(err, "failed to create the hub-alertmanager-router-ca secret")
-				return err
+				return fmt.Errorf("failed to create %s/%s secret: %w", targetNamespace, hubAmRouterCASecretName, err)
 			}
-			log.Info("the hub-alertmanager-router-ca secret is created")
 			return nil
 		} else {
-			log.Error(err, "failed to check the hub-alertmanager-router-ca secret")
-			return err
+			return fmt.Errorf("failed to check the %s/%s secret: %w", targetNamespace, hubAmRouterCASecretName, err)
 		}
 	}

-	log.Info("the hub-alertmanager-router-ca secret already exists, check if it needs to be updated")
 	if reflect.DeepEqual(found.Data, dataMap) {
-		log.Info("no change for the hub-alertmanager-router-ca secret")
 		return nil
-	} else {
-		err = client.Update(ctx, hubAmRouterCASecret)
-		if err != nil {
-			log.Error(err, "failed to update the hub-alertmanager-router-ca secret")
-			return nil
-		}
-		log.Info("the hub-alertmanager-router-ca secret is updated")
-		return err
 	}
+
+	log.Info(fmt.Sprintf("updating %s/%s secret", targetNamespace, hubAmRouterCASecretName))
+	err = client.Update(ctx, hubAmRouterCASecret)
+	if err != nil {
+		return fmt.Errorf("failed to update the %s/%s secret: %w", targetNamespace, hubAmRouterCASecretName, err)
+	}
+
+	return nil
 }

 // createHubAmAccessorTokenSecret creates the secret that contains access token of the Hub's Alertmanager.
-func createHubAmAccessorTokenSecret(ctx context.Context, client client.Client, targetNamespace string) error {
-	amAccessorToken, err := getAmAccessorToken(ctx, client)
+func createHubAmAccessorTokenSecret(ctx context.Context, client client.Client, namespace, targetNamespace string) error {
+	amAccessorToken, err := getAmAccessorToken(ctx, client, namespace)
 	if err != nil {
-		return fmt.Errorf("fail to get the alertmanager accessor token: %w", err)
+		return fmt.Errorf("failed to get %s/%s secret: %w", namespace, hubAmAccessorSecretName, err)
 	}

 	dataMap := map[string][]byte{hubAmAccessorSecretKey: []byte(amAccessorToken)}
@@ -238,10 +235,10 @@ func createHubAmAccessorTokenSecret(ctx context.Context, client client.Client, t
 }

 // getAmAccessorToken retrieves the alertmanager access token from observability-alertmanager-accessor secret.
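// Aside (not part of the patch): createHubAmRouterCASecret above follows the
// usual get / create-if-missing / compare / update flow. The same pattern,
// generalized; ensureSecret is a hypothetical name used only for illustration:

package observabilityendpoint

import (
	"context"
	"fmt"
	"reflect"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ensureSecret creates the secret if absent, leaves it alone if the data
// already matches, and updates it otherwise.
func ensureSecret(ctx context.Context, c client.Client, desired *corev1.Secret) error {
	found := &corev1.Secret{}
	err := c.Get(ctx, types.NamespacedName{Name: desired.Name, Namespace: desired.Namespace}, found)
	if errors.IsNotFound(err) {
		return c.Create(ctx, desired)
	}
	if err != nil {
		return fmt.Errorf("failed to check secret %s/%s: %w", desired.Namespace, desired.Name, err)
	}
	if reflect.DeepEqual(found.Data, desired.Data) {
		return nil
	}
	return c.Update(ctx, desired)
}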
-func getAmAccessorToken(ctx context.Context, client client.Client) (string, error) { +func getAmAccessorToken(ctx context.Context, client client.Client, ns string) (string, error) { amAccessorSecret := &corev1.Secret{} if err := client.Get(ctx, types.NamespacedName{Name: hubAmAccessorSecretName, - Namespace: namespace}, amAccessorSecret); err != nil { + Namespace: ns}, amAccessorSecret); err != nil { return "", err } @@ -294,6 +291,7 @@ func createOrUpdateClusterMonitoringConfig( clusterID string, client client.Client, installProm bool, + namespace string, ) error { targetNamespace := promNamespace if installProm { @@ -305,20 +303,19 @@ func createOrUpdateClusterMonitoringConfig( // create the hub-alertmanager-router-ca secret if it doesn't exist or update it if needed if err := createHubAmRouterCASecret(ctx, hubInfo, client, targetNamespace); err != nil { log.Error(err, "failed to create or update the hub-alertmanager-router-ca secret") - return err + return fmt.Errorf("failed to create or update the hub-alertmanager-router-ca secret: %w", err) } // create the observability-alertmanager-accessor secret if it doesn't exist or update it if needed - if err := createHubAmAccessorTokenSecret(ctx, client, targetNamespace); err != nil { - log.Error(err, "failed to create or update the observability-alertmanager-accessor secret") - return err + if err := createHubAmAccessorTokenSecret(ctx, client, namespace, targetNamespace); err != nil { + return fmt.Errorf("failed to create or update the alertmanager accessor token secret: %w", err) } // create or update the cluster-monitoring-config configmap and relevant resources if hubInfo.AlertmanagerEndpoint == "" { log.Info("request to disable alert forwarding") // only revert (once) if not done already and remember state - revertedAlready, err := isRevertedAlready(ctx, client) + revertedAlready, err := isRevertedAlready(ctx, client, namespace) if err != nil { return err } @@ -326,7 +323,7 @@ func createOrUpdateClusterMonitoringConfig( if err = revertClusterMonitoringConfig(ctx, client); err != nil { return err } - if err = setConfigReverted(ctx, client); err != nil { + if err = setConfigReverted(ctx, client, namespace); err != nil { return err } } else { @@ -337,7 +334,7 @@ func createOrUpdateClusterMonitoringConfig( if installProm { // no need to create configmap cluster-monitoring-config for *KS - return unset(ctx, client) + return unset(ctx, client, namespace) } // init the prometheus k8s config @@ -388,7 +385,7 @@ func createOrUpdateClusterMonitoringConfig( return err } log.Info("configmap created", "name", clusterMonitoringConfigName) - return unset(ctx, client) + return unset(ctx, client, namespace) } else { log.Error(err, "failed to check configmap", "name", clusterMonitoringConfigName) return err @@ -413,7 +410,7 @@ func createOrUpdateClusterMonitoringConfig( return err } log.Info("configmap updated", "name", clusterMonitoringConfigName) - return unset(ctx, client) + return unset(ctx, client, namespace) } log.Info("configmap already exists and key config.yaml exists, check if the value needs update", @@ -484,15 +481,15 @@ func createOrUpdateClusterMonitoringConfig( return err } log.Info("configmap updated", "name", clusterMonitoringConfigName) - return unset(ctx, client) + return unset(ctx, client, namespace) } // unset config reverted flag after successfully updating cluster-monitoring-config -func unset(ctx context.Context, client client.Client) error { +func unset(ctx context.Context, client client.Client, ns string) error { // if reverted 
before, reset so we can revert again - revertedAlready, err := isRevertedAlready(ctx, client) + revertedAlready, err := isRevertedAlready(ctx, client, ns) if err == nil && revertedAlready { - err = unsetConfigReverted(ctx, client) + err = unsetConfigReverted(ctx, client, ns) } return err } diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config_test.go b/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config_test.go index c16b66368..ec9cc8154 100644 --- a/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config_test.go +++ b/operators/endpointmetrics/controllers/observabilityendpoint/ocp_monitoring_config_test.go @@ -67,6 +67,7 @@ prometheusK8s: ) func TestClusterMonitoringConfig(t *testing.T) { + testNamespace := "test-ns" ctrl.SetLogger(zap.New(zap.UseFlagOptions(&zap.Options{Development: true}))) tests := []struct { name string @@ -137,7 +138,8 @@ prometheusK8s: t.Fatalf("Failed to unmarshal hubInfo: (%v)", err) } hubInfoObj := newHubInfoSecret([]byte(hubInfoYAML), testNamespace) - amAccessSrt := newAMAccessorSecret(testNamespace) + tokenValue := "test-token" + amAccessSrt := newAMAccessorSecret(testNamespace, tokenValue) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -145,12 +147,13 @@ prometheusK8s: if tt.ClusterMonitoringConfigCMExist { objs = append(objs, newClusterMonitoringConfigCM(tt.ClusterMonitoringConfigDataYaml, tt.Manager)) } - testCreateOrUpdateClusterMonitoringConfig(t, hubInfo, fake.NewClientBuilder().WithRuntimeObjects(objs...).Build(), tt.ExpectedDeleteClusterMonitoringConfigCM) + testCreateOrUpdateClusterMonitoringConfig(t, hubInfo, fake.NewClientBuilder().WithRuntimeObjects(objs...).Build(), tt.ExpectedDeleteClusterMonitoringConfigCM, tokenValue, testNamespace) }) } } func TestClusterMonitoringConfigAlertsDisabled(t *testing.T) { + testNamespace := "test-ns" ctrl.SetLogger(zap.New(zap.UseFlagOptions(&zap.Options{Development: true}))) ctx := context.TODO() @@ -160,7 +163,7 @@ func TestClusterMonitoringConfigAlertsDisabled(t *testing.T) { t.Fatalf("Failed to unmarshal hubInfo: (%v)", err) } hubInfoObj := newHubInfoSecret([]byte(hubInfoYAMLAlertsDisabled), testNamespace) - amAccessSrt := newAMAccessorSecret(testNamespace) + amAccessSrt := newAMAccessorSecret(testNamespace, "test-token") // Scenario 1: // create cluster-monitoring-config configmap with "manager: endpoint-monitoring-operator" @@ -298,14 +301,14 @@ func TestClusterMonitoringConfigAlertsDisabled(t *testing.T) { t.Fatalf("could not recreate hubInfoObject to enable alerts again") } t.Run("Reenable alert forwarding", func(t *testing.T) { - err = createOrUpdateClusterMonitoringConfig(ctx, hubInfo, testClusterID, c, false) + err = createOrUpdateClusterMonitoringConfig(ctx, hubInfo, testClusterID, c, false, testNamespace) if err != nil { t.Fatalf("Failed to create or update the cluster-monitoring-config configmap: (%v)", err) } foundclusterMonitoringRevertedCM := &corev1.ConfigMap{} err = c.Get(ctx, types.NamespacedName{Name: clusterMonitoringRevertedName, - Namespace: namespace}, foundclusterMonitoringRevertedCM) + Namespace: testNamespace}, foundclusterMonitoringRevertedCM) if err == nil { t.Fatalf("configmap %s still present after reenabling alerts", clusterMonitoringRevertedName) } @@ -346,9 +349,9 @@ func TestClusterMonitoringConfigAlertsDisabled(t *testing.T) { }) } -func testCreateOrUpdateClusterMonitoringConfig(t *testing.T, hubInfo *operatorconfig.HubInfo, c client.Client, expectedCMDelete bool) { 
+func testCreateOrUpdateClusterMonitoringConfig(t *testing.T, hubInfo *operatorconfig.HubInfo, c client.Client, expectedCMDelete bool, tokenValue, ns string) { ctx := context.TODO() - err := createOrUpdateClusterMonitoringConfig(ctx, hubInfo, testClusterID, c, false) + err := createOrUpdateClusterMonitoringConfig(ctx, hubInfo, testClusterID, c, false, ns) if err != nil { t.Fatalf("Failed to create or update the cluster-monitoring-config configmap: (%v)", err) } @@ -402,8 +405,8 @@ func testCreateOrUpdateClusterMonitoringConfig(t *testing.T, hubInfo *operatorco if !ok { t.Fatalf("no key %s found in the observability-alertmanager-accessor secret", hubAmAccessorSecretKey) } - if string(foundAmAccessorToken) != testBearerToken { - t.Fatalf("incorrect token found in the observability-alertmanager-accessor secret, got token: %s, expected value %s", foundAmAccessorToken, testBearerToken) + if string(foundAmAccessorToken) != tokenValue { + t.Fatalf("incorrect token found in the observability-alertmanager-accessor secret, got token: %s, expected value %s", foundAmAccessorToken, tokenValue) } } } diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/predicate_func_test.go b/operators/endpointmetrics/controllers/observabilityendpoint/predicate_func_test.go index c142829fe..833a2d9ba 100644 --- a/operators/endpointmetrics/controllers/observabilityendpoint/predicate_func_test.go +++ b/operators/endpointmetrics/controllers/observabilityendpoint/predicate_func_test.go @@ -15,6 +15,7 @@ import ( func TestPredFunc(t *testing.T) { name := "test-obj" + testNamespace := "test-ns" caseList := []struct { caseName string namespace string @@ -59,6 +60,7 @@ func TestPredFunc(t *testing.T) { for _, c := range caseList { t.Run(c.caseName, func(t *testing.T) { + replicas := int32(2) pred := getPred(name, c.namespace, c.create, c.update, c.delete) ce := event.CreateEvent{ Object: &appsv1.Deployment{ @@ -67,7 +69,7 @@ func TestPredFunc(t *testing.T) { Namespace: c.namespace, }, Spec: appsv1.DeploymentSpec{ - Replicas: int32Ptr(2), + Replicas: &replicas, }, }, } @@ -141,7 +143,7 @@ func TestPredFunc(t *testing.T) { Namespace: c.namespace, }, Spec: appsv1.DeploymentSpec{ - Replicas: int32Ptr(2), + Replicas: &replicas, }, }, } diff --git a/operators/endpointmetrics/controllers/status/status_controller.go b/operators/endpointmetrics/controllers/status/status_controller.go index dffdd83d5..77716622f 100644 --- a/operators/endpointmetrics/controllers/status/status_controller.go +++ b/operators/endpointmetrics/controllers/status/status_controller.go @@ -56,7 +56,7 @@ func (r *StatusReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{Requeue: true}, nil } - if isTransientErr(err) { + if util.IsTransientClientErr(err) { r.Logger.Info("Failed to get ObservabilityAddon in hub cluster, requeue with delay", "error", err) return requeueWithOptionalDelay(err), nil } @@ -85,7 +85,7 @@ func (r *StatusReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return r.HubClient.Status().Update(ctx, updatedAddon) }) if retryErr != nil { - if isTransientErr(retryErr) || errors.IsConflict(retryErr) { + if util.IsTransientClientErr(retryErr) || errors.IsConflict(retryErr) { r.Logger.Info("Retryable error while updating status, request will be retried.", "error", retryErr) return requeueWithOptionalDelay(retryErr), nil } @@ -120,23 +120,6 @@ func (r *StatusReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } -// isTransientErr checks if the error is a transient error 
-// This suggests that a retry (without any change) might be successful -func isTransientErr(err error) bool { - if _, ok := err.(net.Error); ok { - return true - } - - if statusErr, ok := err.(*errors.StatusError); ok { - code := statusErr.Status().Code - if code >= 500 && code < 600 && code != 501 { - return true - } - } - - return errors.IsTimeout(err) || errors.IsServerTimeout(err) || errors.IsTooManyRequests(err) -} - // isAuthOrConnectionErr checks if the error is an authentication error or a connection error // This suggests an issue with the client configuration and a reload might be needed func isAuthOrConnectionErr(err error) bool { diff --git a/operators/endpointmetrics/main.go b/operators/endpointmetrics/main.go index 167374810..fa6fe82d0 100644 --- a/operators/endpointmetrics/main.go +++ b/operators/endpointmetrics/main.go @@ -9,6 +9,7 @@ import ( "fmt" "os" "runtime" + "strconv" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. @@ -130,19 +131,33 @@ func main() { os.Exit(1) } + namespace := os.Getenv("NAMESPACE") + if namespace == "" { + namespace = os.Getenv("WATCH_NAMESPACE") + } + + var installPrometheus bool + if envVal := os.Getenv(operatorconfig.InstallPrometheus); envVal != "" { + installPrometheus, err = strconv.ParseBool(envVal) + if err != nil { + setupLog.Error(err, "Failed to parse the value of the environment variable", "variable", operatorconfig.InstallPrometheus) + } + } + if err = (&obsepctl.ObservabilityAddonReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - HubClient: hubClientWithReload, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + HubClient: hubClientWithReload, + HubNamespace: os.Getenv("HUB_NAMESPACE"), + Namespace: namespace, + ServiceAccountName: os.Getenv("SERVICE_ACCOUNT"), + IsHubMetricsCollector: os.Getenv("HUB_ENDPOINT_OPERATOR") == "true", + InstallPrometheus: installPrometheus, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "ObservabilityAddon") os.Exit(1) } - namespace := os.Getenv("NAMESPACE") - if namespace == "" { - namespace = os.Getenv("WATCH_NAMESPACE") - } if err = (&statusctl.StatusReconciler{ Client: mgr.GetClient(), HubClient: hubClientWithReload, diff --git a/operators/endpointmetrics/pkg/collector/match_evaluator.go b/operators/endpointmetrics/pkg/collector/match_evaluator.go new file mode 100644 index 000000000..f78fcc42b --- /dev/null +++ b/operators/endpointmetrics/pkg/collector/match_evaluator.go @@ -0,0 +1,26 @@ +// Copyright (c) Red Hat, Inc. 
+// Copyright Contributors to the Open Cluster Management project +// Licensed under the Apache License 2.0 + +package collector + +import ( + "slices" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func evaluateMatchExpression(expr metav1.LabelSelectorRequirement, key, value string) bool { + if expr.Key != key { + return false + } + + switch expr.Operator { + case metav1.LabelSelectorOpIn: + return slices.Contains(expr.Values, value) + case metav1.LabelSelectorOpNotIn: + return !slices.Contains(expr.Values, value) + default: + return false + } +} diff --git a/operators/endpointmetrics/controllers/observabilityendpoint/match_evaluator_test.go b/operators/endpointmetrics/pkg/collector/match_evaluator_test.go similarity index 79% rename from operators/endpointmetrics/controllers/observabilityendpoint/match_evaluator_test.go rename to operators/endpointmetrics/pkg/collector/match_evaluator_test.go index 749359670..981cf30b5 100644 --- a/operators/endpointmetrics/controllers/observabilityendpoint/match_evaluator_test.go +++ b/operators/endpointmetrics/pkg/collector/match_evaluator_test.go @@ -2,7 +2,7 @@ // Copyright Contributors to the Open Cluster Management project // Licensed under the Apache License 2.0 -package observabilityendpoint +package collector import ( "testing" @@ -22,7 +22,7 @@ func TestEvluateMatchExpression(t *testing.T) { expr: metav1.LabelSelectorRequirement{ Key: "test_key", Operator: "In", - Values: []string{snoClusterType}, + Values: []string{"test_value"}, }, expectedResult: false, }, @@ -31,7 +31,7 @@ func TestEvluateMatchExpression(t *testing.T) { expr: metav1.LabelSelectorRequirement{ Key: "clusterType", Operator: "test_op", - Values: []string{snoClusterType}, + Values: []string{"test_value"}, }, expectedResult: false, }, @@ -40,9 +40,9 @@ func TestEvluateMatchExpression(t *testing.T) { expr: metav1.LabelSelectorRequirement{ Key: "clusterType", Operator: "NotIn", - Values: []string{snoClusterType}, + Values: []string{"SNO"}, }, - clusterType: snoClusterType, + clusterType: "SNO", expectedResult: false, }, { @@ -50,7 +50,7 @@ func TestEvluateMatchExpression(t *testing.T) { expr: metav1.LabelSelectorRequirement{ Key: "clusterType", Operator: "In", - Values: []string{snoClusterType}, + Values: []string{"SNO"}, }, clusterType: "", expectedResult: false, @@ -60,7 +60,7 @@ func TestEvluateMatchExpression(t *testing.T) { expr: metav1.LabelSelectorRequirement{ Key: "clusterType", Operator: "NotIn", - Values: []string{snoClusterType}, + Values: []string{"SNO"}, }, clusterType: "", expectedResult: true, @@ -70,17 +70,17 @@ func TestEvluateMatchExpression(t *testing.T) { expr: metav1.LabelSelectorRequirement{ Key: "clusterType", Operator: "In", - Values: []string{snoClusterType}, + Values: []string{"SNO"}, }, - clusterType: snoClusterType, + clusterType: "SNO", expectedResult: true, }, } for _, c := range caseList { t.Run(c.name, func(t *testing.T) { - params := append([]interface{}{"id"}, c.clusterType) - r := evluateMatchExpression(c.expr, params...) 
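// Aside (not part of the patch): the new evaluateMatchExpression signature
// makes the key and value explicit instead of threading variadic params.
// Usage as exercised by the tests below, with "SNO" taken from the test
// values:

package collector

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func exampleEvaluateMatchExpression() {
	expr := metav1.LabelSelectorRequirement{
		Key:      "clusterType",
		Operator: metav1.LabelSelectorOpNotIn,
		Values:   []string{"SNO"},
	}
	fmt.Println(evaluateMatchExpression(expr, "clusterType", ""))    // true: not an SNO cluster
	fmt.Println(evaluateMatchExpression(expr, "clusterType", "SNO")) // false: SNO is excluded
}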
+ // params := append([]interface{}{"id"}, c.clusterType) + r := evaluateMatchExpression(c.expr, "clusterType", c.clusterType) if r != c.expectedResult { t.Fatalf("Wrong result for test %s, expected %v, got %v", c.name, c.expectedResult, r) } diff --git a/operators/endpointmetrics/pkg/collector/metrics_collector.go b/operators/endpointmetrics/pkg/collector/metrics_collector.go new file mode 100644 index 000000000..2a1f911dc --- /dev/null +++ b/operators/endpointmetrics/pkg/collector/metrics_collector.go @@ -0,0 +1,1032 @@ +// Copyright (c) Red Hat, Inc. +// Copyright Contributors to the Open Cluster Management project +// Licensed under the Apache License 2.0 + +package collector + +import ( + "context" + "fmt" + "regexp" + "sort" + "strconv" + "strings" + "time" + + "github.com/go-logr/logr" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "gopkg.in/yaml.v2" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/openshift" + "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/rendering" + "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/status" + oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1" + operatorconfig "github.com/stolostron/multicluster-observability-operator/operators/pkg/config" + "github.com/stolostron/multicluster-observability-operator/operators/pkg/util" +) + +const ( + metricsCollectorName = "metrics-collector-deployment" + uwlMetricsCollectorName = "uwl-metrics-collector-deployment" + metricsCollector = "metrics-collector" + uwlMetricsCollector = "uwl-metrics-collector" + selectorKey = "component" + selectorValue = metricsCollector + caMounthPath = "/etc/serving-certs-ca-bundle" + caVolName = "serving-certs-ca-bundle" + mtlsCertName = "observability-controller-open-cluster-management.io-observability-signer-client-cert" + mtlsCaName = "observability-managed-cluster-certs" + mtlsServerCaName = "observability-server-ca-certs" + limitBytes = 1073741824 + defaultInterval = "30s" + uwlNamespace = "openshift-user-workload-monitoring" + uwlSts = "prometheus-user-workload" +) + +const ( + restartLabel = "cert/time-restarted" + ownerLabelKey = "owner" + ownerLabelValue = "observabilityaddon" +) + +var ( + ocpPromURL = "https://prometheus-k8s.openshift-monitoring.svc:9091" + uwlPromURL = "https://prometheus-user-workload.openshift-user-workload-monitoring.svc:9092" + uwlQueryURL = "https://thanos-querier.openshift-monitoring.svc:9091" + promURL = "https://prometheus-k8s:9091" +) + +type ClusterInfo struct { + ClusterID string + ClusterType string + InstallPrometheus bool + IsHubMetricsCollector bool +} + +type MetricsCollector struct { + Client client.Client + ClusterInfo ClusterInfo + HubInfo *operatorconfig.HubInfo + Log logr.Logger + Namespace string + ObsAddon *oav1beta1.ObservabilityAddon + ServiceAccountName string +} + +type proxyConfig struct { + caBundle string + httpProxy string + httpsProxy string + noProxy string +} + +type deploymentParams struct { + allowlist 
*operatorconfig.MetricsAllowlist + forceRestart bool + nodeSelector map[string]string + proxyConfig proxyConfig + tolerations []corev1.Toleration + uwlList *operatorconfig.MetricsAllowlist +} + +// Update updates the metrics collector resources and the addon status when needed. +func (m *MetricsCollector) Update(ctx context.Context, req ctrl.Request) error { + deployParams, err := m.generateDeployParams(ctx, req) + if err != nil { + m.reportStatus(ctx, status.Degraded) + return err + } + + var mcResult, uwlResult ensureDeploymentResult + if mcResult, err = m.updateMetricsCollector(ctx, false, deployParams); err != nil { + m.reportStatus(ctx, status.Degraded) + return err + } + + isUwl, err := m.isUWLMonitoringEnabled(ctx) + if err != nil { + m.reportStatus(ctx, status.Degraded) + return err + } + + uwlMetricsLen := len(deployParams.uwlList.NameList) + len(deployParams.uwlList.MatchList) + if isUwl && uwlMetricsLen != 0 { + if uwlResult, err = m.updateMetricsCollector(ctx, true, deployParams); err != nil { + m.reportStatus(ctx, status.Degraded) + return err + } + } else { + if err := m.deleteMetricsCollector(ctx, true); err != nil { + m.reportStatus(ctx, status.Degraded) + return err + } + } + + if mcResult == deploymentCreated || uwlResult == deploymentCreated { + m.reportStatus(ctx, status.Deployed) + } else if mcResult == deploymentUpdated && !m.ObsAddon.Spec.EnableMetrics { + m.reportStatus(ctx, status.Disabled) + } + + return nil +} + +func (m *MetricsCollector) Delete(ctx context.Context) error { + if err := m.deleteMetricsCollector(ctx, false); err != nil { + return err + } + + if err := m.deleteMetricsCollector(ctx, true); err != nil { + return err + } + + return nil +} + +func (m *MetricsCollector) reportStatus(ctx context.Context, conditionReason status.ConditionReason) { + if m.ClusterInfo.IsHubMetricsCollector { + return + } + m.Log.Info("Reporting status", "conditionReason", conditionReason) + if err := status.ReportStatus(ctx, m.Client, conditionReason, m.ObsAddon.Name, m.Namespace); err != nil { + m.Log.Error(err, "Failed to report status") + } +} + +func (m *MetricsCollector) generateDeployParams(ctx context.Context, req ctrl.Request) (*deploymentParams, error) { + list, uwlList, err := m.getMetricsAllowlist(ctx) + if err != nil { + return nil, err + } + + endpointDeployment, err := m.getEndpointDeployment(ctx) + if err != nil { + return nil, err + } + + deployParams := &deploymentParams{ + allowlist: list, + forceRestart: req.Name == mtlsCertName || req.Name == mtlsCaName || req.Name == openshift.CaConfigmapName, + nodeSelector: endpointDeployment.Spec.Template.Spec.NodeSelector, + tolerations: endpointDeployment.Spec.Template.Spec.Tolerations, + uwlList: uwlList, + } + + // stash away proxy settings from endpoint deployment + for _, container := range endpointDeployment.Spec.Template.Spec.Containers { + if container.Name == "endpoint-observability-operator" { + for _, env := range container.Env { + switch env.Name { + case "HTTP_PROXY": + deployParams.proxyConfig.httpProxy = env.Value + case "HTTPS_PROXY": + deployParams.proxyConfig.httpsProxy = env.Value + case "NO_PROXY": + deployParams.proxyConfig.noProxy = env.Value + case "HTTPS_PROXY_CA_BUNDLE": + deployParams.proxyConfig.caBundle = env.Value + } + } + } + } + + return deployParams, nil +} + +func (m *MetricsCollector) deleteMetricsCollector(ctx context.Context, isUWL bool) error { + deployName := metricsCollectorName + name := metricsCollector + if isUWL { + deployName = uwlMetricsCollectorName + name = 
uwlMetricsCollector + } + + objects := []client.Object{ + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: deployName, + Namespace: m.Namespace, + }, + }, + &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: m.Namespace, + }, + }, + &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "acm-" + name + "-alerting-rules", + Namespace: m.Namespace, + }, + }, + &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: m.Namespace, + }, + }, + } + + for _, obj := range objects { + if err := m.deleteResourceIfExists(ctx, obj); err != nil { + return err + } + } + + return nil +} + +func (m *MetricsCollector) updateMetricsCollector(ctx context.Context, isUWL bool, deployParams *deploymentParams) (ensureDeploymentResult, error) { + if err := m.ensureService(ctx, isUWL); err != nil { + return "", err + } + + if err := m.ensureServiceMonitor(ctx, isUWL); err != nil { + return "", err + } + + if err := m.ensureAlertingRule(ctx, isUWL); err != nil { + return "", err + } + + res, err := m.ensureDeployment(ctx, isUWL, deployParams) + if err != nil { + return "", err + } + + return res, nil +} + +func (m *MetricsCollector) ensureService(ctx context.Context, isUWL bool) error { + name := metricsCollector + if isUWL { + name = uwlMetricsCollector + } + + desiredService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: m.Namespace, + Labels: map[string]string{ + selectorKey: name, + }, + Annotations: map[string]string{ + ownerLabelKey: ownerLabelValue, + "service.beta.openshift.io/serving-cert-secret-name": name + "-kube-rbac-tls", + }, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + selectorKey: name, + }, + Ports: []corev1.ServicePort{ + { + Name: "metrics", + Port: 8080, + TargetPort: intstr.FromString("metrics"), + Protocol: corev1.ProtocolTCP, + }, + }, + Type: corev1.ServiceTypeClusterIP, + }, + } + + retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + foundService := &corev1.Service{} + err := m.Client.Get(ctx, types.NamespacedName{Name: name, Namespace: m.Namespace}, foundService) + if err != nil && errors.IsNotFound(err) { + m.Log.Info("Creating Service", "name", name, "namespace", m.Namespace) + if err := m.Client.Create(ctx, desiredService); err != nil { + return fmt.Errorf("failed to create service %s/%s: %w", m.Namespace, name, err) + } + + return nil + } + if err != nil { + return fmt.Errorf("failed to get service %s/%s: %w", m.Namespace, name, err) + } + + if !equality.Semantic.DeepDerivative(desiredService.Spec, foundService.Spec) { + m.Log.Info("Updating Service", "name", name, "namespace", m.Namespace) + + foundService.Spec = desiredService.Spec + if err := m.Client.Update(ctx, foundService); err != nil { + return fmt.Errorf("failed to update service %s/%s: %w", m.Namespace, name, err) + } + } + + return nil + }) + + if retryErr != nil { + return retryErr + } + + return nil +} + +// createServiceMonitor creates a ServiceMonitor for the metrics collector. 
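// Aside (not part of the patch): the ensure* helpers in this file compare
// desired and found specs with equality.Semantic.DeepDerivative rather than
// DeepEqual. DeepDerivative ignores fields left unset in the desired object,
// so values filled in by API-server defaulting (for example the ClusterIP
// assigned to the Service above) do not trigger endless update loops:

package collector

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/equality"
)

// needsUpdate is true only when a field explicitly set in desired differs
// from what is found on the cluster.
func needsUpdate(desired, found *corev1.Service) bool {
	return !equality.Semantic.DeepDerivative(desired.Spec, found.Spec)
}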
+func (m *MetricsCollector) ensureServiceMonitor(ctx context.Context, isUWL bool) error { + name := metricsCollector + replace := "acm_metrics_collector_${1}" + if isUWL { + name = uwlMetricsCollector + replace = "acm_uwl_metrics_collector_${1}" + } + + desiredSm := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: m.Namespace, + Labels: map[string]string{ + selectorKey: name, + }, + Annotations: map[string]string{ + ownerLabelKey: ownerLabelValue, + }, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + selectorKey: name, + }, + }, + NamespaceSelector: monitoringv1.NamespaceSelector{ + MatchNames: []string{m.Namespace}, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Port: "metrics", + Path: "/metrics", + Scheme: "http", + MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ + { + Action: "replace", + Regex: "(.+)", + Replacement: replace, + SourceLabels: []string{"__name__"}, + TargetLabel: "__name__", + }, + }, + }, + }, + }, + } + + retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + foundSm := &monitoringv1.ServiceMonitor{} + err := m.Client.Get(ctx, types.NamespacedName{Name: name, Namespace: m.Namespace}, foundSm) + if err != nil && errors.IsNotFound(err) { + m.Log.Info("Creating ServiceMonitor", "name", name, "namespace", m.Namespace) + if err := m.Client.Create(ctx, desiredSm); err != nil { + return fmt.Errorf("failed to create ServiceMonitor %s/%s: %w", m.Namespace, name, err) + } + + return nil + } + if err != nil { + return fmt.Errorf("failed to get ServiceMonitor %s/%s: %w", m.Namespace, name, err) + } + + if !equality.Semantic.DeepDerivative(desiredSm.Spec, foundSm.Spec) { + m.Log.Info("Updating ServiceMonitor", "name", name, "namespace", m.Namespace) + + foundSm.Spec = desiredSm.Spec + if err := m.Client.Update(ctx, foundSm); err != nil { + return fmt.Errorf("failed to update ServiceMonitor %s/%s: %w", m.Namespace, name, err) + } + } + + return nil + }) + + if retryErr != nil { + return retryErr + } + + return nil +} + +func (m *MetricsCollector) ensureAlertingRule(ctx context.Context, isUWL bool) error { + baseName := metricsCollector + alert := "MetricsCollector" + replace := "acm_metrics_collector_" + if isUWL { + baseName = uwlMetricsCollector + alert = "UWLMetricsCollector" + replace = "acm_uwl_metrics_collector_" + } + + name := fmt.Sprintf("acm-%s-alerting-rules", baseName) + + desiredPromRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: m.Namespace, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: baseName + "-rules", + Rules: []monitoringv1.Rule{ + { + Alert: "ACM" + alert + "FederationError", + Annotations: map[string]string{ + "summary": "Error federating from in-cluster Prometheus.", + "description": "There are errors when federating from platform Prometheus", + }, + Expr: intstr.FromString(`(sum by (status_code, type) (rate(` + replace + `federate_requests_total{status_code!~"2.*"}[10m]))) > 10`), + For: "10m", + Labels: map[string]string{ + "severity": "critical", + }, + }, + { + Alert: "ACM" + alert + "ForwardRemoteWriteError", + Annotations: map[string]string{ + "summary": "Error forwarding to Hub Thanos.", + "description": "There are errors when remote writing to Hub hub Thanos", + }, + Expr: intstr.FromString(`(sum by (status_code, type) (rate(` + replace + `forward_write_requests_total{status_code!~"2.*"}[10m]))) > 10`), + For: "10m", + 
Labels: map[string]string{ + "severity": "critical", + }, + }, + }, + }, + }, + }, + } + + retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + foundPromRule := &monitoringv1.PrometheusRule{} + err := m.Client.Get(ctx, types.NamespacedName{Name: name, Namespace: m.Namespace}, foundPromRule) + if err != nil && errors.IsNotFound(err) { + m.Log.Info("Creating PrometheusRule", "name", name, "namespace", m.Namespace) + if err := m.Client.Create(ctx, desiredPromRule); err != nil { + return fmt.Errorf("failed to create PrometheusRule %s/%s: %w", m.Namespace, name, err) + } + + return nil + } + if err != nil { + return fmt.Errorf("failed to get PrometheusRule %s/%s: %w", m.Namespace, name, err) + } + + if !equality.Semantic.DeepDerivative(desiredPromRule.Spec, foundPromRule.Spec) { + m.Log.Info("Updating PrometheusRule", "name", name, "namespace", m.Namespace) + + foundPromRule.Spec = desiredPromRule.Spec + if err := m.Client.Update(ctx, foundPromRule); err != nil { + return fmt.Errorf("failed to update PrometheusRule %s/%s: %w", m.Namespace, name, err) + } + } + + return nil + }) + + if retryErr != nil { + return retryErr + } + + return nil +} + +type ensureDeploymentResult string + +const ( + deploymentCreated ensureDeploymentResult = "created" + deploymentUpdated ensureDeploymentResult = "updated" + deploymentNoop ensureDeploymentResult = "noop" +) + +func (m *MetricsCollector) ensureDeployment(ctx context.Context, isUWL bool, deployParams *deploymentParams) (ensureDeploymentResult, error) { + secretName := metricsCollector + if isUWL { + secretName = uwlMetricsCollector + } + + defaultMode := int32(420) + volumes := []corev1.Volume{ + { + Name: "mtlscerts", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: mtlsCertName, + DefaultMode: &defaultMode, + }, + }, + }, + { + Name: "mtlsca", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: mtlsCaName, + DefaultMode: &defaultMode, + }, + }, + }, + } + + if m.ClusterInfo.ClusterType != operatorconfig.OcpThreeClusterType { + serviceCAOperatorGenerated := []corev1.Volume{ + { + Name: "secret-kube-rbac-proxy-tls", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: secretName + "-kube-rbac-tls", + DefaultMode: &defaultMode, + }, + }, + }, + { + Name: "secret-kube-rbac-proxy-metric", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: secretName + "-kube-rbac-proxy-metric", + DefaultMode: &defaultMode, + }, + }, + }, + { + Name: "metrics-client-ca", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + DefaultMode: &defaultMode, + LocalObjectReference: corev1.LocalObjectReference{ + Name: secretName + "-clientca-metric", + }, + }, + }, + }, + } + + volumes = append(volumes, serviceCAOperatorGenerated...) 
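+ // The <name>-kube-rbac-tls secret referenced here is generated by the
+ // OpenShift service CA operator from the serving-cert-secret-name
+ // annotation set on the Service in ensureService; presumably these
+ // service-CA resources are what is unavailable on OCP 3.x, hence the
+ // cluster-type guard above.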
+ } + + mounts := []corev1.VolumeMount{ + { + Name: "mtlscerts", + MountPath: "/tlscerts/certs", + }, + { + Name: "mtlsca", + MountPath: "/tlscerts/ca", + }, + } + + if m.ClusterInfo.ClusterID != "" { + volumes = append(volumes, corev1.Volume{ + Name: caVolName, + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + DefaultMode: &defaultMode, + LocalObjectReference: corev1.LocalObjectReference{ + Name: openshift.CaConfigmapName, + }, + }, + }, + }) + mounts = append(mounts, corev1.VolumeMount{ + Name: caVolName, + MountPath: caMounthPath, + }) + } + + commands := m.getCommands(isUWL, deployParams) + + from := promURL + if !m.ClusterInfo.InstallPrometheus { + from = ocpPromURL + if isUWL { + from = uwlPromURL + } + } + + fromQuery := from + name := metricsCollectorName + if isUWL { + fromQuery = uwlQueryURL + name = uwlMetricsCollectorName + } + + replicaCount := int32(0) + if m.ObsAddon.Spec.EnableMetrics || m.ClusterInfo.IsHubMetricsCollector { + replicaCount = 1 + } + + trueVal := true + desiredMetricsCollectorDep := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: m.Namespace, + Annotations: map[string]string{ + ownerLabelKey: ownerLabelValue, + }, + }, + Spec: appsv1.DeploymentSpec{ + Replicas: &replicaCount, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + selectorKey: secretName, + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + ownerLabelKey: ownerLabelValue, + operatorconfig.WorkloadPartitioningPodAnnotationKey: operatorconfig.WorkloadPodExpectedValueJSON, + }, + Labels: map[string]string{ + selectorKey: secretName, + }, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: m.ServiceAccountName, + Containers: []corev1.Container{ + { + Name: metricsCollector, + Image: rendering.Images[operatorconfig.MetricsCollectorKey], + Command: commands, + Env: []corev1.EnvVar{ + { + Name: "FROM", + Value: from, + }, + { + Name: "FROM_QUERY", + Value: fromQuery, + }, + { + Name: "TO", + Value: m.HubInfo.ObservatoriumAPIEndpoint, + }, + }, + VolumeMounts: mounts, + ImagePullPolicy: corev1.PullIfNotPresent, + Ports: []corev1.ContainerPort{ + { + ContainerPort: 8080, + Name: "metrics", + Protocol: corev1.ProtocolTCP, + }, + }, + SecurityContext: &corev1.SecurityContext{ + RunAsNonRoot: &trueVal, + ReadOnlyRootFilesystem: &trueVal, + AllowPrivilegeEscalation: new(bool), + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + }, + }, + }, + Volumes: volumes, + NodeSelector: deployParams.nodeSelector, + Tolerations: deployParams.tolerations, + }, + }, + }, + } + + if deployParams.proxyConfig.httpProxy != "" || deployParams.proxyConfig.httpsProxy != "" || deployParams.proxyConfig.noProxy != "" { + desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].Env = append(desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].Env, + corev1.EnvVar{ + Name: "HTTP_PROXY", + Value: deployParams.proxyConfig.httpProxy, + }, + corev1.EnvVar{ + Name: "HTTPS_PROXY", + Value: deployParams.proxyConfig.httpsProxy, + }, + corev1.EnvVar{ + Name: "NO_PROXY", + Value: deployParams.proxyConfig.noProxy, + }) + } + if deployParams.proxyConfig.httpsProxy != "" && deployParams.proxyConfig.caBundle != "" { + desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].Env = append(desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].Env, + corev1.EnvVar{ + Name: "HTTPS_PROXY_CA_BUNDLE", + Value: deployParams.proxyConfig.caBundle, + }) + } + + if 
m.ClusterInfo.IsHubMetricsCollector { + // prevent the hub metrics collector from sending status updates + desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].Env = append(desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].Env, + corev1.EnvVar{ + Name: "STANDALONE", + Value: "true", + }) + } + + // Set Privileged on the hardened SecurityContext built above instead of replacing the whole struct, which would silently drop RunAsNonRoot and the dropped capabilities. + privileged := false + desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].SecurityContext.Privileged = &privileged + + if m.ObsAddon.Spec.Resources != nil { + desiredMetricsCollectorDep.Spec.Template.Spec.Containers[0].Resources = *m.ObsAddon.Spec.Resources + } + + result := deploymentNoop + + retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + foundMetricsCollectorDep := &appsv1.Deployment{} + err := m.Client.Get(ctx, types.NamespacedName{Name: name, Namespace: m.Namespace}, foundMetricsCollectorDep) + if err != nil && errors.IsNotFound(err) { + m.Log.Info("Creating Deployment", "name", name, "namespace", m.Namespace) + if err := m.Client.Create(ctx, desiredMetricsCollectorDep); err != nil { + return fmt.Errorf("failed to create Deployment %s/%s: %w", m.Namespace, name, err) + } + + result = deploymentCreated + return nil + } + if err != nil { + return fmt.Errorf("failed to get Deployment %s/%s: %w", m.Namespace, name, err) + } + + isDifferentSpec := !equality.Semantic.DeepDerivative(desiredMetricsCollectorDep.Spec.Template.Spec, foundMetricsCollectorDep.Spec.Template.Spec) + isDifferentReplicas := !equality.Semantic.DeepEqual(desiredMetricsCollectorDep.Spec.Replicas, foundMetricsCollectorDep.Spec.Replicas) + if isDifferentSpec || isDifferentReplicas || deployParams.forceRestart { + m.Log.Info("Updating Deployment", "name", name, "namespace", m.Namespace, "isDifferentSpec", isDifferentSpec, "isDifferentReplicas", isDifferentReplicas, "forceRestart", deployParams.forceRestart) + if deployParams.forceRestart && foundMetricsCollectorDep.Status.ReadyReplicas != 0 { + desiredMetricsCollectorDep.Spec.Template.ObjectMeta.Labels[restartLabel] = time.Now().Format("2006-1-2.1504") + } + + desiredMetricsCollectorDep.ResourceVersion = foundMetricsCollectorDep.ResourceVersion + + if err := m.Client.Update(ctx, desiredMetricsCollectorDep); err != nil { + return fmt.Errorf("failed to update Deployment %s/%s: %w", m.Namespace, name, err) + } + + result = deploymentUpdated + return nil + } + + return nil + }) + + if retryErr != nil { + return deploymentNoop, retryErr + } + + return result, nil +} + +func (m *MetricsCollector) getCommands(isUWL bool, deployParams *deploymentParams) []string { + interval := defaultInterval + if m.ObsAddon.Spec.Interval != 0 { + interval = fmt.Sprintf("%ds", m.ObsAddon.Spec.Interval) + } + + evaluateInterval := "30s" + if m.ObsAddon.Spec.Interval < 30 { + evaluateInterval = interval + } + + caFile := caMounthPath + "/service-ca.crt" + clusterID := m.ClusterInfo.ClusterID + if clusterID == "" { + clusterID = m.HubInfo.ClusterName + // deprecated CA bundle path, only used in OCP 3.11 environments + caFile = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" + } + + allowList := deployParams.allowlist + if isUWL { + allowList = deployParams.uwlList + } + + commands := []string{ + "/usr/bin/metrics-collector", + "--listen=:8080", + "--from=$(FROM)", + "--from-query=$(FROM_QUERY)", + "--to-upload=$(TO)", + "--to-upload-ca=/tlscerts/ca/ca.crt", + "--to-upload-cert=/tlscerts/certs/tls.crt", + "--to-upload-key=/tlscerts/certs/tls.key", +
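+ // The CA, client certificate and key referenced above come from the
+ // "mtlsca" and "mtlscerts" secret volumes mounted at /tlscerts/ca and
+ // /tlscerts/certs in ensureDeployment.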
"--interval=" + interval, + "--evaluate-interval=" + evaluateInterval, + "--limit-bytes=" + strconv.Itoa(limitBytes), + fmt.Sprintf("--label=\"cluster=%s\"", m.HubInfo.ClusterName), + fmt.Sprintf("--label=\"clusterID=%s\"", clusterID), + } + commands = append(commands, "--from-token-file=/var/run/secrets/kubernetes.io/serviceaccount/token") + if !m.ClusterInfo.InstallPrometheus { + commands = append(commands, "--from-ca-file="+caFile) + } + if m.ClusterInfo.ClusterType != operatorconfig.DefaultClusterType { + commands = append(commands, fmt.Sprintf("--label=\"clusterType=%s\"", m.ClusterInfo.ClusterType)) + } + + dynamicMetricList := map[string]bool{} + for _, group := range allowList.CollectRuleGroupList { + if group.Selector.MatchExpression != nil { + for _, expr := range group.Selector.MatchExpression { + if !evaluateMatchExpression(expr, "clusterType", m.ClusterInfo.ClusterType) { + continue + } + + for _, rule := range group.CollectRuleList { + matchList := []string{} + for _, match := range rule.Metrics.MatchList { + matchList = append(matchList, `"`+strings.ReplaceAll(match, `"`, `\"`)+`"`) + if name := getNameInMatch(match); name != "" { + dynamicMetricList[name] = false + } + } + for _, name := range rule.Metrics.NameList { + dynamicMetricList[name] = false + } + matchListStr := "[" + strings.Join(matchList, ",") + "]" + nameListStr := `["` + strings.Join(rule.Metrics.NameList, `","`) + `"]` + commands = append( + commands, + fmt.Sprintf("--collectrule={\"name\":\"%s\",\"expr\":\"%s\",\"for\":\"%s\",\"names\":%v,\"matches\":%v}", + rule.Collect, rule.Expr, rule.For, nameListStr, matchListStr), + ) + } + } + } + } + + for _, metrics := range allowList.NameList { + if _, ok := dynamicMetricList[metrics]; !ok { + commands = append(commands, fmt.Sprintf("--match={__name__=\"%s\"}", metrics)) + } + } + for _, match := range allowList.MatchList { + if name := getNameInMatch(match); name != "" { + if _, ok := dynamicMetricList[name]; ok { + continue + } + } + commands = append(commands, fmt.Sprintf("--match={%s}", match)) + } + + renamekeys := make([]string, 0, len(allowList.RenameMap)) + for k := range allowList.RenameMap { + renamekeys = append(renamekeys, k) + } + sort.Strings(renamekeys) + for _, k := range renamekeys { + commands = append(commands, fmt.Sprintf("--rename=\"%s=%s\"", k, allowList.RenameMap[k])) + } + for _, rule := range allowList.RecordingRuleList { + commands = append( + commands, + fmt.Sprintf("--recordingrule={\"name\":\"%s\",\"query\":\"%s\"}", rule.Record, rule.Expr), + ) + } + return commands +} + +func (m *MetricsCollector) getMetricsAllowlist(ctx context.Context) (*operatorconfig.MetricsAllowlist, *operatorconfig.MetricsAllowlist, error) { + allowList := &operatorconfig.MetricsAllowlist{} + userAllowList := &operatorconfig.MetricsAllowlist{} + + // get allowlist configmap + cm := &corev1.ConfigMap{} + err := m.Client.Get(ctx, types.NamespacedName{Name: operatorconfig.AllowlistConfigMapName, + Namespace: m.Namespace}, cm) + if err != nil { + m.Log.Error(err, "Failed to get configmap", "name", operatorconfig.AllowlistConfigMapName, "namespace", m.Namespace) + } + + if cm.Data != nil { + configmapKey := operatorconfig.MetricsConfigMapKey + if m.ClusterInfo.ClusterType == operatorconfig.OcpThreeClusterType { + configmapKey = operatorconfig.MetricsOcp311ConfigMapKey + } + + err = yaml.Unmarshal([]byte(cm.Data[configmapKey]), allowList) + if err != nil { + return allowList, userAllowList, fmt.Errorf("failed to unmarshal allowList data in configmap %s/%s: %w", 
cm.Namespace, cm.Name, err) + } + + // get default user allowlist in configmap + if uwlData, ok := cm.Data[operatorconfig.UwlMetricsConfigMapKey]; ok { + err = yaml.Unmarshal([]byte(uwlData), userAllowList) + if err != nil { + return allowList, userAllowList, fmt.Errorf("failed to unmarshal user allowList data in configmap %s/%s: %w", cm.Namespace, cm.Name, err) + } + } + } + + // get custom allowlist configmap in all namespaces + cmList := &corev1.ConfigMapList{} + cmNamespaces := []string{} + err = m.Client.List(ctx, cmList, &client.ListOptions{}) + if err != nil { + m.Log.Error(err, "Failed to list configmaps") + } + + for _, allowlistCM := range cmList.Items { + if allowlistCM.ObjectMeta.Name != operatorconfig.AllowlistCustomConfigMapName { + continue + } + + cmNamespaces = append(cmNamespaces, allowlistCM.ObjectMeta.Namespace) + + customAllowlist, _, customUwlAllowlist, err := util.ParseAllowlistConfigMap(allowlistCM) + if err != nil { + m.Log.Error(err, "Failed to parse data in configmap", "namespace", allowlistCM.ObjectMeta.Namespace, "name", allowlistCM.ObjectMeta.Name) + continue + } + + if allowlistCM.ObjectMeta.Namespace != m.Namespace { + customUwlAllowlist = injectNamespaceLabel(customUwlAllowlist, allowlistCM.ObjectMeta.Namespace) + } + + allowList, _, userAllowList = util.MergeAllowlist(allowList, customAllowlist, nil, userAllowList, customUwlAllowlist) + } + + if len(cmNamespaces) > 0 { + m.Log.Info("Merged allowLists from following namespaces", "namespaces", cmNamespaces) + } + + return allowList, userAllowList, nil +} + +func (m *MetricsCollector) getEndpointDeployment(ctx context.Context) (*appsv1.Deployment, error) { + ret := &appsv1.Deployment{} + err := m.Client.Get(ctx, types.NamespacedName{Name: "endpoint-observability-operator", Namespace: m.Namespace}, ret) + if err != nil { + return nil, fmt.Errorf("failed to get endpoint deployment %s/%s: %w", m.Namespace, "endpoint-observability-operator", err) + } + + return ret, nil +} + +func (m *MetricsCollector) isUWLMonitoringEnabled(ctx context.Context) (bool, error) { + sts := &appsv1.StatefulSet{} + err := m.Client.Get(ctx, types.NamespacedName{Namespace: uwlNamespace, Name: uwlSts}, sts) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + + return false, fmt.Errorf("failed to get uwl prometheus statefulset %s/%s: %w", uwlNamespace, uwlSts, err) + } + + return true, nil +} + +func (m *MetricsCollector) deleteResourceIfExists(ctx context.Context, obj client.Object) error { + err := m.Client.Delete(ctx, obj) + if err != nil { + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete object %s %s/%s: %w", obj.GetObjectKind().GroupVersionKind().Kind, obj.GetNamespace(), obj.GetName(), err) + } + } else { + m.Log.Info("Deleted object", "kind", obj.GetObjectKind().GroupVersionKind().Kind, "name", obj.GetName(), "namespace", obj.GetNamespace()) + } + + return nil +} + +func getNameInMatch(match string) string { + r := regexp.MustCompile(`__name__="([^,]*)"`) + m := r.FindAllStringSubmatch(match, -1) + if m != nil { + return m[0][1] + } + return "" +} + +// for custom uwl allowlist: +// 1. only support "names" and "matches". +// 2. inject namespace label filter for all entries in the allowlist. 
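+// For example (values illustrative), a name "foo" and a match `job="bar"`
+// coming from a ConfigMap in namespace "ns1" both land in the match list,
+// as `__name__="foo",namespace="ns1"` and `job="bar",namespace="ns1"`;
+// the name list of the result stays empty.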
+func injectNamespaceLabel(allowlist *operatorconfig.MetricsAllowlist, + namespace string) *operatorconfig.MetricsAllowlist { + updatedList := &operatorconfig.MetricsAllowlist{ + NameList: []string{}, + MatchList: []string{}, + } + for _, name := range allowlist.NameList { + updatedList.MatchList = append(updatedList.MatchList, + fmt.Sprintf("__name__=\"%s\",namespace=\"%s\"", name, namespace)) + } + for _, match := range allowlist.MatchList { + updatedList.MatchList = append(updatedList.MatchList, fmt.Sprintf("%s,namespace=\"%s\"", match, namespace)) + } + return updatedList +} diff --git a/operators/endpointmetrics/pkg/collector/metrics_collector_test.go b/operators/endpointmetrics/pkg/collector/metrics_collector_test.go new file mode 100644 index 000000000..011622d48 --- /dev/null +++ b/operators/endpointmetrics/pkg/collector/metrics_collector_test.go @@ -0,0 +1,356 @@ +// Copyright (c) Red Hat, Inc. +// Copyright Contributors to the Open Cluster Management project +// Licensed under the Apache License 2.0 + +package collector_test + +import ( + "context" + "fmt" + "maps" + "slices" + "testing" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/kubectl/pkg/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/kustomize/kyaml/yaml" + + "github.com/go-logr/logr" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/collector" + oashared "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/shared" + oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1" + operatorconfig "github.com/stolostron/multicluster-observability-operator/operators/pkg/config" +) + +const ( + metricsCollectorName = "metrics-collector-deployment" + namespace = "testNamespace" + uwlMetricsCollectorName = "uwl-metrics-collector-deployment" + uwlNamespace = "openshift-user-workload-monitoring" + uwlSts = "prometheus-user-workload" +) + +func TestMetricsCollectorResourcesUpdate(t *testing.T) { + baseMetricsCollector := func() *collector.MetricsCollector { + return &collector.MetricsCollector{ + // Client is set in each test case + ClusterInfo: collector.ClusterInfo{ + ClusterID: "test-cluster", + }, + HubInfo: &operatorconfig.HubInfo{ + ClusterName: "test-cluster", + ObservatoriumAPIEndpoint: "http://test-endpoint", + }, + Log: logr.Logger{}, + Namespace: namespace, + ObsAddon: &oav1beta1.ObservabilityAddon{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-addon", + Namespace: namespace, + }, + Spec: oashared.ObservabilityAddonSpec{ + EnableMetrics: true, + Interval: 60, + }, + }, + ServiceAccountName: "test-sa", + } + } + + testCases := map[string]struct { + newMetricsCollector func() *collector.MetricsCollector + clientObjects func() []runtime.Object + request ctrl.Request + expects func(*testing.T, *appsv1.Deployment, *appsv1.Deployment) + }{ + "Should replicate endpoint operator settings": { + newMetricsCollector: func() *collector.MetricsCollector { + return baseMetricsCollector() + }, + clientObjects: func() []runtime.Object { return []runtime.Object{getEndpointOperatorDeployment()} }, + expects: func(t 
*testing.T, deployment, uwlDeployment *appsv1.Deployment) { + // Check env vars + operatorEnv := getEndpointOperatorDeployment().Spec.Template.Spec.Containers[0].Env + collectorEnv := deployment.Spec.Template.Spec.Containers[0].Env + if err := checkProxyEnvVars(operatorEnv, collectorEnv); err != nil { + t.Fatalf("Failed to ensure proxy env vars: %v", err) + } + + // Check toleration and node selector + if !slices.Equal(deployment.Spec.Template.Spec.Tolerations, getEndpointOperatorDeployment().Spec.Template.Spec.Tolerations) { + t.Fatalf("Tolerations are not set correctly: expected %v, got %v", + getEndpointOperatorDeployment().Spec.Template.Spec.Tolerations, deployment.Spec.Template.Spec.Tolerations) + } + if !maps.Equal(deployment.Spec.Template.Spec.NodeSelector, getEndpointOperatorDeployment().Spec.Template.Spec.NodeSelector) { + t.Fatalf("NodeSelector is not set correctly: expected %v, got %v", + getEndpointOperatorDeployment().Spec.Template.Spec.NodeSelector, deployment.Spec.Template.Spec.NodeSelector) + } + + // Check annotations + v, ok := deployment.Spec.Template.Annotations[operatorconfig.WorkloadPartitioningPodAnnotationKey] + if !ok || v != operatorconfig.WorkloadPodExpectedValueJSON { + t.Fatalf("Failed to find annotation %v: %v on the pod spec of deployment: %v", + operatorconfig.WorkloadPartitioningPodAnnotationKey, + operatorconfig.WorkloadPodExpectedValueJSON, + metricsCollectorName, + ) + } + }, + }, + "Should have 0 replicas when metrics is disabled and is not hub collector": { + newMetricsCollector: func() *collector.MetricsCollector { + ret := baseMetricsCollector() + ret.ObsAddon.Spec.EnableMetrics = false + ret.ClusterInfo.IsHubMetricsCollector = false + return ret + }, + clientObjects: func() []runtime.Object { return []runtime.Object{getEndpointOperatorDeployment()} }, + expects: func(t *testing.T, deployment *appsv1.Deployment, uwlDeployment *appsv1.Deployment) { + if *deployment.Spec.Replicas != 0 { + t.Fatalf("Replicas should be 0 when metrics is disabled and is not hub collector") + } + }, + }, + "Hub metrics collector should have 1 replica even if metrics is disabled": { + newMetricsCollector: func() *collector.MetricsCollector { + ret := baseMetricsCollector() + ret.ObsAddon.Spec.EnableMetrics = false + ret.ClusterInfo.IsHubMetricsCollector = true + return ret + }, + clientObjects: func() []runtime.Object { return []runtime.Object{getEndpointOperatorDeployment()} }, + expects: func(t *testing.T, deployment *appsv1.Deployment, uwlDeployment *appsv1.Deployment) { + if *deployment.Spec.Replicas != 1 { + t.Fatalf("Hub metrics collector should have 1 replica even if metrics is disabled") + } + }, + }, + "Should force reload if certs are updated": { + newMetricsCollector: func() *collector.MetricsCollector { + return baseMetricsCollector() + }, + clientObjects: func() []runtime.Object { + ret := []runtime.Object{getEndpointOperatorDeployment()} + metricsCollector := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: metricsCollectorName, + Namespace: namespace, + }, + Spec: appsv1.DeploymentSpec{}, + } + metricsCollector.Status.ReadyReplicas = 1 + ret = append(ret, metricsCollector) + return ret + }, + request: ctrl.Request{NamespacedName: types.NamespacedName{Name: "observability-managed-cluster-certs"}}, + expects: func(t *testing.T, deployment *appsv1.Deployment, uwlDeployment *appsv1.Deployment) { + if _, ok := deployment.Spec.Template.ObjectMeta.Labels["cert/time-restarted"]; !ok { + t.Fatalf("Should force reload if certs are updated. 
Label not found: %v", deployment.Spec.Template.ObjectMeta.Labels) + } + }, + }, + "Should create a uwl metrics collector if a custom uwl allowlist is present and uwl prometheus is present": { + newMetricsCollector: func() *collector.MetricsCollector { + return baseMetricsCollector() + }, + clientObjects: func() []runtime.Object { + data := map[string]operatorconfig.MetricsAllowlist{ + operatorconfig.UwlMetricsConfigMapKey: { + NameList: []string{"custom_c"}, + }, + } + uwlAllowlistCM := newAllowListCm(operatorconfig.AllowlistCustomConfigMapName, "default", data) + ret := []runtime.Object{getEndpointOperatorDeployment(), newUwlPrometheus(), uwlAllowlistCM} + return ret + }, + expects: func(t *testing.T, deployment *appsv1.Deployment, uwlDeployment *appsv1.Deployment) { + if uwlDeployment == nil { + t.Fatalf("Should create a uwl metrics collector if a custom allowlist is present and uwl prometheus is present") + } + + command := uwlDeployment.Spec.Template.Spec.Containers[0].Command + if !slices.Contains(command, `--match={__name__="custom_c",namespace="default"}`) { + t.Fatalf("Custom allowlist not found in args: %v", command) + } + }, + }, + "Should not create a uwl metrics collector if no custom allowlist is present": { + newMetricsCollector: func() *collector.MetricsCollector { + return baseMetricsCollector() + }, + clientObjects: func() []runtime.Object { + ret := []runtime.Object{getEndpointOperatorDeployment(), newUwlPrometheus()} + return ret + }, + expects: func(t *testing.T, deployment *appsv1.Deployment, uwlDeployment *appsv1.Deployment) { + if uwlDeployment != nil { + t.Fatalf("Should not create a uwl metrics collector if no custom allowlist is present") + } + }, + }, + "Should delete uwl metrics collector if uwl prometheus is removed": { + newMetricsCollector: func() *collector.MetricsCollector { + return baseMetricsCollector() + }, + clientObjects: func() []runtime.Object { + uwlDeploy := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: uwlMetricsCollectorName, + Namespace: namespace, + }, + } + data := map[string]operatorconfig.MetricsAllowlist{ + operatorconfig.UwlMetricsConfigMapKey: { + NameList: []string{"custom_c"}, + }, + } + uwlAllowlistCM := newAllowListCm(operatorconfig.AllowlistCustomConfigMapName, "default", data) + ret := []runtime.Object{getEndpointOperatorDeployment(), uwlAllowlistCM, uwlDeploy} + return ret + }, + expects: func(t *testing.T, deployment *appsv1.Deployment, uwlDeployment *appsv1.Deployment) { + if uwlDeployment != nil { + t.Fatalf("Should delete uwl metrics collector if uwl prometheus is removed") + } + }, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + s := scheme.Scheme + promv1.AddToScheme(s) + c := fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(tc.clientObjects()...).Build() + + metricsCollector := tc.newMetricsCollector() + metricsCollector.Client = c + if err := metricsCollector.Update(context.Background(), tc.request); err != nil { + t.Fatalf("Failed to update metrics collector: %v", err) + } + + deployment := getMetricsCollectorDeployment(t, context.Background(), c, metricsCollectorName) + uwlDeployment := getMetricsCollectorDeployment(t, context.Background(), c, uwlMetricsCollectorName) + tc.expects(t, deployment, uwlDeployment) + }) + } + +} + +func getEndpointOperatorDeployment() *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "endpoint-observability-operator", + Namespace: namespace, + }, + Spec: appsv1.DeploymentSpec{ + Template: 
corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "endpoint-observability-operator", + Env: []corev1.EnvVar{ + { + Name: "HTTP_PROXY", + Value: "http://foo.com", + }, + { + Name: "HTTPS_PROXY", + Value: "https://foo.com", + }, + { + Name: "NO_PROXY", + Value: "bar.com", + }, + { + Name: "HTTPS_PROXY_CA_BUNDLE", + Value: "custom-ca.crt", + }, + }, + }, + }, + }, + }, + }, + } +} + +func getMetricsCollectorDeployment(t *testing.T, ctx context.Context, c client.Client, name string) *appsv1.Deployment { + deployment := &appsv1.Deployment{} + err := c.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, deployment) + if err != nil { + if errors.IsNotFound(err) { + return nil + } + t.Fatalf("Failed to get deployment %s/%s: %v", namespace, name, err) + } + return deployment +} + +func newUwlPrometheus() *appsv1.StatefulSet { + return &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: uwlSts, + Namespace: uwlNamespace, + }, + } +} + +func newAllowListCm(name, namespace string, data map[string]operatorconfig.MetricsAllowlist) *corev1.ConfigMap { + cmData := make(map[string]string, len(data)) + for k, v := range data { + strData, err := yaml.Marshal(v) + if err != nil { + panic(err) + } + cmData[k] = string(strData) + } + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Data: cmData, + } +} + +func checkProxyEnvVars(expect, has []corev1.EnvVar) error { + toCompare := map[string]string{"HTTP_PROXY": "", "HTTPS_PROXY": "", "NO_PROXY": "", "HTTPS_PROXY_CA_BUNDLE": ""} + expectMap := make(map[string]string, len(toCompare)) + for _, e := range expect { + if _, ok := toCompare[e.Name]; ok { + if len(e.Value) == 0 { + return fmt.Errorf("Env var %s is empty in the expected list", e.Name) + } + expectMap[e.Name] = e.Value + } + } + + if len(expectMap) != len(toCompare) { + return fmt.Errorf("Some env vars are missing in the expected list: expected %v, got %v", toCompare, expectMap) + } + + hasMap := make(map[string]string, len(toCompare)) + for _, e := range has { + if v, ok := expectMap[e.Name]; ok { + if v != e.Value { + return fmt.Errorf("Env var %s is not set correctly: expected %s, got %s", e.Name, v, e.Value) + } + hasMap[e.Name] = e.Value + } + } + + if len(hasMap) != len(toCompare) { + return fmt.Errorf("Some env vars are missing in the actual list: expected %v, got %v", toCompare, hasMap) + } + + return nil +} diff --git a/operators/endpointmetrics/pkg/hypershift/hypershift.go b/operators/endpointmetrics/pkg/hypershift/hypershift.go index a3bbf03cd..816d8fc55 100644 --- a/operators/endpointmetrics/pkg/hypershift/hypershift.go +++ b/operators/endpointmetrics/pkg/hypershift/hypershift.go @@ -11,6 +11,7 @@ import ( hyperv1 "github.com/openshift/hypershift/api/v1alpha1" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + operatorutil "github.com/stolostron/multicluster-observability-operator/operators/pkg/util" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -93,6 +94,21 @@ func HostedClusterNamespace(cluster *hyperv1.HostedCluster) string { return fmt.Sprintf("%s-%s", cluster.ObjectMeta.Namespace, cluster.ObjectMeta.Name) } +func IsHypershiftCluster() (bool, error) { + var isHypershift bool + crdClient, err := operatorutil.GetOrCreateCRDClient() + if err != nil { + return false, fmt.Errorf("failed to get/create CRD client: %w", err) + } + + isHypershift, err =
operatorutil.CheckCRDExist(crdClient, "hostedclusters.hypershift.openshift.io") + if err != nil { + return false, fmt.Errorf("failed to check if the CRD hostedclusters.hypershift.openshift.io exists: %w", err) + } + + return isHypershift, nil +} + func createOrUpdateSM(ctx context.Context, c client.Client, smDesired *promv1.ServiceMonitor) error { smCurrent := &promv1.ServiceMonitor{} if err := c.Get(ctx, types.NamespacedName{Name: smDesired.GetName(), Namespace: smDesired.GetNamespace()}, smCurrent); err != nil { diff --git a/operators/endpointmetrics/pkg/rendering/renderer.go b/operators/endpointmetrics/pkg/rendering/renderer.go index 5501bd94f..956645625 100644 --- a/operators/endpointmetrics/pkg/rendering/renderer.go +++ b/operators/endpointmetrics/pkg/rendering/renderer.go @@ -12,7 +12,6 @@ import ( "strings" prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "golang.org/x/exp/slices" v1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -34,7 +33,6 @@ const ( ) var ( - namespace = os.Getenv("WATCH_NAMESPACE") log = logf.Log.WithName("renderer") disabledMetrics = []string{ "apiserver_admission_controller_admission_duration_seconds_bucket", @@ -60,6 +58,7 @@ func Render( r *rendererutil.Renderer, c runtimeclient.Client, hubInfo *operatorconfig.HubInfo, + namespace string, ) ([]*unstructured.Unstructured, error) { isKindTest := false @@ -225,7 +224,7 @@ func Render( } // replace the disabled metrics - disabledMetricsSt, err := getDisabledMetrics(ctx, c) + disabledMetricsSt, err := getDisabledMetrics(ctx, c, namespace) if err != nil { return nil, err } @@ -270,36 +269,10 @@ func Render( } } - // Ordering resources to ensure they are applied in the correct order - slices.SortFunc(resources, func(a, b *unstructured.Unstructured) int { - aPriority := resourcePriority(a) - bPriority := resourcePriority(b) - if aPriority < bPriority { - return -1 - } - if aPriority > bPriority { - return 1 - } - return 0 - }) - return resources, nil } -func resourcePriority(resource *unstructured.Unstructured) int { - switch resource.GetKind() { - case "Role", "ClusterRole": - return 1 - case "RoleBinding", "ClusterRoleBinding": - return 2 - case "CustomResourceDefinition": - return 3 - default: - return 4 - } -} - -func getDisabledMetrics(ctx context.Context, c runtimeclient.Client) (string, error) { +func getDisabledMetrics(ctx context.Context, c runtimeclient.Client, namespace string) (string, error) { cm := &corev1.ConfigMap{} err := c.Get(ctx, types.NamespacedName{Name: operatorconfig.AllowlistConfigMapName, Namespace: namespace}, cm) diff --git a/operators/endpointmetrics/pkg/rendering/renderer_test.go b/operators/endpointmetrics/pkg/rendering/renderer_test.go index b18d4b4a6..4e40b38c9 100644 --- a/operators/endpointmetrics/pkg/rendering/renderer_test.go +++ b/operators/endpointmetrics/pkg/rendering/renderer_test.go @@ -18,13 +18,14 @@ import ( operatorconfig "github.com/stolostron/multicluster-observability-operator/operators/pkg/config" rendererutil "github.com/stolostron/multicluster-observability-operator/operators/pkg/rendering" templatesutil "github.com/stolostron/multicluster-observability-operator/operators/pkg/rendering/templates" + "github.com/stretchr/testify/assert" ) -func getAllowlistCM() *corev1.ConfigMap { +func getAllowlistCM(ns string) *corev1.ConfigMap { return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: operatorconfig.AllowlistConfigMapName, - Namespace: namespace, + Namespace: 
ns, }, Data: map[string]string{ metricsConfigMapKey: ` @@ -50,18 +51,12 @@ func TestRender(t *testing.T) { AlertmanagerEndpoint: "testing.com", AlertmanagerRouterCA: "testing", } - c := fake.NewClientBuilder().WithRuntimeObjects([]runtime.Object{getAllowlistCM()}...).Build() + c := fake.NewClientBuilder().WithRuntimeObjects([]runtime.Object{getAllowlistCM("test-ns")}...).Build() - objs, err := Render(context.Background(), renderer, c, hubInfo) + objs, err := Render(context.Background(), renderer, c, hubInfo, "test-ns") if err != nil { t.Fatalf("failed to render endpoint templates: %v", err) } - // ensure that objects are sorted - for i := 0; i < len(objs)-1; i++ { - if resourcePriority(objs[i]) > resourcePriority(objs[i+1]) { - t.Errorf("objects are not sorted") - } - } - + assert.Greater(t, len(objs), 2) } diff --git a/operators/endpointmetrics/pkg/util/status.go b/operators/endpointmetrics/pkg/status/status.go similarity index 98% rename from operators/endpointmetrics/pkg/util/status.go rename to operators/endpointmetrics/pkg/status/status.go index ef4944b4d..b6e39a587 100644 --- a/operators/endpointmetrics/pkg/util/status.go +++ b/operators/endpointmetrics/pkg/status/status.go @@ -2,7 +2,7 @@ // Copyright Contributors to the Open Cluster Management project // Licensed under the Apache License 2.0 -package util +package status import ( "context" @@ -14,6 +14,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -54,6 +55,7 @@ var ( Status: metav1.ConditionTrue, }, } + log = ctrl.Log.WithName("status") ) func ReportStatus(ctx context.Context, client client.Client, conditionReason ConditionReason, addonName, addonNs string) error { diff --git a/operators/endpointmetrics/pkg/util/status_test.go b/operators/endpointmetrics/pkg/status/status_test.go similarity index 88% rename from operators/endpointmetrics/pkg/util/status_test.go rename to operators/endpointmetrics/pkg/status/status_test.go index e1989a451..d5227448a 100644 --- a/operators/endpointmetrics/pkg/util/status_test.go +++ b/operators/endpointmetrics/pkg/status/status_test.go @@ -2,7 +2,7 @@ // Copyright Contributors to the Open Cluster Management project // Licensed under the Apache License 2.0 -package util_test +package status_test import ( "context" @@ -10,7 +10,7 @@ import ( "testing" "time" - "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/util" + "github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/status" oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1" mcov1beta2 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta2" "github.com/stretchr/testify/assert" @@ -45,15 +45,15 @@ func TestReportStatus(t *testing.T) { testCases := map[string]struct { currentConditions []oav1beta1.StatusCondition - newCondition util.ConditionReason + newCondition status.ConditionReason expects func(*testing.T, []oav1beta1.StatusCondition) }{ "new status should be appended": { currentConditions: []oav1beta1.StatusCondition{}, - newCondition: util.Deployed, + newCondition: status.Deployed, expects: func(t *testing.T, conditions []oav1beta1.StatusCondition) { assert.Len(t, conditions, 1) - assert.EqualValues(t, util.Deployed, conditions[0].Reason) + assert.EqualValues(t, status.Deployed, 
conditions[0].Reason) assert.Equal(t, metav1.ConditionTrue, conditions[0].Status) assert.Equal(t, "Progressing", conditions[0].Type) assert.InEpsilon(t, time.Now().Unix(), conditions[0].LastTransitionTime.Unix(), 1) @@ -63,7 +63,7 @@ func TestReportStatus(t *testing.T) { currentConditions: []oav1beta1.StatusCondition{ { Type: "Progressing", - Reason: string(util.Deployed), + Reason: string(status.Deployed), Message: "Metrics collector deployed", Status: metav1.ConditionTrue, LastTransitionTime: metav1.Time{ @@ -72,7 +72,7 @@ func TestReportStatus(t *testing.T) { }, { Type: "Disabled", - Reason: string(util.Disabled), + Reason: string(status.Disabled), Message: "enableMetrics is set to False", Status: metav1.ConditionTrue, LastTransitionTime: metav1.Time{ @@ -80,20 +80,20 @@ func TestReportStatus(t *testing.T) { }, }, }, - newCondition: util.Disabled, + newCondition: status.Disabled, expects: func(t *testing.T, conditions []oav1beta1.StatusCondition) { assert.Len(t, conditions, 2) found := false for _, c := range conditions { - if c.Reason == string(util.Disabled) { + if c.Reason == string(status.Disabled) { found = true - assert.EqualValues(t, util.Disabled, c.Reason) + assert.EqualValues(t, status.Disabled, c.Reason) assert.Equal(t, metav1.ConditionTrue, c.Status) assert.Equal(t, "Disabled", c.Type) assert.InEpsilon(t, time.Now().Unix(), c.LastTransitionTime.Unix(), 1) } else { // other condition should not be changed - assert.EqualValues(t, util.Deployed, c.Reason) + assert.EqualValues(t, status.Deployed, c.Reason) assert.InEpsilon(t, time.Now().Add(-time.Minute).Unix(), c.LastTransitionTime.Unix(), 1) } } @@ -104,7 +104,7 @@ func TestReportStatus(t *testing.T) { currentConditions: []oav1beta1.StatusCondition{ { Type: "Progressing", - Reason: string(util.Deployed), + Reason: string(status.Deployed), Message: "Metrics collector deployed", Status: metav1.ConditionTrue, LastTransitionTime: metav1.Time{ @@ -118,10 +118,10 @@ func TestReportStatus(t *testing.T) { }, }, }, - newCondition: util.Deployed, + newCondition: status.Deployed, expects: func(t *testing.T, conditions []oav1beta1.StatusCondition) { assert.Len(t, conditions, 2) - assert.EqualValues(t, util.Deployed, conditions[0].Reason) + assert.EqualValues(t, status.Deployed, conditions[0].Reason) assert.InEpsilon(t, time.Now().Add(-time.Minute).Unix(), conditions[0].LastTransitionTime.Unix(), 1) }, }, @@ -130,10 +130,10 @@ func TestReportStatus(t *testing.T) { {Type: "1"}, {Type: "2"}, {Type: "3"}, {Type: "4"}, {Type: "5"}, {Type: "6"}, {Type: "7"}, {Type: "8"}, {Type: "9"}, {Type: "10"}, }, - newCondition: util.Deployed, + newCondition: status.Deployed, expects: func(t *testing.T, conditions []oav1beta1.StatusCondition) { - assert.Len(t, conditions, util.MaxStatusConditionsCount) - assert.EqualValues(t, util.Deployed, conditions[len(conditions)-1].Reason) + assert.Len(t, conditions, status.MaxStatusConditionsCount) + assert.EqualValues(t, status.Deployed, conditions[len(conditions)-1].Reason) }, }, "duplicated conditions should be removed": { @@ -143,7 +143,7 @@ func TestReportStatus(t *testing.T) { {Type: "Progressing", LastTransitionTime: metav1.Time{Time: time.Now().Add(-time.Minute)}}, {Type: "Degraded", LastTransitionTime: metav1.Time{Time: time.Now().Add(-time.Minute)}}, }, - newCondition: util.Deployed, + newCondition: status.Deployed, expects: func(t *testing.T, conditions []oav1beta1.StatusCondition) { assert.Len(t, conditions, 2) for _, c := range conditions { @@ -164,7 +164,7 @@ func TestReportStatus(t *testing.T) { {Type: 
"Degraded", Status: metav1.ConditionTrue}, {Type: "Available", Status: metav1.ConditionTrue}, }, - newCondition: util.Deployed, + newCondition: status.Deployed, expects: func(t *testing.T, conditions []oav1beta1.StatusCondition) { assert.Len(t, conditions, 3) for _, c := range conditions { @@ -191,7 +191,7 @@ func TestReportStatus(t *testing.T) { } // test - if err := util.ReportStatus(context.Background(), client, tc.newCondition, baseAddon.Name, baseAddon.Namespace); err != nil { + if err := status.ReportStatus(context.Background(), client, tc.newCondition, baseAddon.Name, baseAddon.Namespace); err != nil { t.Fatalf("Error reporting status: %v", err) } newAddon := &oav1beta1.ObservabilityAddon{} @@ -219,7 +219,7 @@ func TestReportStatus_Conflict(t *testing.T) { conflictErr := errors.NewConflict(schema.GroupResource{Group: oav1beta1.GroupVersion.Group, Resource: "resource"}, name, fmt.Errorf("conflict")) c := newClientWithUpdateError(fakeClient, conflictErr) - if err := util.ReportStatus(context.Background(), c, util.Deployed, name, testNamespace); err == nil { + if err := status.ReportStatus(context.Background(), c, status.Deployed, name, testNamespace); err == nil { t.Fatalf("Conflict error should be retried and return an error if it fails") } if c.UpdateCallsCount() <= 1 { diff --git a/operators/endpointmetrics/pkg/util/error.go b/operators/endpointmetrics/pkg/util/error.go new file mode 100644 index 000000000..9e448fd2f --- /dev/null +++ b/operators/endpointmetrics/pkg/util/error.go @@ -0,0 +1,28 @@ +// Copyright (c) Red Hat, Inc. +// Copyright Contributors to the Open Cluster Management project +// Licensed under the Apache License 2.0 + +package util + +import ( + "net" + + "k8s.io/apimachinery/pkg/api/errors" +) + +// IsTransientClientErr checks if the error is a transient error +// This suggests that a retry (without any change) might be successful +func IsTransientClientErr(err error) bool { + if _, ok := err.(net.Error); ok { + return true + } + + if statusErr, ok := err.(*errors.StatusError); ok { + code := statusErr.Status().Code + if code >= 500 && code < 600 && code != 501 { + return true + } + } + + return errors.IsTimeout(err) || errors.IsServerTimeout(err) || errors.IsTooManyRequests(err) +} diff --git a/operators/multiclusterobservability/controllers/placementrule/manifestwork.go b/operators/multiclusterobservability/controllers/placementrule/manifestwork.go index 4cef3f8a3..eb4ecadee 100644 --- a/operators/multiclusterobservability/controllers/placementrule/manifestwork.go +++ b/operators/multiclusterobservability/controllers/placementrule/manifestwork.go @@ -409,6 +409,8 @@ func createManifestWorks( Name: "HUB_ENDPOINT_OPERATOR", Value: "true", }) + + dep.ObjectMeta.Name = config.HubEndpointOperatorName } endpointMetricsOperatorDeployCopy.Spec.Template.Spec = spec manifests = injectIntoWork(manifests, endpointMetricsOperatorDeployCopy) diff --git a/operators/pkg/config/config.go b/operators/pkg/config/config.go index 85a087f9b..bdf0e02e4 100644 --- a/operators/pkg/config/config.go +++ b/operators/pkg/config/config.go @@ -71,3 +71,9 @@ var ImageKeyNameMap = map[string]string{ var ( IsMCOTerminating = false ) + +const ( + DefaultClusterType = "" + OcpThreeClusterType = "ocp3" + SnoClusterType = "SNO" +)