diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 236f98d..1f3be99 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -79,6 +79,9 @@ func main() { klog.Fatal("could not create client for dns resources: ", err.Error()) } + // Initialize/start metrics server + util.InitMetricsServer() + // context for the reconciliation controller ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/cmd/server/server.go b/cmd/server/server.go index 4bd94d8..c489098 100644 --- a/cmd/server/server.go +++ b/cmd/server/server.go @@ -21,7 +21,11 @@ import ( func main() { klog.SetLogger(util.GetLogger()) subHandler := getSubscriptionHandler() - http.HandleFunc("/provision/", subHandler.HandleRequest) + + http.HandleFunc("/provision/", util.InstrumentHttpHandler(subHandler.HandleRequest, "cap_op_subscription_requests", "subscription-server requests.")) + + // Initialize/start metrics server + util.InitMetricsServer() // Default port port := "4000" diff --git a/go.mod b/go.mod index b461286..bb8e952 100644 --- a/go.mod +++ b/go.mod @@ -13,8 +13,8 @@ require ( github.com/google/go-cmp v0.6.0 github.com/google/uuid v1.6.0 github.com/lestrrat-go/jwx/v2 v2.1.2 - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.2 - github.com/prometheus-operator/prometheus-operator/pkg/client v0.77.2 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1 + github.com/prometheus-operator/prometheus-operator/pkg/client v0.78.1 github.com/prometheus/client_golang v1.20.5 github.com/prometheus/common v0.60.1 go.uber.org/zap v1.27.0 @@ -32,6 +32,8 @@ require ( ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 // indirect github.com/emicklei/go-restful/v3 v3.12.1 // indirect @@ -48,6 +50,7 @@ require ( github.com/imdario/mergo v0.3.16 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/klauspost/compress v1.17.11 // indirect github.com/lestrrat-go/blackmagic v1.0.2 // indirect github.com/lestrrat-go/httpcc v1.0.1 // indirect github.com/lestrrat-go/httprc v1.0.6 // indirect @@ -59,6 +62,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/segmentio/asm v1.2.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/x448/float16 v0.8.4 // indirect @@ -72,15 +76,15 @@ require ( golang.org/x/text v0.19.0 // indirect golang.org/x/time v0.7.0 // indirect golang.org/x/tools v0.26.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20241021214115-324edc3d5d38 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20241104194629-dd2ea8efbc28 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiextensions-apiserver v0.31.2 // indirect k8s.io/gengo/v2 v2.0.0-20240826214909-a7b603a56eb7 // indirect - k8s.io/kube-openapi v0.0.0-20241009091222-67ed5848f094 // indirect - k8s.io/utils v0.0.0-20240921022957-49e7df575cb6 // indirect + k8s.io/kube-openapi v0.0.0-20240903163716-9e1beecbcb38 // indirect + k8s.io/utils v0.0.0-20241104163129-6fe5fd82f078 // 
indirect sigs.k8s.io/controller-runtime v0.19.1 // indirect sigs.k8s.io/gateway-api v1.2.0 // indirect sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect diff --git a/go.sum b/go.sum index 4b88956..6c69385 100644 --- a/go.sum +++ b/go.sum @@ -64,10 +64,14 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lestrrat-go/blackmagic v1.0.2 h1:Cg2gVSc9h7sz9NOByczrbUvLopQmXrfFx//N+AkAr5k= github.com/lestrrat-go/blackmagic v1.0.2/go.mod h1:UrEqBzIR2U6CnzVyUtfM6oZNMt/7O7Vohk2J0OGSAtU= github.com/lestrrat-go/httpcc v1.0.1 h1:ydWCStUeJLkpYyjLDHihupbn2tYmZ7m22BGkcvZZrIE= @@ -100,10 +104,10 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.2 h1:F/MALZ518KfI1zEg+Kg8/uTzoXKDyqw+LNC/5irJlJE= -github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.2/go.mod h1:D0KY8md81DQKdaR/cXwnhoWB3MYYyc/UjvqE8GFkIvA= -github.com/prometheus-operator/prometheus-operator/pkg/client v0.77.2 h1:rEBVlnqBSkNc6ufXnDu/GtSyS2jL1l6imtufzJUfOXI= -github.com/prometheus-operator/prometheus-operator/pkg/client v0.77.2/go.mod h1:Q+R7L3DIlJ2a8R+W+7wjk0B25Ci/VTy7V07Zvq327x0= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1 h1:Fm9Z+FabnB+6EoGq15j+pyLmaK6hYrYOpBlTzOLTQ+E= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.78.1/go.mod h1:SvsRXw4m1F2vk7HquU5h475bFpke27mIUswfyw9u3ug= +github.com/prometheus-operator/prometheus-operator/pkg/client v0.78.1 h1:Wn7xwtLvf4xEahQ8/VgtbhMLnu4JD5gcd3bbPPxVKEE= +github.com/prometheus-operator/prometheus-operator/pkg/client v0.78.1/go.mod h1:JnLIE/lPIVgHiUNaY5y7MVf+J/V7vS0dicM5KsgGp3I= github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= @@ -181,8 +185,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors 
v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/genproto/googleapis/api v0.0.0-20241021214115-324edc3d5d38 h1:2oV8dfuIkM1Ti7DwXc0BJfnwr9csz4TDXI9EmiI+Rbw= -google.golang.org/genproto/googleapis/api v0.0.0-20241021214115-324edc3d5d38/go.mod h1:vuAjtvlwkDKF6L1GQ0SokiRLCGFfeBUXWr/aFFkHACc= +google.golang.org/genproto/googleapis/api v0.0.0-20241104194629-dd2ea8efbc28 h1:M0KvPgPmDZHPlbRbaNU1APr28TvwvvdUPlSv7PUvy8g= +google.golang.org/genproto/googleapis/api v0.0.0-20241104194629-dd2ea8efbc28/go.mod h1:dguCy7UOdZhTvLzDyt15+rOrawrpM4q7DD9dQ1P11P4= google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -216,10 +220,10 @@ k8s.io/gengo/v2 v2.0.0-20240826214909-a7b603a56eb7 h1:cErOOTkQ3JW19o4lo91fFurouh k8s.io/gengo/v2 v2.0.0-20240826214909-a7b603a56eb7/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20241009091222-67ed5848f094 h1:MErs8YA0abvOqJ8gIupA1Tz6PKXYUw34XsGlA7uSL1k= -k8s.io/kube-openapi v0.0.0-20241009091222-67ed5848f094/go.mod h1:7ioBJr1A6igWjsR2fxq2EZ0mlMwYLejazSIc2bzMp2U= -k8s.io/utils v0.0.0-20240921022957-49e7df575cb6 h1:MDF6h2H/h4tbzmtIKTuctcwZmY0tY9mD9fNT47QO6HI= -k8s.io/utils v0.0.0-20240921022957-49e7df575cb6/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20240903163716-9e1beecbcb38 h1:1dWzkmJrrprYvjGwh9kEUxmcUV/CtNU8QM7h1FLWQOo= +k8s.io/kube-openapi v0.0.0-20240903163716-9e1beecbcb38/go.mod h1:coRQXBK9NxO98XUv3ZD6AK3xzHCxV6+b7lrquKwaKzA= +k8s.io/utils v0.0.0-20241104163129-6fe5fd82f078 h1:jGnCPejIetjiy2gqaJ5V0NLwTpF4wbQ6cZIItJCSHno= +k8s.io/utils v0.0.0-20241104163129-6fe5fd82f078/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= sigs.k8s.io/gateway-api v1.2.0 h1:LrToiFwtqKTKZcZtoQPTuo3FxhrrhTgzQG0Te+YGSo8= diff --git a/internal/controller/common_test.go b/internal/controller/common_test.go index 83f62a9..6acb87f 100644 --- a/internal/controller/common_test.go +++ b/internal/controller/common_test.go @@ -324,6 +324,9 @@ func eventDrain(ctx context.Context, c *Controller, t *testing.T) { func reconcileTestItem(ctx context.Context, t *testing.T, item QueueItem, data TestData) (err error) { // run inside a test sub-context to maintain test case name with reference to backlog items t.Run(strings.Join(append([]string{data.description}, data.backlogItems...), " "), func(t *testing.T) { + // Deregister metrics + defer deregisterMetrics() + c := initializeControllerForReconciliationTests(t, data.mockErrorForResources, data.discoverResources) go eventDrain(ctx, c, t) diff --git a/internal/controller/controller.go b/internal/controller/controller.go index fac6610..d1b8a2f 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -57,6 +57,8 @@ type Controller struct { } func NewController(client kubernetes.Interface, crdClient versioned.Interface, istioClient 
istio.Interface, gardenerCertificateClient gardenerCert.Interface, certManagerCertificateClient certManager.Interface, gardenerDNSClient gardenerDNS.Interface, promClient promop.Interface) *Controller { + // Register metrics and the workqueue metrics provider + initializeMetrics() queues := map[int]workqueue.TypedRateLimitingInterface[QueueItem]{ ResourceCAPApplication: workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[QueueItem](), workqueue.TypedRateLimitingQueueConfig[QueueItem]{Name: KindMap[ResourceCAPApplication]}), @@ -134,6 +136,8 @@ func (c *Controller) Start(ctx context.Context) { for _, q := range c.queues { q.ShutDown() } + // Deregister metrics on shutdown + deregisterMetrics() }() c.initializeInformers() @@ -259,6 +263,7 @@ func (c *Controller) processQueueItem(ctx context.Context, key int) error { // Handle reconcile errors if err != nil { klog.ErrorS(err, "queue processing error", "resource", getResourceKindFromKey(key)) + ReconcileErrors.WithLabelValues(getResourceKindFromKey(item.Key), item.ResourceKey.Namespace, item.ResourceKey.Name).Inc() if !skipItem { // add back to queue for re-processing q.AddRateLimited(item) @@ -309,6 +314,7 @@ func (c *Controller) recoverFromPanic(ctx context.Context, item QueueItem, q wor default: c.setCAStatusError(ctx, item.ResourceKey, err) } + Panics.WithLabelValues(getResourceKindFromKey(item.Key), item.ResourceKey.Namespace, item.ResourceKey.Name).Inc() // Add the item back to the queue to be processed again with a RateLimited delay q.AddRateLimited(item) diff --git a/internal/controller/controller_test.go b/internal/controller/controller_test.go index 1791629..0c840c5 100644 --- a/internal/controller/controller_test.go +++ b/internal/controller/controller_test.go @@ -227,6 +227,9 @@ func TestController_processQueueItem(t *testing.T) { cat = createCatCRO("ca-does-not-exist", "provider", true) } + // Deregister metrics + defer deregisterMetrics() + c := getTestController(testResources{cas: []*v1alpha1.CAPApplication{ca}, cats: []*v1alpha1.CAPTenant{cat}, preventStart: true}) if tt.resource == 9 || tt.resource == 99 { c.queues[tt.resource] = workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[QueueItem](), workqueue.TypedRateLimitingQueueConfig[QueueItem]{}) @@ -401,11 +404,13 @@ func TestController_recoverFromPanic(t *testing.T) { defer cancel() + defer deregisterMetrics() + if tt.expectPanic { panic("Simulate some panic during reconcile") } - // There is no need to check for results in this test as in case of errros the panic raised above will not be reovered! + // There is no need to check for results in this test as in case of errors the panic raised above will not be recovered!
}) } diff --git a/internal/controller/informers_test.go b/internal/controller/informers_test.go index f64dd1f..b960ab9 100644 --- a/internal/controller/informers_test.go +++ b/internal/controller/informers_test.go @@ -84,6 +84,9 @@ func TestController_initializeInformers(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + // Deregister metrics + defer deregisterMetrics() + c := getTestController(testResources{}) expectedResult = false diff --git a/internal/controller/metrics.go b/internal/controller/metrics.go new file mode 100644 index 0000000..53bac34 --- /dev/null +++ b/internal/controller/metrics.go @@ -0,0 +1,182 @@ +/* +SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and cap-operator contributors +SPDX-License-Identifier: Apache-2.0 +*/ + +package controller + +import ( + "os" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/client-go/util/workqueue" +) + +// Constants for the metrics const ( + CAPOp = "cap_op" + Queue = "queue" + // Metrics for workqueue + Depth = "depth" + Adds = "adds_total" + QueueLatency = "latency_seconds" + WorkDuration = "work_duration_seconds" + UnfinishedWork = "unfinished_work_seconds" + LongestRunningProcessor = "longest_running_processor_seconds" + Retries = "retries_total" +) + +var ( + // Metrics for CROs in Error (Kind along with namespace & name of the CRO) + ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: CAPOp, + Name: "reconcile_errors", + Help: "Resources that failed to reconcile", + }, []string{"kind", "namespace", "name"}) + + // Metrics for CROs in Panic (namespace-name of the CRO) + Panics = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: CAPOp, + Name: "panics", + Help: "Resources that caused a panic", + }, []string{"kind", "namespace", "name"}) + + // Metrics for overall tenant operations + TenantOperations = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: CAPOp, + Name: "tenant_operations", + Help: "Overall number of tenant operations", + }, []string{"app", "operation"}) + + // Metrics for TenantOperation Failures (with app, operation, namespace & name of the tenant operation) + TenantOperationFailures = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: CAPOp, + Name: "tenant_operation_failures", + Help: "Tenant operations that failed to complete", + }, []string{"app", "operation", "tenant_id", "namespace", "name"}) + + // Metrics for duration of TenantOperations (could, for example, help determine the duration of the SaaS provisioning callback) + LastTenantOperationDuration = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: CAPOp, + Name: "last_tenant_operation_duration_seconds", + Help: "Duration of last tenant operation in seconds", + }, []string{"app", "tenant_id"}) + + /** + Note: + All the metrics below are for the CAP Operator controller workqueue, + used for handling CAP Operator resources. + These need to be explicitly defined here along with a capOperatorMetricsProvider, + as we have our own controller/workqueue implementation.
+ **/ + + depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: CAPOp, + Subsystem: Queue, + Name: Depth, + Help: "Depth of workqueue", + }, []string{"name"}) + + adds = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: CAPOp, + Subsystem: Queue, + Name: Adds, + Help: "Adds to workqueue", + }, []string{"name"}) + + latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: CAPOp, + Subsystem: Queue, + Name: QueueLatency, + Help: "Latency of workqueue", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10), + }, []string{"name"}) + + workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: CAPOp, + Subsystem: Queue, + Name: WorkDuration, + Help: "Processing time of workqueue", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10), + }, []string{"name"}) + + unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: CAPOp, + Subsystem: Queue, + Name: UnfinishedWork, + Help: "Unfinished work in workqueue", + }, []string{"name"}) + + longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: CAPOp, + Subsystem: Queue, + Name: LongestRunningProcessor, + Help: "Longest running processor in workqueue", + }, []string{"name"}) + + retries = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: CAPOp, + Subsystem: Queue, + Name: Retries, + Help: "Retries in workqueue", + }, []string{"name"}) +) + +// Create a variable to hold all the collectors +var collectors = []prometheus.Collector{ReconcileErrors, Panics, TenantOperations, depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries} + +// #region capOperatorMetricsProvider +// capOperatorMetricsProvider implements workqueue.MetricsProvider +type capOperatorMetricsProvider struct { +} + +func (capOperatorMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric { + return depth.WithLabelValues(name) +} + +func (capOperatorMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric { + return adds.WithLabelValues(name) +} + +func (capOperatorMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric { + return latency.WithLabelValues(name) +} + +func (capOperatorMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric { + return workDuration.WithLabelValues(name) +} + +func (capOperatorMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric { + return unfinished.WithLabelValues(name) +} + +func (capOperatorMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric { + return longestRunningProcessor.WithLabelValues(name) +} + +func (capOperatorMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric { + return retries.WithLabelValues(name) +} + +// #endregion + +// Initialize the metrics +func initializeMetrics() { + // Parse the DETAILED_OPERATION_METRICS environment variable to determine if detailed operation metrics are needed + if os.Getenv("DETAILED_OPERATION_METRICS") == "true" { + collectors = append(collectors, TenantOperationFailures, LastTenantOperationDuration) + } + + // Register CAP Operator metrics + prometheus.MustRegister(collectors...)
+ + // Register CAP Operator metrics provider as the workqueue metrics provider (needed for the workqueue metrics, to be done just once) + workqueue.SetProvider(capOperatorMetricsProvider{}) +} + +func deregisterMetrics() { + // Un-register CAP Operator metrics + for _, collector := range collectors { + prometheus.Unregister(collector) + } +} diff --git a/internal/controller/reconcile-captenantoperation.go b/internal/controller/reconcile-captenantoperation.go index 2dd0617..1fb72cc 100644 --- a/internal/controller/reconcile-captenantoperation.go +++ b/internal/controller/reconcile-captenantoperation.go @@ -211,6 +211,8 @@ func (c *Controller) reconcileTenantOperationSteps(ctx context.Context, ctop *v1 if err != nil { c.Event(ctop, nil, corev1.EventTypeWarning, CAPTenantOperationConditionReasonStepProcessingError, EventActionTrackJob, err.Error()) } + // collectTenantOperationMetrics is called here to ensure that the metrics are collected just once for every "completion" of the tenant operation. + collectTenantOperationMetrics(ctop) }() if ctop.Status.CurrentStep == nil { // set initial step @@ -695,3 +697,19 @@ func getCTOPEnv(params *jobCreateParams, ctop *v1alpha1.CAPTenantOperation, step return env } + +// Collect tenant operation metrics based on the status of the tenant operation +func collectTenantOperationMetrics(ctop *v1alpha1.CAPTenantOperation) { + if isCROConditionReady(ctop.Status.GenericStatus) { + // Collect/Increment overall completed tenant operation metrics + TenantOperations.WithLabelValues(ctop.Labels[LabelBTPApplicationIdentifierHash], string(ctop.Spec.Operation)).Inc() + + if ctop.Status.State == v1alpha1.CAPTenantOperationStateFailed { + // Collect/Increment failed tenant operation metrics with CRO details + TenantOperationFailures.WithLabelValues(ctop.Labels[LabelBTPApplicationIdentifierHash], string(ctop.Spec.Operation), ctop.Spec.TenantId, ctop.Namespace, ctop.Name).Inc() + } + + // Collect tenant operation duration metrics based on creation time of the tenant operation and current time + LastTenantOperationDuration.WithLabelValues(ctop.Labels[LabelBTPApplicationIdentifierHash], ctop.Spec.TenantId).Set(time.Since(ctop.CreationTimestamp.Time).Seconds()) + } +} diff --git a/internal/controller/reconcile-captenantoperation_test.go b/internal/controller/reconcile-captenantoperation_test.go index 7900188..73acd43 100644 --- a/internal/controller/reconcile-captenantoperation_test.go +++ b/internal/controller/reconcile-captenantoperation_test.go @@ -7,6 +7,7 @@ package controller import ( "context" + "os" "testing" ) @@ -51,6 +52,10 @@ func TestTenantOperationInitializeStep(t *testing.T) { } func TestTenantOperationWithNoSteps(t *testing.T) { + // Env set for this test to enable coverage of detailed metrics; this has no impact on the tenant operation code/test as such.
+ detailedMetrics := "DETAILED_OPERATION_METRICS" + defer os.Unsetenv(detailedMetrics) + os.Setenv(detailedMetrics, "true") err := reconcileTestItem( context.TODO(), t, QueueItem{Key: ResourceCAPTenantOperation, ResourceKey: NamespacedResourceKey{Namespace: "default", Name: "test-cap-01-provider-abcd"}}, diff --git a/internal/controller/reconcile-domains_test.go b/internal/controller/reconcile-domains_test.go index b91e89d..3061d6d 100644 --- a/internal/controller/reconcile-domains_test.go +++ b/internal/controller/reconcile-domains_test.go @@ -184,6 +184,9 @@ func TestController_reconcileOperatorDomains(t *testing.T) { ingressRes = createIngressResource(ingressGWName, ca, dns) } + // Deregister metrics + defer deregisterMetrics() + c = getTestController(testResources{ cas: []*v1alpha1.CAPApplication{ca, ca2}, ingressGW: []*ingressResources{ingressRes}, @@ -224,6 +227,10 @@ func TestController_reconcileOperatorDomains(t *testing.T) { } } ca.Spec.Domains.Secondary = []string{"2" + secondaryDomain, "3" + secondaryDomain} + + // Deregister metrics before starting new controller again + deregisterMetrics() + c = getTestController(testResources{ cas: []*v1alpha1.CAPApplication{ca, ca2}, gateway: gw, @@ -266,6 +273,10 @@ func TestController_reconcileOperatorDomains(t *testing.T) { ca2.Spec.Domains.IstioIngressGatewayLabels[1].Value += "2" ingressGW2 = createIngressResource(ingressGWName+"2", ca2, "Something.that.surely.exceeds.the.64char.limit."+dnsTarget) } + + // Deregister metrics before starting new controller again + deregisterMetrics() + c = getTestController(testResources{ cas: []*v1alpha1.CAPApplication{ca, ca2}, gateway: gw, @@ -273,6 +284,7 @@ func TestController_reconcileOperatorDomains(t *testing.T) { certManagerCert: certManagerCert, ingressGW: []*ingressResources{ingressRes, ingressGW2}, }) + err = c.reconcileOperatorDomains(context.TODO(), q, 0) if (err != nil) != tt.wantErr { t.Errorf("Controller.reconcileOperatorDomains() error = %v, wantErr %v", err, tt.wantErr) diff --git a/internal/controller/reconcile_test.go b/internal/controller/reconcile_test.go index 3352d3f..44bdacf 100644 --- a/internal/controller/reconcile_test.go +++ b/internal/controller/reconcile_test.go @@ -486,6 +486,9 @@ func TestGetLatestReadyCAPApplicationVersion(t *testing.T) { cavs = append(cavs, cav) } + // Deregister metrics at the end of the test + defer deregisterMetrics() + c := getTestController(testResources{ cas: []*v1alpha1.CAPApplication{ca}, cavs: cavs, @@ -578,6 +581,9 @@ func TestGetLatestCAPApplicationVersion(t *testing.T) { cavs = append(cavs, cav) } + // Deregister metrics at the end of the test + defer deregisterMetrics() + c := getTestController(testResources{ cas: []*v1alpha1.CAPApplication{ca}, cavs: cavs, diff --git a/internal/controller/version-monitoring_test.go b/internal/controller/version-monitoring_test.go index 5e19cac..124ecc4 100644 --- a/internal/controller/version-monitoring_test.go +++ b/internal/controller/version-monitoring_test.go @@ -114,6 +114,9 @@ func setupTestControllerWithInitialResources(t *testing.T, initialResources []st } func TestGracefulShutdownMonitoringRoutines(t *testing.T) { + // Deregister metrics at the end of the test + defer deregisterMetrics() + c := setupTestControllerWithInitialResources(t, []string{}) s, _ := getPromServer(false, []queryTestCase{}) @@ -213,6 +216,9 @@ func TestVersionSelectionForCleanup(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { + // Deregister metrics at the end of the test + defer 
deregisterMetrics() + c := setupTestControllerWithInitialResources(t, tc.resources) orc := &cleanupOrchestrator{queue: workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[NamespacedResourceKey]())} defer orc.queue.ShutDown() @@ -557,6 +563,10 @@ func TestVersionCleanupEvaluation(t *testing.T) { defer s.Close() o := initializeVersionCleanupOrchestrator(context.TODO(), &monitoringEnv{address: s.URL, acquireClientRetryDelay: 1 * time.Minute}) defer o.queue.ShutDown() + + // Deregister metrics at the end of the test + defer deregisterMetrics() + c := setupTestControllerWithInitialResources(t, tt.startResources) item := NamespacedResourceKey{Namespace: "default", Name: tt.evaluatedVersion} o.queue.Add(item) diff --git a/internal/util/log.go b/internal/util/log.go index 16f8ae7..33c18bd 100644 --- a/internal/util/log.go +++ b/internal/util/log.go @@ -1,3 +1,8 @@ +/* +SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and cap-operator contributors +SPDX-License-Identifier: Apache-2.0 +*/ + package util import ( diff --git a/internal/util/log_test.go b/internal/util/log_test.go index b70b4d7..2ffbd79 100644 --- a/internal/util/log_test.go +++ b/internal/util/log_test.go @@ -1,3 +1,8 @@ +/* +SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and cap-operator contributors +SPDX-License-Identifier: Apache-2.0 +*/ + package util import ( diff --git a/internal/util/metrics-server.go b/internal/util/metrics-server.go new file mode 100644 index 0000000..c17df73 --- /dev/null +++ b/internal/util/metrics-server.go @@ -0,0 +1,55 @@ +/* +SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and cap-operator contributors +SPDX-License-Identifier: Apache-2.0 +*/ + +package util + +import ( + "net/http" + "os" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/klog/v2" +) + +// Initializes the metrics server with default port 9090 and path /metrics based on the default prometheus client +func InitMetricsServer() { + // Expose /metrics HTTP endpoint + go func() { + // Default port + metricsPort := "9090" + + // Get Port from env + portEnv := os.Getenv("METRICS_PORT") + if portEnv != "" { + metricsPort = portEnv + } + http.Handle("/metrics", promhttp.Handler()) + klog.Fatal(http.ListenAndServe(":"+metricsPort, nil)) + }() +} + +// Instruments the given HTTP handler with counter (total requests) and gauge (in flight requests) metrics +func InstrumentHttpHandler(handler func(http.ResponseWriter, *http.Request), metricNamePrefix string, helpTextSuffix string) http.HandlerFunc { + klog.InfoS("Instrumenting HTTP handler", "metricPrefix", metricNamePrefix, "helpSuffix", helpTextSuffix) + return promhttp.InstrumentHandlerCounter( + promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: metricNamePrefix + "_total", + Help: "Total " + helpTextSuffix, + }, + []string{"code", "method"}, + ), + promhttp.InstrumentHandlerInFlight(promauto.NewGauge( + prometheus.GaugeOpts{ + Name: metricNamePrefix + "_in_flight", + Help: "Current " + helpTextSuffix, + }, + ), + http.HandlerFunc(handler), + ), + ) +} diff --git a/sonar-project.properties b/sonar-project.properties index 517d2ca..e33a6a8 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -12,7 +12,7 @@ sonar.exclusions=**/*_test.go,**/*.yaml sonar.tests=.
sonar.test.inclusions=**/*_test.go -sonar.coverage.exclusions=**/main.go,**/server.go +sonar.coverage.exclusions=**/main.go,**/*server.go sonar.go.coverage.reportPaths=coverage.out diff --git a/website/content/en/docs/installation/helm/_index.md b/website/content/en/docs/installation/helm/_index.md index cdf260d..f2ae0a7 100644 --- a/website/content/en/docs/installation/helm/_index.md +++ b/website/content/en/docs/installation/helm/_index.md @@ -32,19 +32,29 @@ Create a namespace and install the Helm chart in that namespace by specifying the domain: cap-operator. ``` -## (Optional) Setup Prometheus Integration for _Version Monitoring_ - -To use the Version Monitoring feature of the CAP Operator, a [Prometheus](https://prometheus.io/) server URL can be provided to the CAP Operator. When installing the CAP Operator using the Helm chart, the following values can be specified in the values: -```yaml -controller: - versionMonitoring: - prometheusAddress: "http://prometheus-operated.monitoring.svc.cluster.local:9090" # <-- example of a Prometheus server running inside the same cluster - promClientAcquireRetryDelay: "2h" - metricsEvaluationInterval: "30m" # <-- duration after which version metrics are evaluated -``` -When the controller is started, the operator will try to connect to the Prometheus server and fetch [runtime information](https://prometheus.io/docs/prometheus/latest/querying/api/#runtime-information) to verify the connection. If the connection is not successful, it will be retried after the duration specified as `controller.versionMonitoring.promClientAcquireRetryDelay`. Check default values for these attributes [here](helm-values.md). - -{{% alert title="Note" color="info" %}} -- When connecting the controller to a Prometheus server running inside the cluster, please ensure that `NetworkPolicies` required for connecting to the service in the namespace where Prometheus is running are also created. -- If the Prometheus service is configured to use TLS, the relevant CA root certificates which need to be trusted can be mounted as volumes to the controller. -{{% /alert %}} +## Optional steps + +- #### Enable Service Monitors for metrics emitted by controller and subscription server + + To enable monitoring via [metrics](docs/usage/operator-metrics) emitted by CAP Operator components, the following value can be specified: + ```yaml + monitoring: + enabled: true # <-- This enables creation of service monitors, for metrics emitted by the cap operator components + ``` + +- #### Set up Prometheus Integration for _Version Monitoring_ + + To use the [Version Monitoring](docs/usage/version-monitoring/) feature of the CAP Operator, a [Prometheus](https://prometheus.io/) server URL can be provided to the CAP Operator. When installing the CAP Operator using the Helm chart, the following can be specified in the values: + ```yaml + controller: + versionMonitoring: + prometheusAddress: "http://prometheus-operated.monitoring.svc.cluster.local:9090" # <-- example of a Prometheus server running inside the same cluster + promClientAcquireRetryDelay: "2h" + metricsEvaluationInterval: "30m" # <-- duration after which version metrics are evaluated + ``` + When the controller is started, the operator will try to connect to the Prometheus server and fetch [runtime information](https://prometheus.io/docs/prometheus/latest/querying/api/#runtime-information) to verify the connection.
If the connection is not successful, it will be retried after the duration specified as `controller.versionMonitoring.promClientAcquireRetryDelay`. Check default values for these attributes [here](helm-values.md). + + {{% alert title="Note" color="info" %}} + - When connecting the controller to a Prometheus server running inside the cluster, please ensure that `NetworkPolicies` required for connecting to the service in the namespace where Prometheus is running are also created. + - If the Prometheus service is configured to use TLS, the relevant CA root certificates which need to be trusted can be mounted as volumes to the controller. + {{% /alert %}} diff --git a/website/content/en/docs/usage/operator-metrics.md b/website/content/en/docs/usage/operator-metrics.md new file mode 100644 index 0000000..a7a561c --- /dev/null +++ b/website/content/en/docs/usage/operator-metrics.md @@ -0,0 +1,62 @@ +--- +title: "Operator Metrics" +linkTitle: "Operator Metrics" +weight: 60 +type: "docs" +description: > + How to monitor and consume metrics emitted by CAP Operator +--- + +The [Controller](docs/concepts/operator-components/controller/) and [Subscription Server](docs/concepts/operator-components/subscription-server/) now emit [Prometheus metrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus) on port `9090` at the `/metrics` path, which may be used by consumers to analyze and understand usage, detect potential issues, and monitor and scale cluster resources. +You can enable scraping of these metrics by setting the Helm chart value `monitoring.enabled` to `true`, which creates service monitors for both components. + +### Controller metrics +The controller emits [standard go metrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus/collectors#WithGoCollectorRuntimeMetrics), workqueue metrics for the resources being reconciled (implemented via the [MetricsProvider](https://pkg.go.dev/k8s.io/client-go/util/workqueue#MetricsProvider) interface), and the following additional metrics: + +{{% pageinfo %}} +``` +cap_op_reconcile_errors{kind="CAPApplication",name="my-app",namespace="app"} 11 +``` +a counter type metric indicating total resources that failed to reconcile for each Kind. + +--- + +``` +cap_op_tenant_operations{app="",operation="provisioning"} 83 +``` +a counter type metric that provides some insights into the overall number of tenant operations. +{{% /pageinfo %}} + +By setting the environment variable `DETAILED_OPERATION_METRICS` to `"true"`, one can optionally also see these detailed operational metrics: + +{{% pageinfo %}} +``` +cap_op_tenant_operation_failures{app="",operation="upgrade",tenant_id="",namespace="app",name="my-app-tenant-op-xxyyz"} 2 +``` +a counter type metric that provides some insights into failed tenant operations per app and tenant, along with the name and namespace of the failed operation resource. + +--- + +``` +cap_op_last_tenant_operation_duration_seconds{app="",tenant_id=""} 42 +``` +a gauge type metric that reports the duration in seconds of the last tenant operation for an app and tenant. +{{% /pageinfo %}} + + +### Subscription Server metrics +The subscription server emits [standard go metrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus/collectors#WithGoCollectorRuntimeMetrics), and the following HTTP handler specific metrics: + +{{% pageinfo %}} +``` +cap_op_subscription_requests_total{code="202",method="POST"} 82 +``` +a counter type metric indicating the total requests triggered for subscription, based on HTTP method and response code.
+ + --- + ``` +cap_op_subscription_requests_in_flight{} 1 +``` +a gauge type metric indicating the subscription requests currently being processed by the handler. +{{% /pageinfo %}}\ No newline at end of file diff --git a/website/content/en/docs/usage/resources/_index.md b/website/content/en/docs/usage/resources/_index.md index a41a4aa..81a96ec 100644 --- a/website/content/en/docs/usage/resources/_index.md +++ b/website/content/en/docs/usage/resources/_index.md @@ -1,7 +1,7 @@ --- title: "Resources" linkTitle: "Resources" -weight: 60 +weight: 70 type: "docs" description: > Detailed configuration of resources managed by CAP Operator diff --git a/website/content/en/docs/usage/resources/captenantoutput.md b/website/content/en/docs/usage/resources/captenantoutput.md index 7ba932e..c369bff 100644 --- a/website/content/en/docs/usage/resources/captenantoutput.md +++ b/website/content/en/docs/usage/resources/captenantoutput.md @@ -7,7 +7,7 @@ description: > How to configure the `CAPTenantOutput` resource --- -The [`CAPTenantOutput`](https://sap.github.io/cap-operator/docs/reference/#sme.sap.com/v1alpha1.CAPTenantOutput) may be used to add additional data to the asynchronous callback parameters from the SaaS provisioning service during tenant onboarding. The resource is not reconciled but just consumed by the subscription server to generate additional data. It has the following structure: +The [`CAPTenantOutput`](docs/reference/#sme.sap.com/v1alpha1.CAPTenantOutput) may be used to add additional data to the asynchronous callback parameters from the SaaS provisioning service during tenant onboarding. The resource is not reconciled but just consumed by the subscription server to generate additional data. It has the following structure: ```yaml apiVersion: sme.sap.com/v1alpha1 diff --git a/website/content/en/docs/usage/tenant-provisioning.md b/website/content/en/docs/usage/tenant-provisioning.md index 2684838..11f44a4 100644 --- a/website/content/en/docs/usage/tenant-provisioning.md +++ b/website/content/en/docs/usage/tenant-provisioning.md @@ -25,7 +25,7 @@ spec: ## Tenant Provisioning -The process of tenant provisioning starts when a consumer subaccount subscribes to the application, either via the SAP BTP cockpit or using the APIs provided by the SaaS provisioning service. This, in turn, initiates the asynchronous callback from the SaaS provisioning service instance into the cluster, and the request is handled by the [subscription server]({{< ref "docs/concepts/operator-components/subscription-server.md" >}}). The subscription server validates the request and creates an instance of `CAPTenant` for the identified `CAPApplication`. +The process of tenant provisioning starts when a consumer subaccount subscribes to the application, either via the SAP BTP cockpit or using the APIs provided by the SaaS provisioning service. This, in turn, initiates the asynchronous callback from the SaaS provisioning service instance into the cluster, and the request is handled by the [subscription server](docs/concepts/operator-components/subscription-server). The subscription server validates the request and creates an instance of `CAPTenant` for the identified `CAPApplication`. {{< alert color="warning" title="Warning" >}} An instance of `CAPTenant` must not be created or deleted manually within the cluster. A new instance has to be created by the subscription server after receiving a provisioning call from SaaS provisioning service.
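The `defer deregisterMetrics()` calls threaded through the tests above are needed because `prometheus.MustRegister` panics when a collector with an already-registered fully-qualified name is registered again on the default registry, and every test constructs a fresh controller that calls `initializeMetrics()`. A minimal standalone sketch of that behavior (the `cap_op_demo_total` counter is hypothetical, for illustration only):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Hypothetical counter; any collector behaves the same way.
	c := prometheus.NewCounter(prometheus.CounterOpts{Name: "cap_op_demo_total", Help: "demo counter"})

	prometheus.MustRegister(c) // first registration succeeds
	// A second MustRegister of the same name would panic with an
	// AlreadyRegisteredError; Unregister between runs (as in
	// deregisterMetrics) prevents that.
	prometheus.Unregister(c)
	prometheus.MustRegister(c) // succeeds again after the Unregister
	fmt.Println("re-registered without panic")
}
```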
diff --git a/website/content/en/docs/whats-new.md b/website/content/en/docs/whats-new.md index 8c03431..cd6b595 100644 --- a/website/content/en/docs/whats-new.md +++ b/website/content/en/docs/whats-new.md @@ -6,14 +6,25 @@ description: > Discover new features added to CAP Operator --- +#### New updates + {{% cardpane %}} + {{% card header="Q4 2024" %}} + CAP Operator now emits Prometheus metrics for [Controller](docs/concepts/operator-components/controller/) and [Subscription Server](docs/concepts/operator-components/subscription-server/) components. Learn more about [metrics](./usage/operator-metrics.md). + {{% /card %}} {{% card header="Q3 2024" %}} Define monitoring configuration on version workloads which allow outdated versions to be automatically cleaned up based on usage. Learn more about [Version Monitoring](./usage/version-monitoring.md). {{% /card %}} {{% card header="Q3 2024" %}} New Custom Resource `CAPTenantOutput` can be used to record subscription related data from tenant operations. [Learn more](./usage/resources/captenantoutput.md). {{% /card %}} +{{% /cardpane %}} + + +#### Previous updates + +{{% cardpane %}} {{% card header="Q2 2024" %}} `CAPApplicationVersion` now supports configuration of `initContainers`, `volumes`, `serviceAccountName`, [scheduling related configurations](https://kubernetes.io/docs/concepts/scheduling-eviction/) etc. on workloads. {{% /card %}} -{{% /cardpane %}} +{{% /cardpane %}} \ No newline at end of file diff --git a/website/includes/chart-values.md b/website/includes/chart-values.md index e708ed2..ad166ad 100644 --- a/website/includes/chart-values.md +++ b/website/includes/chart-values.md @@ -13,6 +13,8 @@ | topologySpreadConstraints | list | `[]` | Default topology spread constraints (can be overwritten on component level) | | podLabels | object | `{}` | Additional pod labels for all components | | podAnnotations | object | `{}` | Additional pod annotations for all components | +| monitoring | object | `{"enabled":false}` | Monitoring configuration for all components | +| monitoring.enabled | bool | `false` | Optionally enable Prometheus monitoring for all components (disabled by default) | | controller.replicas | int | `1` | Replicas | | controller.image.repository | string | `"ghcr.io/sap/cap-operator/controller"` | Image repository | | controller.image.tag | string | `""` | Image tag | @@ -34,6 +36,7 @@ | controller.volumes | list | `[]` | Optionally specify list of additional volumes for the controller pod(s) | | controller.volumeMounts | list | `[]` | Optionally specify list of additional volumeMounts for the controller container(s) | | controller.dnsTarget | string | `""` | The dns target mentioned on the public ingress gateway service used in the cluster | +| controller.detailedOperationMetrics | bool | `false` | Optionally enable detailed operational metrics for the controller by setting this to true | | controller.versionMonitoring.prometheusAddress | string | `""` | The URL of the Prometheus server from which metrics related to managed application versions can be queried | | controller.versionMonitoring.metricsEvaluationInterval | string | `"1h"` | The duration (example 2h) after which versions are evaluated for deletion; based on specified workload metrics | | controller.versionMonitoring.promClientAcquireRetryDelay | string | `"1h"` | The duration (example 10m) to wait before retrying to acquire Prometheus client and verify connection, after a failed attempt |
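For reference, a minimal sketch of how the two new `util` entry points compose, mirroring the `cmd/server/server.go` change above. The import path is assumed from the module layout, and `internal/util` is only importable from within this repository; the `/provision/` handler here is a stand-in for the real subscription handler:

```go
package main

import (
	"log"
	"net/http"

	"github.com/sap/cap-operator/internal/util" // assumed module path; internal packages are repo-local
)

// Stand-in for the subscription handler.
func handleProvision(w http.ResponseWriter, _ *http.Request) {
	w.WriteHeader(http.StatusAccepted)
}

func main() {
	// Wrap the handler so requests are counted (cap_op_subscription_requests_total)
	// and tracked while in flight (cap_op_subscription_requests_in_flight).
	http.HandleFunc("/provision/", util.InstrumentHttpHandler(handleProvision, "cap_op_subscription_requests", "subscription-server requests."))

	// Serve /metrics in a background goroutine on METRICS_PORT (default 9090).
	util.InitMetricsServer()

	// Application traffic stays on the default application port, as in server.go.
	log.Fatal(http.ListenAndServe(":4000", nil))
}
```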