From e7e9b6023bead8ac9747523648fd94f5d064f5ee Mon Sep 17 00:00:00 2001 From: Rui Vieira Date: Thu, 3 Aug 2023 14:25:12 +0100 Subject: [PATCH] Add central ServiceMonitor to operator's namespace (#81) * Create both local and central ServiceMonitor * Change match labels and namespaces watched by ServiceMonitors * Fix tests * Get central ServiceMonitor and create/update as needed Remove the boolean flag --- controllers/monitor.go | 156 ++++++++++++++++++++++ controllers/suite_test.go | 4 +- controllers/trustyaiservice_controller.go | 83 ++---------- 3 files changed, 168 insertions(+), 75 deletions(-) create mode 100644 controllers/monitor.go diff --git a/controllers/monitor.go b/controllers/monitor.go new file mode 100644 index 00000000..cb25bbd1 --- /dev/null +++ b/controllers/monitor.go @@ -0,0 +1,156 @@ +package controllers + +import ( + "context" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// generateServiceMonitorSpecCentral generates the spec for the central ServiceMonitor, which watches all namespaces +func generateServiceMonitorSpecCentral(deploymentNamespace string) *monitoringv1.ServiceMonitor { + serviceMonitor := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceMonitorName, + Namespace: deploymentNamespace, + Labels: map[string]string{ + "modelmesh-service": "modelmesh-serving", + }, + }, + Spec: monitoringv1.ServiceMonitorSpec{ + NamespaceSelector: monitoringv1.NamespaceSelector{ + Any: true, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Interval: "4s", + Path: "/q/metrics", + HonorLabels: true, + Scheme: "http", + Params: map[string][]string{ + "match[]": { + `{__name__= 
"trustyai_spd"}`, + `{__name__= "trustyai_dir"}`, + }, + }, + MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ + { + Action: "keep", + Regex: "trustyai_.*", + SourceLabels: []monitoringv1.LabelName{"__name__"}, + }, + }, + }, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/part-of": serviceType, + }, + }, + }, + } + return serviceMonitor +} + +// ensureCentralServiceMonitor creates the central ServiceMonitor in r.Namespace if it does not already exist; an existing one is left unchanged +func (r *TrustyAIServiceReconciler) ensureCentralServiceMonitor(ctx context.Context) error { + serviceMonitor := generateServiceMonitorSpecCentral(r.Namespace) + + // Check if this ServiceMonitor already exists + found := &monitoringv1.ServiceMonitor{} + err := r.Get(ctx, types.NamespacedName{Name: serviceMonitor.Name, Namespace: serviceMonitor.Namespace}, found) + if err != nil { + if errors.IsNotFound(err) { + log.FromContext(ctx).Info("Creating a new central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + err = r.Create(ctx, serviceMonitor) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to create central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + return err + } + } else { + log.FromContext(ctx).Error(err, "Failed to get central ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + return err + } + } + + return nil +} + +// generateServiceMonitorSpecLocal generates the spec for a local ServiceMonitor, which only watches deploymentNamespace +func generateServiceMonitorSpecLocal(deploymentNamespace string, serviceName string) *monitoringv1.ServiceMonitor { + serviceMonitor := &monitoringv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: deploymentNamespace, + Labels: map[string]string{ + "modelmesh-service": "modelmesh-serving", + }, + }, + Spec: 
monitoringv1.ServiceMonitorSpec{ + NamespaceSelector: monitoringv1.NamespaceSelector{ + MatchNames: []string{deploymentNamespace}, + }, + Endpoints: []monitoringv1.Endpoint{ + { + Interval: "4s", + Path: "/q/metrics", + HonorLabels: true, + Scheme: "http", + Params: map[string][]string{ + "match[]": { + `{__name__= "trustyai_spd"}`, + `{__name__= "trustyai_dir"}`, + }, + }, + MetricRelabelConfigs: []*monitoringv1.RelabelConfig{ + { + Action: "keep", + Regex: "trustyai_.*", + SourceLabels: []monitoringv1.LabelName{"__name__"}, + }, + }, + }, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/part-of": serviceType, + }, + }, + }, + } + return serviceMonitor +} + +// ensureLocalServiceMonitor creates the local ServiceMonitor, owned by cr, if it does not already exist; an existing one is left unchanged +func (r *TrustyAIServiceReconciler) ensureLocalServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { + serviceMonitor := generateServiceMonitorSpecLocal(cr.Namespace, cr.Name) + + // Set TrustyAIService instance as the owner and controller + err := ctrl.SetControllerReference(cr, serviceMonitor, r.Scheme) + if err != nil { + return err + } + + // Check if the ServiceMonitor already exists + found := &monitoringv1.ServiceMonitor{} + err = r.Get(ctx, types.NamespacedName{Name: serviceMonitor.Name, Namespace: serviceMonitor.Namespace}, found) + if err != nil { + if errors.IsNotFound(err) { + log.FromContext(ctx).Info("Creating a new local ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + err = r.Create(ctx, serviceMonitor) + if err != nil { + log.FromContext(ctx).Error(err, "Failed to create local ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) + return err + } + } else { + log.FromContext(ctx).Error(err, "Failed to get local ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", 
serviceMonitor.Name) + return err + } + } + + return nil +} diff --git a/controllers/suite_test.go b/controllers/suite_test.go index e8c997c8..89e09908 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -243,7 +243,7 @@ var _ = Describe("TrustyAI operator", func() { Expect(deployment.Labels["app"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(name)) - Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(name)) + Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(serviceType)) Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal("0.1.0")) Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal("quay.io/trustyai/trustyai-service:latest")) @@ -298,7 +298,7 @@ var _ = Describe("TrustyAI operator", func() { Expect(deployment.Labels["app"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/name"]).Should(Equal(name)) Expect(deployment.Labels["app.kubernetes.io/instance"]).Should(Equal(name)) - Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(name)) + Expect(deployment.Labels["app.kubernetes.io/part-of"]).Should(Equal(serviceType)) Expect(deployment.Labels["app.kubernetes.io/version"]).Should(Equal("0.1.0")) Expect(deployment.Spec.Template.Spec.Containers[0].Image).Should(Equal("quay.io/trustyai/trustyai-service:latest")) diff --git a/controllers/trustyaiservice_controller.go b/controllers/trustyaiservice_controller.go index 06b2a4ca..3a871259 100644 --- a/controllers/trustyaiservice_controller.go +++ b/controllers/trustyaiservice_controller.go @@ -21,7 +21,6 @@ import ( goerrors "errors" "fmt" kserveapi "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" - monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" trustyaiopendatahubiov1alpha1 
"github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -49,6 +48,7 @@ const ( modelMeshLabelKey = "modelmesh-service" modelMeshLabelValue = "modelmesh-serving" volumeMountName = "volume" + serviceType = "trustyai-service" ) // TrustyAIServiceReconciler reconciles a TrustyAIService object @@ -80,7 +80,7 @@ func getCommonLabels(serviceName string) map[string]string { "app": serviceName, "app.kubernetes.io/name": serviceName, "app.kubernetes.io/instance": serviceName, - "app.kubernetes.io/part-of": serviceName, + "app.kubernetes.io/part-of": serviceType, "app.kubernetes.io/version": "0.1.0", } } @@ -218,8 +218,14 @@ func (r *TrustyAIServiceReconciler) Reconcile(ctx context.Context, req ctrl.Requ } } - // Service Monitor - err = r.reconcileServiceMonitor(instance, ctx) + // Local Service Monitor + err = r.ensureLocalServiceMonitor(instance, ctx) + if err != nil { + return ctrl.Result{}, err + } + + // Central Service Monitor + err = r.ensureCentralServiceMonitor(ctx) if err != nil { return RequeueWithError(err) } @@ -276,75 +282,6 @@ func (r *TrustyAIServiceReconciler) reconcileService(cr *trustyaiopendatahubiov1 return service, nil } -func (r *TrustyAIServiceReconciler) reconcileServiceMonitor(cr *trustyaiopendatahubiov1alpha1.TrustyAIService, ctx context.Context) error { - - serviceMonitor := &monitoringv1.ServiceMonitor{ - ObjectMeta: metav1.ObjectMeta{ - Name: serviceMonitorName, - Namespace: cr.Namespace, - Labels: map[string]string{ - "modelmesh-service": "modelmesh-serving", - }, - }, - Spec: monitoringv1.ServiceMonitorSpec{ - NamespaceSelector: monitoringv1.NamespaceSelector{ - MatchNames: []string{cr.Namespace}, - }, - Endpoints: []monitoringv1.Endpoint{ - { - Interval: "4s", - Path: "/q/metrics", - HonorLabels: true, - Scheme: "http", - Params: map[string][]string{ - "match[]": { - `{__name__= "trustyai_spd"}`, - `{__name__= "trustyai_dir"}`, - }, - }, - MetricRelabelConfigs: 
[]*monitoringv1.RelabelConfig{ - { - Action: "keep", - Regex: "trustyai_.*", - SourceLabels: []monitoringv1.LabelName{"__name__"}, - }, - }, - }, - }, - Selector: metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app.kubernetes.io/name": cr.Name, - }, - }, - }, - } - - // Set TrustyAIService instance as the owner and controller - err := ctrl.SetControllerReference(cr, serviceMonitor, r.Scheme) - if err != nil { - return err - } - - // Check if this ServiceMonitor already exists - found := &monitoringv1.ServiceMonitor{} - err = r.Get(ctx, types.NamespacedName{Name: serviceMonitor.Name, Namespace: serviceMonitor.Namespace}, found) - if err != nil { - if errors.IsNotFound(err) { - log.FromContext(ctx).Info("Creating a new ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) - err = r.Create(ctx, serviceMonitor) - if err != nil { - log.FromContext(ctx).Error(err, "Not found ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) - return err - } - } else { - log.FromContext(ctx).Error(err, "Couldn't create new ServiceMonitor", "ServiceMonitor.Namespace", serviceMonitor.Namespace, "ServiceMonitor.Name", serviceMonitor.Name) - return err - } - } - - return nil -} - // SetupWithManager sets up the controller with the Manager. func (r *TrustyAIServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { // Watch ServingRuntime objects (not managed by this controller)