Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ACM-10812]: retry status update on conflict #1427

Merged
merged 4 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,12 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R
log.Error(err, "OCP prometheus service does not exist")
// ACM 8509: Special case for hub/local cluster metrics collection
// We do not report status for hub endpoint operator
util.ReportStatus(ctx, r.Client, obsAddon, "NotSupported", !isHubMetricsCollector)
if !isHubMetricsCollector {
jacobbaungard marked this conversation as resolved.
Show resolved Hide resolved
if err := util.ReportStatus(ctx, r.Client, util.NotSupportedStatus, obsAddon.Name, obsAddon.Namespace); err != nil {
log.Error(err, "Failed to report status")
}
}

return ctrl.Result{}, nil
}
return ctrl.Result{}, fmt.Errorf("failed to check prometheus resource: %w", err)
Expand Down Expand Up @@ -297,19 +302,27 @@ func (r *ObservabilityAddonReconciler) Reconcile(ctx context.Context, req ctrl.R
1,
forceRestart)
if err != nil {
util.ReportStatus(ctx, r.Client, obsAddon, "Degraded", !isHubMetricsCollector)
if !isHubMetricsCollector {
if err := util.ReportStatus(ctx, r.Client, util.DegradedStatus, obsAddon.Name, obsAddon.Namespace); err != nil {
log.Error(err, "Failed to report status")
}
}
return ctrl.Result{}, fmt.Errorf("failed to update metrics collectors: %w", err)
}
if created {
util.ReportStatus(ctx, r.Client, obsAddon, "Deployed", !isHubMetricsCollector)
if created && !isHubMetricsCollector {
if err := util.ReportStatus(ctx, r.Client, util.DeployedStatus, obsAddon.Name, obsAddon.Namespace); err != nil {
log.Error(err, "Failed to report status")
}
}
} else {
deleted, err := updateMetricsCollectors(ctx, r.Client, obsAddon.Spec, *hubInfo, clusterID, clusterType, 0, false)
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to update metrics collectors: %w", err)
}
if deleted {
util.ReportStatus(ctx, r.Client, obsAddon, "Disabled", !isHubMetricsCollector)
if deleted && !isHubMetricsCollector {
if err := util.ReportStatus(ctx, r.Client, util.DisabledStatus, obsAddon.Name, obsAddon.Namespace); err != nil {
log.Error(err, "Failed to report status")
}
}
}

Expand Down
116 changes: 84 additions & 32 deletions operators/endpointmetrics/pkg/util/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,49 +6,101 @@ package util

import (
"context"
"sort"
"time"

oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/retry"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// StatusConditionName identifies one of the predefined status conditions.
// It is the key of the conditions map and the value callers pass to ReportStatus.
type StatusConditionName string

// Names of the predefined status conditions.
const (
	DeployedStatus     StatusConditionName = "Deployed"
	DisabledStatus     StatusConditionName = "Disabled"
	DegradedStatus     StatusConditionName = "Degraded"
	NotSupportedStatus StatusConditionName = "NotSupported"
)

// MaxStatusConditionsCount is the maximum number of conditions kept in the
// addon status; ReportStatus trims the list to its last MaxStatusConditionsCount
// entries after appending.
const MaxStatusConditionsCount = 10

var (
conditions = map[string]map[string]string{
"Deployed": {
"type": "Progressing",
"reason": "Deployed",
"message": "Metrics collector deployed"},
"Disabled": {
"type": "Disabled",
"reason": "Disabled",
"message": "enableMetrics is set to False"},
"Degraded": {
"type": "Degraded",
"reason": "Degraded",
"message": "Metrics collector deployment not successful"},
"NotSupported": {
"type": "NotSupported",
"reason": "NotSupported",
"message": "No Prometheus service found in this cluster"},
conditions = map[StatusConditionName]*oav1beta1.StatusCondition{
DeployedStatus: {
Type: "Progressing",
Reason: "Deployed",
Message: "Metrics collector deployed",
Status: metav1.ConditionTrue,
},
DisabledStatus: {
Type: "Disabled",
Reason: "Disabled",
Message: "enableMetrics is set to False",
Status: metav1.ConditionTrue,
},
DegradedStatus: {
Type: "Degraded",
Reason: "Degraded",
Message: "Metrics collector deployment not successful",
Status: metav1.ConditionTrue,
},
NotSupportedStatus: {
Type: "NotSupported",
Reason: "NotSupported",
Message: "No Prometheus service found in this cluster",
Status: metav1.ConditionTrue,
},
}
)

func ReportStatus(ctx context.Context, client client.Client, i *oav1beta1.ObservabilityAddon, t string, reportStatus bool) {
if !reportStatus {
return
}
i.Status.Conditions = []oav1beta1.StatusCondition{
{
Type: conditions[t]["type"],
Status: metav1.ConditionTrue,
LastTransitionTime: metav1.NewTime(time.Now()),
Reason: conditions[t]["reason"],
Message: conditions[t]["message"],
},
func ReportStatus(ctx context.Context, client client.Client, condition StatusConditionName, addonName, addonNs string) error {
newCondition := conditions[condition].DeepCopy()
newCondition.LastTransitionTime = metav1.NewTime(time.Now())

// Fetch the ObservabilityAddon instance in local cluster, and update the status
// Retry on conflict
obsAddon := &oav1beta1.ObservabilityAddon{}
retryErr := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
if err := client.Get(ctx, types.NamespacedName{Name: addonName, Namespace: addonNs}, obsAddon); err != nil {
return err
}

if !shouldAppendCondition(obsAddon.Status.Conditions, newCondition) {
return nil
}

obsAddon.Status.Conditions = append(obsAddon.Status.Conditions, *newCondition)

if len(obsAddon.Status.Conditions) > MaxStatusConditionsCount {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this just for convenience, or why do we want only 10?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's an arbitrary number to limit the list size... Do you think we need more than 10? Previously we were overwriting the whole list with the last element, so 10 is already better than before.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mostly just wondered if there was a specific reason for such a limit. Couldn't we just keep adding? Not sure if there is some best practice on this area.

Anyway, this should be fine for now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to avoid an ever-increasing list... and potential associated bugs... Maybe it's over-engineered.

obsAddon.Status.Conditions = obsAddon.Status.Conditions[len(obsAddon.Status.Conditions)-MaxStatusConditionsCount:]
}

return client.Status().Update(ctx, obsAddon)
})
if retryErr != nil {
return retryErr
}
err := client.Status().Update(ctx, i)
if err != nil {
log.Error(err, "Failed to update status for observabilityaddon")

return nil
}

// shouldAppendCondition reports whether newCondition differs from the most recent condition and should
// therefore be appended. It sorts the conditions slice in place by LastTransitionTime to find the most recent one.
func shouldAppendCondition(conditions []oav1beta1.StatusCondition, newCondition *oav1beta1.StatusCondition) bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it avoids changing the condition if the condition is the same?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea is to avoid adding duplicated status. If it is available, I don't need to report available again.
But I assume here that statuses are sorted by date, which is not necessarily the case IIRC. Probably have to change this! Having a look.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated.

if len(conditions) == 0 {
return true
}

sort.Slice(conditions, func(i, j int) bool {
return conditions[i].LastTransitionTime.Before(&conditions[j].LastTransitionTime)
})

lastCondition := conditions[len(conditions)-1]

return lastCondition.Type != newCondition.Type ||
lastCondition.Status != newCondition.Status ||
lastCondition.Reason != newCondition.Reason ||
lastCondition.Message != newCondition.Message
}
155 changes: 129 additions & 26 deletions operators/endpointmetrics/pkg/util/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,23 @@
// Copyright Contributors to the Open Cluster Management project
// Licensed under the Apache License 2.0

package util
package util_test

import (
"context"
"fmt"
"testing"
"time"

"github.com/stolostron/multicluster-observability-operator/operators/endpointmetrics/pkg/util"
oav1beta1 "github.com/stolostron/multicluster-observability-operator/operators/multiclusterobservability/api/v1beta1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

Expand All @@ -38,35 +44,132 @@ func TestReportStatus(t *testing.T) {
t.Fatalf("Unable to add oav1beta1 scheme: (%v)", err)
}

expectedStatus := []oav1beta1.StatusCondition{
{
Type: "NotSupported",
Status: metav1.ConditionTrue,
Reason: "NotSupported",
Message: "No Prometheus service found in this cluster",
},
{
Type: "Progressing",
Status: metav1.ConditionTrue,
Reason: "Deployed",
Message: "Metrics collector deployed",
},
{
Type: "Disabled",
Status: metav1.ConditionTrue,
Reason: "Disabled",
Message: "enableMetrics is set to False",
},
}

statusList := []string{"NotSupported", "Deployed", "Disabled"}
// New status should be appended
statusList := []util.StatusConditionName{util.NotSupportedStatus, util.DeployedStatus, util.DisabledStatus}
s.AddKnownTypes(oav1beta1.GroupVersion, oa)
c := fake.NewClientBuilder().WithRuntimeObjects(objs...).Build()
for i := range statusList {
ReportStatus(context.TODO(), c, oa, statusList[i], true)
if oa.Status.Conditions[0].Message != expectedStatus[i].Message || oa.Status.Conditions[0].Reason != expectedStatus[i].Reason || oa.Status.Conditions[0].Status != expectedStatus[i].Status || oa.Status.Conditions[0].Type != expectedStatus[i].Type {
t.Errorf("Error: Status not updated. Expected: %s, Actual: %s", expectedStatus[i], fmt.Sprintf("%+v\n", oa.Status.Conditions[0]))
if err := util.ReportStatus(context.Background(), c, statusList[i], oa.Name, oa.Namespace); err != nil {
t.Fatalf("Error reporting status: %v", err)
}
runtimeAddon := &oav1beta1.ObservabilityAddon{}
if err := c.Get(context.Background(), types.NamespacedName{Name: name, Namespace: testNamespace}, runtimeAddon); err != nil {
t.Fatalf("Error getting observabilityaddon: (%v)", err)
}

if len(runtimeAddon.Status.Conditions) != i+1 {
t.Errorf("Status not updated. Expected: %s, Actual: %s", statusList[i], fmt.Sprintf("%+v\n", runtimeAddon.Status.Conditions))
}

if runtimeAddon.Status.Conditions[i].Reason != string(statusList[i]) {
t.Errorf("Status not updated. Expected: %s, Actual: %s", statusList[i], runtimeAddon.Status.Conditions[i].Type)
}

time.Sleep(1500 * time.Millisecond) // Sleep to ensure LastTransitionTime is different for each condition (1s resolution)
}

// Change ordering of conditions: Get the list, change the order and update
runtimeAddon := &oav1beta1.ObservabilityAddon{}
if err := c.Get(context.Background(), types.NamespacedName{Name: name, Namespace: testNamespace}, runtimeAddon); err != nil {
t.Fatalf("Error getting observabilityaddon: %v", err)
}
conditions := runtimeAddon.Status.Conditions
conditions[0], conditions[len(conditions)-1] = conditions[len(conditions)-1], conditions[0]
runtimeAddon.Status.Conditions = conditions
if err := c.Status().Update(context.Background(), runtimeAddon); err != nil {
t.Fatalf("Error updating observabilityaddon: (%v)", err)
}

// Same status as the current one should not be appended
if err := util.ReportStatus(context.Background(), c, util.DisabledStatus, oa.Name, oa.Namespace); err != nil {
t.Fatalf("Error reporting status: %v", err)
}
runtimeAddon = &oav1beta1.ObservabilityAddon{}
if err := c.Get(context.Background(), types.NamespacedName{Name: name, Namespace: testNamespace}, runtimeAddon); err != nil {
t.Fatalf("Error getting observabilityaddon: %v", err)
}

if len(runtimeAddon.Status.Conditions) != len(statusList) {
t.Errorf("Status should not be appended. Expected: %d, Actual: %d", len(statusList), len(runtimeAddon.Status.Conditions))
}

// Number of conditions should not exceed MaxStatusConditionsCount
statusList = []util.StatusConditionName{util.DeployedStatus, util.DisabledStatus, util.DegradedStatus}
for i := 0; i < util.MaxStatusConditionsCount+3; i++ {
status := statusList[i%len(statusList)]
if err := util.ReportStatus(context.Background(), c, status, oa.Name, oa.Namespace); err != nil {
t.Fatalf("Error reporting status: %v", err)
}
}

runtimeAddon = &oav1beta1.ObservabilityAddon{}
if err := c.Get(context.Background(), types.NamespacedName{Name: name, Namespace: testNamespace}, runtimeAddon); err != nil {
t.Fatalf("Error getting observabilityaddon: (%v)", err)
}

if len(runtimeAddon.Status.Conditions) != util.MaxStatusConditionsCount {
t.Errorf("Number of conditions should not exceed MaxStatusConditionsCount. Expected: %d, Actual: %d", util.MaxStatusConditionsCount, len(runtimeAddon.Status.Conditions))
}
}

// TestReportStatus_Conflict verifies that ReportStatus retries the status
// update when it fails with a conflict, and that the conflict error is
// returned to the caller once every attempt has failed.
func TestReportStatus_Conflict(t *testing.T) {
	oa := newObservabilityAddon(name, testNamespace)
	s := scheme.Scheme
	// Fail fast on a broken scheme instead of hitting confusing client errors later.
	if err := oav1beta1.AddToScheme(s); err != nil {
		t.Fatalf("Unable to add oav1beta1 scheme: (%v)", err)
	}
	fakeClient := fake.NewClientBuilder().WithScheme(s).WithRuntimeObjects(oa).Build()
	conflictErr := errors.NewConflict(schema.GroupResource{Group: oav1beta1.GroupVersion.Group, Resource: "resource"}, name, fmt.Errorf("conflict"))

	// Every status update made through c fails with conflictErr, so the
	// retry loop can never succeed.
	c := newClientWithUpdateError(fakeClient, conflictErr)
	err := util.ReportStatus(context.Background(), c, util.DeployedStatus, name, testNamespace)
	if err == nil {
		t.Fatalf("Conflict error should be retried and return an error if it fails")
	}
	// The error handed back should still be the conflict, not an unrelated failure.
	if !errors.IsConflict(err) {
		t.Errorf("Expected a conflict error, got: %v", err)
	}
	if c.UpdateCallsCount() <= 1 {
		t.Errorf("Conflict error should be retried, called %d times", c.UpdateCallsCount())
	}
}

// TestClient wraps a client.Client to customize operations for testing.
// Its Status method shadows the embedded client's one and returns statusWriter,
// which newClientWithUpdateError wires to count status updates and to fail
// them with UpdateError.
type TestClient struct {
client.Client
// UpdateError is the error injected into status Update calls; nil lets them reach the wrapped client.
UpdateError error
// updateCallsCount is the number of status Update calls seen so far (read via UpdateCallsCount).
updateCallsCount int
// statusWriter is the writer handed out by Status.
statusWriter *TestStatusWriter
}

// newClientWithUpdateError builds a TestClient around c whose status writer
// returns updateError from Update (when non-nil) and counts every Update call.
// The writer holds pointers into the returned TestClient, so later changes to
// UpdateError or a Reset of the counter are seen by it.
func newClientWithUpdateError(c client.Client, updateError error) *TestClient {
	wrapped := &TestClient{Client: c, UpdateError: updateError}
	wrapped.statusWriter = &TestStatusWriter{
		SubResourceWriter: c.Status(),
		updateError:       &wrapped.UpdateError,
		callsCount:        &wrapped.updateCallsCount,
	}

	return wrapped
}

// Status returns the test status writer stored on the client, shadowing the
// Status method of the embedded client.Client.
func (c *TestClient) Status() client.StatusWriter {
return c.statusWriter
}

// UpdateCallsCount returns the current value of the status update call counter.
func (c *TestClient) UpdateCallsCount() int {
return c.updateCallsCount
}

// Reset sets the status update call counter back to zero.
// UpdateError is left unchanged.
func (c *TestClient) Reset() {
c.updateCallsCount = 0
}

// TestStatusWriter wraps a client.SubResourceWriter and overrides Update so
// tests can count calls and inject an error. Other methods fall through to
// the embedded writer.
type TestStatusWriter struct {
client.SubResourceWriter
// updateError points at the error to return from Update; a nil error behind the pointer means "delegate".
updateError *error
// callsCount points at the counter incremented on every Update call.
callsCount *int
}

// Update records the call, then either returns the injected error or, when
// none is set, forwards the update to the embedded SubResourceWriter.
// The call is counted before the error check, so failed attempts count too.
func (w *TestStatusWriter) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error {
	*w.callsCount++

	if injected := *w.updateError; injected != nil {
		return injected
	}

	return w.SubResourceWriter.Update(ctx, obj, opts...)
}
Loading