From 2b7dd196a479535d6e49ba1b5a7220abaa5c0b5e Mon Sep 17 00:00:00 2001 From: Mark Mandel Date: Thu, 12 Sep 2024 15:46:31 -0700 Subject: [PATCH] Cleanup Patch Sidecar Logging (#3973) Missed this little of nuance in code review when moving to Patch with the SDK Sidecar - in that a Patch operation doesn't return a Conflict error type/status value if it fails, so every time it does, the Agones sidecar will log it as an error on production installs. So this cleaned up a few things: * renamed "debugError" type to "traceError" type, since we only log it under Trace, not Debug. * if `patchGameServer` fails under the error `IsInvalid`, wrap it in a TraceError so it doesn't show up in prod. * Wrapped the error with `errors` context, such that if the patch does fail, it's far easier now to work out where it failed. Closes #3967 --- pkg/gameservers/controller.go | 12 ++++++------ pkg/gameservers/health.go | 2 +- pkg/gameservers/migration.go | 2 +- pkg/sdkserver/sdkserver.go | 8 +++++++- pkg/util/workerqueue/workerqueue.go | 22 +++++++++++----------- pkg/util/workerqueue/workerqueue_test.go | 10 +++++----- 6 files changed, 31 insertions(+), 25 deletions(-) diff --git a/pkg/gameservers/controller.go b/pkg/gameservers/controller.go index 577020e31d..3fa79d86c2 100644 --- a/pkg/gameservers/controller.go +++ b/pkg/gameservers/controller.go @@ -878,19 +878,19 @@ func (c *Controller) syncGameServerStartingState(ctx context.Context, gs *agones if err != nil { // expected to happen, so don't log it. if k8serrors.IsNotFound(err) { - return nil, workerqueue.NewDebugError(err) + return nil, workerqueue.NewTraceError(err) } // do log if it's something other than NotFound, since that's weird. return nil, err } if pod.Spec.NodeName == "" { - return gs, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name)) + return gs, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name)) } // Ensure the pod IPs are populated if pod.Status.PodIPs == nil || len(pod.Status.PodIPs) == 0 { - return gs, workerqueue.NewDebugError(errors.Errorf("pod IPs not yet populated for Pod %s", pod.ObjectMeta.Name)) + return gs, workerqueue.NewTraceError(errors.Errorf("pod IPs not yet populated for Pod %s", pod.ObjectMeta.Name)) } node, err := c.nodeLister.Get(pod.Spec.NodeName) @@ -943,7 +943,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag if gs.Status.NodeName == "" { addressPopulated = true if pod.Spec.NodeName == "" { - return gs, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name)) + return gs, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name)) } node, err := c.nodeLister.Get(pod.Spec.NodeName) if err != nil { @@ -963,7 +963,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag // check to make sure this container is actually running. If there was a recent crash, the cache may // not yet have the newer, running container. if cs.State.Running == nil { - return nil, workerqueue.NewDebugError(fmt.Errorf("game server container for GameServer %s in namespace %s is not currently running, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace)) + return nil, workerqueue.NewTraceError(fmt.Errorf("game server container for GameServer %s in namespace %s is not currently running, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace)) } gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] = cs.ContainerID } @@ -972,7 +972,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag } // Verify that we found the game server container - we may have a stale cache where pod is missing ContainerStatuses. if _, ok := gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]; !ok { - return nil, workerqueue.NewDebugError(fmt.Errorf("game server container for GameServer %s in namespace %s not present in pod status, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace)) + return nil, workerqueue.NewTraceError(fmt.Errorf("game server container for GameServer %s in namespace %s not present in pod status, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace)) } // Also update the pod with the same annotation, so we can check if the Pod data is up-to-date, now and also in the HealthController. diff --git a/pkg/gameservers/health.go b/pkg/gameservers/health.go index f76afa3f52..416d9c6384 100644 --- a/pkg/gameservers/health.go +++ b/pkg/gameservers/health.go @@ -261,7 +261,7 @@ func (hc *HealthController) skipUnhealthyGameContainer(gs *agonesv1.GameServer, // in which case, send it back to the queue to try again. gsReadyContainerID := gs.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] if pod.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] != gsReadyContainerID { - return false, workerqueue.NewDebugError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name)) + return false, workerqueue.NewTraceError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name)) } if gs.IsBeforeReady() { diff --git a/pkg/gameservers/migration.go b/pkg/gameservers/migration.go index bbd1280712..2b6a8aa203 100644 --- a/pkg/gameservers/migration.go +++ b/pkg/gameservers/migration.go @@ -153,7 +153,7 @@ func (mc *MigrationController) isMigratingGameServerPod(pod *k8sv1.Pod) (*agones } if pod.Spec.NodeName == "" { - return nil, nil, false, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name)) + return nil, nil, false, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name)) } node, err := mc.nodeLister.Get(pod.Spec.NodeName) diff --git a/pkg/sdkserver/sdkserver.go b/pkg/sdkserver/sdkserver.go index 043db885a3..641bc3e90b 100644 --- a/pkg/sdkserver/sdkserver.go +++ b/pkg/sdkserver/sdkserver.go @@ -29,6 +29,7 @@ import ( "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/types" @@ -459,7 +460,12 @@ func (s *SDKServer) patchGameServer(ctx context.Context, gs, gsCopy *agonesv1.Ga return nil, err } - return s.gameServerGetter.GameServers(s.namespace).Patch(ctx, gs.GetObjectMeta().GetName(), types.JSONPatchType, patch, metav1.PatchOptions{}) + gs, err = s.gameServerGetter.GameServers(s.namespace).Patch(ctx, gs.GetObjectMeta().GetName(), types.JSONPatchType, patch, metav1.PatchOptions{}) + // if the test operation fails, no reason to error log + if err != nil && k8serrors.IsInvalid(err) { + err = workerqueue.NewTraceError(err) + } + return gs, errors.Wrapf(err, "error attempting to patch gameserver: %s/%s", gsCopy.ObjectMeta.Namespace, gsCopy.ObjectMeta.Name) } // updateLabels updates the labels on this GameServer to the ones persisted in SDKServer, diff --git a/pkg/util/workerqueue/workerqueue.go b/pkg/util/workerqueue/workerqueue.go index 93f43172e2..c47353abf2 100644 --- a/pkg/util/workerqueue/workerqueue.go +++ b/pkg/util/workerqueue/workerqueue.go @@ -37,29 +37,29 @@ const ( workFx = time.Second ) -// debugError is a marker type for errors that that should only be logged at a Debug level. +// traceError is a marker type for errors that that should only be logged at a Trace level. // Useful if you want a Handler to be retried, but not logged at an Error level. -type debugError struct { +type traceError struct { err error } -// NewDebugError returns a debugError wrapper around an error. -func NewDebugError(err error) error { - return &debugError{err: err} +// NewTraceError returns a traceError wrapper around an error. +func NewTraceError(err error) error { + return &traceError{err: err} } // Error returns the error string -func (l *debugError) Error() string { +func (l *traceError) Error() string { if l.err == nil { return "" } return l.err.Error() } -// isDebugError returns if the error is a debug error or not -func isDebugError(err error) bool { +// isTraceError returns if the error is a trace error or not +func isTraceError(err error) bool { cause := errors.Cause(err) - _, ok := cause.(*debugError) + _, ok := cause.(*traceError) return ok } @@ -181,8 +181,8 @@ func (wq *WorkerQueue) processNextWorkItem(ctx context.Context) bool { if err := wq.SyncHandler(ctx, key); err != nil { // Conflicts are expected, so only show them in debug operations. - // Also check is debugError for other expected errors. - if k8serror.IsConflict(errors.Cause(err)) || isDebugError(err) { + // Also check is traceError for other expected errors. + if k8serror.IsConflict(errors.Cause(err)) || isTraceError(err) { wq.logger.WithField(wq.keyName, obj).Trace(err) } else { runtime.HandleError(wq.logger.WithField(wq.keyName, obj), err) diff --git a/pkg/util/workerqueue/workerqueue_test.go b/pkg/util/workerqueue/workerqueue_test.go index 37ba602679..d68533fda5 100644 --- a/pkg/util/workerqueue/workerqueue_test.go +++ b/pkg/util/workerqueue/workerqueue_test.go @@ -198,13 +198,13 @@ func TestWorkerQueueEnqueueAfter(t *testing.T) { func TestDebugError(t *testing.T) { err := errors.New("not a debug error") - assert.False(t, isDebugError(err)) + assert.False(t, isTraceError(err)) - err = NewDebugError(err) - assert.True(t, isDebugError(err)) + err = NewTraceError(err) + assert.True(t, isTraceError(err)) assert.EqualError(t, err, "not a debug error") - err = NewDebugError(nil) - assert.True(t, isDebugError(err)) + err = NewTraceError(nil) + assert.True(t, isTraceError(err)) assert.EqualError(t, err, "") }