Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: delay deleting GameServers in Error state #3428

Merged
merged 1 commit into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pkg/apis/agones/v1/gameserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ const (
PodSafeToEvictAnnotation = "cluster-autoscaler.kubernetes.io/safe-to-evict"
// SafeToEvictLabel is a label that, when "false", matches the restrictive PDB agones-gameserver-safe-to-evict-false.
SafeToEvictLabel = agones.GroupName + "/safe-to-evict"
// GameServerErroredAtAnnotation is an annotation that records the timestamp the GameServer entered the
// error state. The timestamp is encoded in RFC3339 format.
GameServerErroredAtAnnotation = agones.GroupName + "/errored-at"

// True is the string "true" to appease the goconst lint.
True = "true"
Expand Down
4 changes: 4 additions & 0 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,10 @@ func (c *Controller) syncGameServerShutdownState(ctx context.Context, gs *agones
// moveToErrorState moves the GameServer to the error state
func (c *Controller) moveToErrorState(ctx context.Context, gs *agonesv1.GameServer, msg string) (*agonesv1.GameServer, error) {
gsCopy := gs.DeepCopy()
if gsCopy.Annotations == nil {
gsCopy.Annotations = make(map[string]string, 1)
}
gsCopy.Annotations[agonesv1.GameServerErroredAtAnnotation] = time.Now().Format(time.RFC3339)
gsCopy.Status.State = agonesv1.GameServerStateError

gs, err := c.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(ctx, gsCopy, metav1.UpdateOptions{})
Expand Down
10 changes: 10 additions & 0 deletions pkg/gameservers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,11 @@ func TestControllerCreateGameServerPod(t *testing.T) {

assert.True(t, podCreated, "attempt should have been made to create a pod")
assert.True(t, gsUpdated, "GameServer should be updated")
if assert.NotEmpty(t, gs.Annotations[agonesv1.GameServerErroredAtAnnotation]) {
gotTime, err := time.Parse(time.RFC3339, gs.Annotations[agonesv1.GameServerErroredAtAnnotation])
require.NoError(t, err)
assert.WithinDuration(t, time.Now(), gotTime, time.Second)
}
assert.Equal(t, agonesv1.GameServerStateError, gs.Status.State)
})

Expand All @@ -1178,6 +1183,11 @@ func TestControllerCreateGameServerPod(t *testing.T) {

assert.True(t, podCreated, "attempt should have been made to create a pod")
assert.True(t, gsUpdated, "GameServer should be updated")
if assert.NotEmpty(t, gs.Annotations[agonesv1.GameServerErroredAtAnnotation]) {
gotTime, err := time.Parse(time.RFC3339, gs.Annotations[agonesv1.GameServerErroredAtAnnotation])
require.NoError(t, err)
assert.WithinDuration(t, time.Now(), gotTime, time.Second)
}
assert.Equal(t, agonesv1.GameServerStateError, gs.Status.State)
})
}
Expand Down
36 changes: 35 additions & 1 deletion pkg/gameserversets/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ const (

// maxPodPendingCount is the maximum number of pending pods per game server set
maxPodPendingCount = 5000

// gameServerErrorDeletionDelay is the minimum amount of time to delay the deletion
// of a GameServer in Error state.
gameServerErrorDeletionDelay = 30 * time.Second
)

// Extensions struct contains what is needed to bind webhook handlers
Expand Down Expand Up @@ -430,7 +434,19 @@ func computeReconciliationAction(strategy apis.SchedulingStrategy, list []*agone

// GameServerStateShutdown - already handled above
// GameServerStateAllocated - already handled above
case agonesv1.GameServerStateError, agonesv1.GameServerStateUnhealthy:
case agonesv1.GameServerStateError:
if !shouldDeleteErroredGameServer(gs) {
// The GameServer is in an Error state and should not be deleted yet.
// To stop an ever-increasing number of GameServers from being created,
// consider the Error state GameServers as up and pending. This stops high
// churn rate that can negatively impact Kubernetes.
podPendingCount++
handleGameServerUp(gs)
} else {
scheduleDeletion(gs)
}

case agonesv1.GameServerStateUnhealthy:
scheduleDeletion(gs)
default:
// unrecognized state, assume it's up.
Expand Down Expand Up @@ -474,6 +490,24 @@ func computeReconciliationAction(strategy apis.SchedulingStrategy, list []*agone
return numServersToAdd, toDelete, partialReconciliation
}

func shouldDeleteErroredGameServer(gs *agonesv1.GameServer) bool {
erroredAtStr := gs.Annotations[agonesv1.GameServerErroredAtAnnotation]
if erroredAtStr == "" {
return true
}

erroredAt, err := time.Parse(time.RFC3339, erroredAtStr)
if err != nil {
// The annotation is in the wrong format, delete the GameServer.
return true
}

if time.Since(erroredAt) >= gameServerErrorDeletionDelay {
return true
}
return false
}

// addMoreGameServers adds diff more GameServers to the set
func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) error {
loggerForGameServerSet(c.baseLogger, gsSet).WithField("count", count).Debug("Adding more gameservers")
Expand Down
92 changes: 92 additions & 0 deletions pkg/gameserversets/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,98 @@ func TestSyncGameServerSet(t *testing.T) {
assert.True(t, updated, "A game servers should have been updated")
})

t.Run("adding and deleting errored gameservers", func(t *testing.T) {
gsSet := defaultFixture()
list := createGameServers(gsSet, 5)

// make some as unhealthy
list[0].Annotations = map[string]string{agonesv1.GameServerErroredAtAnnotation: time.Now().Add(-30 * time.Second).UTC().Format(time.RFC3339)}
list[0].Status.State = agonesv1.GameServerStateError

updated := false
count := 0

c, m := newFakeController()
m.AgonesClient.AddReactor("list", "gameserversets", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerSetList{Items: []agonesv1.GameServerSet{*gsSet}}, nil
})
m.AgonesClient.AddReactor("list", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerList{Items: list}, nil
})

m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ua := action.(k8stesting.UpdateAction)
gs := ua.GetObject().(*agonesv1.GameServer)
assert.Equal(t, gs.Status.State, agonesv1.GameServerStateShutdown)

updated = true
assert.Equal(t, "test-0", gs.GetName())
return true, nil, nil
})
m.AgonesClient.AddReactor("create", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ca := action.(k8stesting.CreateAction)
gs := ca.GetObject().(*agonesv1.GameServer)

assert.True(t, metav1.IsControlledBy(gs, gsSet))
count++
return true, gs, nil
})

ctx, cancel := agtesting.StartInformers(m, c.gameServerSetSynced, c.gameServerSynced)
defer cancel()

c.syncGameServerSet(ctx, gsSet.ObjectMeta.Namespace+"/"+gsSet.ObjectMeta.Name) // nolint: errcheck

assert.Equal(t, 6, count)
assert.True(t, updated, "A game servers should have been updated")
})

t.Run("adding and delay deleting errored gameservers", func(t *testing.T) {
gsSet := defaultFixture()
list := createGameServers(gsSet, 5)

// make some as unhealthy
list[0].Annotations = map[string]string{agonesv1.GameServerErroredAtAnnotation: time.Now().UTC().Format(time.RFC3339)}
markmandel marked this conversation as resolved.
Show resolved Hide resolved
list[0].Status.State = agonesv1.GameServerStateError

updated := false
count := 0

c, m := newFakeController()
m.AgonesClient.AddReactor("list", "gameserversets", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerSetList{Items: []agonesv1.GameServerSet{*gsSet}}, nil
})
m.AgonesClient.AddReactor("list", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &agonesv1.GameServerList{Items: list}, nil
})

m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ua := action.(k8stesting.UpdateAction)
gs := ua.GetObject().(*agonesv1.GameServer)
assert.Equal(t, gs.Status.State, agonesv1.GameServerStateShutdown)

updated = true
assert.Equal(t, "test-0", gs.GetName())
return true, nil, nil
})
m.AgonesClient.AddReactor("create", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
ca := action.(k8stesting.CreateAction)
gs := ca.GetObject().(*agonesv1.GameServer)

assert.True(t, metav1.IsControlledBy(gs, gsSet))
count++
return true, gs, nil
})

ctx, cancel := agtesting.StartInformers(m, c.gameServerSetSynced, c.gameServerSynced)
defer cancel()

c.syncGameServerSet(ctx, gsSet.ObjectMeta.Namespace+"/"+gsSet.ObjectMeta.Name) // nolint: errcheck

assert.Equal(t, 5, count)
assert.False(t, updated, "A game servers should not have been updated")
})

t.Run("removing gamservers", func(t *testing.T) {
gsSet := defaultFixture()
list := createGameServers(gsSet, 15)
Expand Down