Skip to content

Commit

Permalink
Test in place upgrades run tests (#3991)
Browse files Browse the repository at this point in the history
* Creates game servers every 5 seconds after an Agones version install

* Adds a watch for pods in a CrashLoopBackOff loop

* Updates CountsAndLists to a template value in game server template
The sdk-client-test requires pre-existing Counter and List values when CountsAndLists is enabled, and a game server cannot be applied with a Counter or List value when CountsAndLists is not enabled.

* Adds visual studio .sln files to the gitignore file

* Adding fleet code here so I can come back to it later if necessary.

* Adds failure state to upgradeTest for too many pods in a CrashLoopBackOff loop
and cleans up resources on delete

* Allow more game server creation per version

* Updates game server watch to store crash looping pods in a map

---------

Co-authored-by: Mengye (Max) Gong <[email protected]>
  • Loading branch information
igooch and gongmax authored Sep 23, 2024
1 parent 77face1 commit 6a7d2af
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 43 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
*.iml
bin
*.o
*.sln
tmp
terraform.tfvars
terraform.tfstate*
Expand Down
4 changes: 2 additions & 2 deletions test/sdk/go/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ project_path := $(dir $(mkfile_path))
root_path = $(realpath $(project_path)/)
# Because go mod init in the Dockerfile installs the most recently released version of Agones, this
# will need to be built and pushed post-release. During DEV it will be built at DEV - 1.
base_version = 1.43.0
server_tag := $(REGISTRY)/sdk-client-test:$(base_version)
release_version = 1.43.0
server_tag := $(REGISTRY)/sdk-client-test:$(release_version)

# _____ _
# |_ _|_ _ _ __ __ _ ___| |_ ___
Expand Down
3 changes: 3 additions & 0 deletions test/sdk/go/sdk-client-test.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ func main() {
testLists(sdk)
}

// Delay before shutdown to prevent Game Servers from churning too quickly on a running cluster
time.Sleep(8 * time.Second)

err = sdk.Shutdown()
if err != nil {
log.Fatalf("Could not shutdown GameServer: %s", err)
Expand Down
1 change: 1 addition & 0 deletions test/upgrade/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
FROM gcr.io/cloud-builders/gcloud AS builder

RUN apt-get update && \
apt-get install -y curl && \
apt-get clean

WORKDIR /usr/local
Expand Down
4 changes: 2 additions & 2 deletions test/upgrade/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ REGISTRY ?=
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
project_path := $(dir $(mkfile_path))
root_path = $(realpath $(project_path)/)
version := 0.1
server_tag := $(REGISTRY)/upgrade-test-controller:$(version)
dev_version = 1.44.0-dev
server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version)

# _____ _
# |_ _|_ _ _ __ __ _ ___| |_ ___
Expand Down
18 changes: 18 additions & 0 deletions test/upgrade/gameserverTemplate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,27 @@ data:
generateName: sdk-client-test-
labels:
agonesVersion: {{ .AgonesVersion }}
app: sdk-client-test
spec:
ports:
- name: default
portPolicy: Dynamic
containerPort: 7654
sdkServer:
logLevel: Debug
{{ if .CountsAndLists }}
counters:
rooms:
count: 1
capacity: 10
lists:
players:
capacity: 100
values:
- test0
- test1
- test2
{{ end }}
template:
metadata:
labels:
Expand All @@ -39,10 +55,12 @@ data:
containers:
- name: sdk-client-test
image: "{{ .Registry }}:{{ .AgonesVersion }}"
imagePullPolicy: Always
resources:
requests:
memory: 64Mi
cpu: 20m
limits:
memory: 64Mi
cpu: 20m
serviceAccountName: agones-sa
177 changes: 154 additions & 23 deletions test/upgrade/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@ import (
"os"
"os/exec"
"regexp"
"strconv"
"strings"
"time"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
)

const (
Expand All @@ -50,13 +53,11 @@ const (
// AgonesRegistry is the public registry for Agones releases
AgonesRegistry = "us-docker.pkg.dev/agones-images/release"
// TestRegistry is the public registry for upgrade test container files
// TODO: Create Test Registry in agones-images/ci
TestRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test"
)

var (
// Dev is the current development version of Agones
// TODO: Get the build version of dev (i.e. 1.44.0-dev-b765f49)
Dev = os.Getenv("Dev")
// ReleaseVersion is the latest released version of Agones (DEV - 1).
ReleaseVersion = os.Getenv("ReleaseVersion")
Expand All @@ -70,10 +71,21 @@ var (

// main drives the in-place upgrade test: it builds an in-cluster Kubernetes client, works out the
// Agones versions to test, starts a background watch for crash-looping game server pods, walks
// through the upgrades, and finally cleans up the resources it created.
func main() {
	ctx := context.Background()
	cfg, err := rest.InClusterConfig()
	if err != nil {
		// cfg is not usable when err is non-nil, so report the error itself.
		log.Fatal("Could not create in cluster config: ", err)
	}

	kubeClient, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		log.Fatal("Could not create the kubernetes api clientset: ", err)
	}

	validConfigs := configTestSetup(ctx, kubeClient)
	// Tolerate up to two crash-looping pods per tested configuration before failing the test.
	go watchGameServerPods(kubeClient, make(chan struct{}), make(map[string]podLog), len(validConfigs)*2)
	addAgonesRepo()
	runConfigWalker(ctx, validConfigs)
	cleanUpResources()
}

type versionMappings struct {
Expand All @@ -92,9 +104,16 @@ type configTest struct {
gameServerPath string
}

// gameServerTemplate holds the values substituted into the game server yaml template.
// Registry and AgonesVersion form the container image reference ("Registry:AgonesVersion"),
// AgonesVersion is also written to the agonesVersion label, and CountsAndLists controls whether
// the counters and lists section of the template is rendered.
// CountsAndLists can be removed from the template once CountsAndLists is GA in all tested versions
type gameServerTemplate struct {
AgonesVersion string
Registry string
AgonesVersion string
Registry string
CountsAndLists bool
}

// podLog records the versions associated with a failing game server pod: the Agones SDK version
// and the version of the game server binary.
type podLog struct {
	SdkVersion, GameServerVersion string
}

type helmStatuses []struct {
Expand All @@ -108,11 +127,11 @@ type helmStatuses []struct {
}

// Determine test scenario to run
func configTestSetup(ctx context.Context) []*configTest {
func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*configTest {
versionMap := versionMappings{}

// Find the Kubernetes version of the node that this test is running on.
k8sVersion := findK8sVersion(ctx)
k8sVersion := findK8sVersion(ctx, kubeClient)

// Get the mappings of valid Kubernetes, Agones, and Feature Gate versions from the configmap.
err := json.Unmarshal([]byte(VersionMappings), &versionMap)
Expand All @@ -124,38 +143,48 @@ func configTestSetup(ctx context.Context) []*configTest {
configTests := []*configTest{}
for _, agonesVersion := range versionMap.K8sToAgonesVersions[k8sVersion] {
ct := configTest{}
// TODO: create different valid config based off of available feature gates.
// containsCountsAndLists will need to be updated to return true for when CountsAndLists=true.
countsAndLists := containsCountsAndLists(agonesVersion)
ct.agonesVersion = agonesVersion
if agonesVersion == "DEV" {
if agonesVersion == "Dev" {
ct.agonesVersion = Dev
// Game server container cannot be created at DEV version because go.mod can only access
// published Agones versions. Use N-1 for DEV.
ct.gameServerPath = createGameServerFile(ReleaseVersion)
ct.gameServerPath = createGameServerFile(ReleaseVersion, countsAndLists)
} else {
ct.gameServerPath = createGameServerFile(agonesVersion)
ct.gameServerPath = createGameServerFile(agonesVersion, countsAndLists)
}
// TODO: create different valid config based off of available feature gates
configTests = append(configTests, &ct)
}

return configTests
}

// Finds the Kubernetes version of the Kubelet on the node that the current pod is running on.
// The Kubelet version is the same version as the node.
func findK8sVersion(ctx context.Context) string {
cfg, err := rest.InClusterConfig()
if err != nil {
log.Fatal("Could not create in cluster config", cfg)
// containsCountsAndLists returns true if the agonesVersion is "Dev" or >= 1.41.0, when the
// CountsAndLists feature entered Beta (on by default).
//
// The major and minor components are compared as integers. Parsing "major.minor" as a float
// misorders versions: 1.5 would compare greater than 1.40, and 1.100 would compare less than 1.40.
// The program exits if agonesVersion does not contain a major.minor component.
func containsCountsAndLists(agonesVersion string) bool {
	if agonesVersion == "Dev" {
		return true
	}

	r := regexp.MustCompile(`(\d+)\.(\d+)`)
	parts := r.FindStringSubmatch(agonesVersion)
	if parts == nil {
		log.Fatalf("Could not find a major.minor version in agonesVersion %s", agonesVersion)
	}
	major, err := strconv.Atoi(parts[1])
	if err != nil {
		log.Fatalf("Could not convert major version of agonesVersion %s to int: %s", agonesVersion, err)
	}
	minor, err := strconv.Atoi(parts[2])
	if err != nil {
		log.Fatalf("Could not convert minor version of agonesVersion %s to int: %s", agonesVersion, err)
	}

	// Anything after 1.40 has CountsAndLists enabled by default.
	return major > 1 || (major == 1 && minor > 40)
}

// Finds the Kubernetes version of the Kubelet on the node that the current pod is running on.
// The Kubelet version is the same version as the node.
func findK8sVersion(ctx context.Context, kubeClient *kubernetes.Clientset) string {
// Wait to get pod and node as these may take a while to start on a new Autopilot cluster.
var pod *v1.Pod
err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 7*time.Minute, true, func(ctx context.Context) (done bool, err error) {
err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 7*time.Minute, true, func(ctx context.Context) (done bool, err error) {
pod, err = kubeClient.CoreV1().Pods(PodNamespace).Get(ctx, PodName, metav1.GetOptions{})
if err != nil {
return false, nil
Expand Down Expand Up @@ -251,6 +280,8 @@ func installAgonesRelease(version, registry, featureGates, imagePullPolicy, side
}

func runConfigWalker(ctx context.Context, validConfigs []*configTest) {
cancelCtx, cancel := context.WithCancel(ctx)

for _, config := range validConfigs {
registry := AgonesRegistry
chart := HelmChart
Expand All @@ -277,7 +308,14 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) {
log.Fatalf("PollUntilContextTimeout timed out while attempting upgrade to Agones version %s. Helm Status %s",
config.agonesVersion, helmStatus)
}

go createGameServers(cancelCtx, config.gameServerPath)
// Allow some soak time at the Agones version before next upgrade
time.Sleep(1 * time.Minute)
}
cancel()
// TODO: Replace sleep with a wait for the existing healthy Game Servers to finish naturally by reaching their shutdown phase.
time.Sleep(30 * time.Second)
}

// checkHelmStatus returns the status of the Helm release at a specified agonesVersion if it exists.
Expand All @@ -304,8 +342,8 @@ func checkHelmStatus(agonesVersion string) string {

// Creates a gameserver yaml file from the mounted gameserver.yaml template. The name of the new
// gameserver yaml is based on the Agones version, i.e. gs1440.yaml for Agones version 1.44.0
func createGameServerFile(agonesVersion string) string {
gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion}
func createGameServerFile(agonesVersion string, countsAndLists bool) string {
gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists}

gsTemplate, err := template.ParseFiles("gameserver.yaml")
if err != nil {
Expand Down Expand Up @@ -337,3 +375,96 @@ func createGameServerFile(agonesVersion string) string {

return gsPath
}

// createGameServers creates a game server from the file at gsPath every five seconds until ctx is
// cancelled. The game server container is the same binary version as the game server file. The
// SDK version is always the same as the version of the Agones controller that created it. The
// Game Server shuts itself down after the tests have run as part of the `sdk-client-test` logic.
func createGameServers(ctx context.Context, gsPath string) {
	tick := time.NewTicker(5 * time.Second)
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			// TODO: Do not ignore error if unable to create due to something other than cluster scale up
			if _, err := runExecCommand(KubectlCmd, "create", "-f", gsPath); err != nil {
				log.Printf("Could not create Gameserver %s: %s", gsPath, err)
			}
		case <-ctx.Done():
			return
		}
	}
}

// watchGameServerPods watches all game server pods in the "default" namespace for containers in
// CrashLoopBackOff. It exits the program if the number of distinct crash-looping pods exceeds
// acceptedFailures.
//
// kubeClient is used to build the pod informer, stopCh is passed to the informer as its stop
// channel, and failedPods collects the crash-looping pods keyed by pod name.
func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{}, failedPods map[string]podLog, acceptedFailures int) {
	// Filter by label agones.dev/role=gameserver to only game server pods
	labelOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) {
		opts.LabelSelector = "agones.dev/role=gameserver"
	})
	kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second,
		informers.WithNamespace("default"), labelOptions)
	podInformer := kubeInformerFactory.Core().V1().Pods().Informer()

	_, err := podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		UpdateFunc: func(_, newObj interface{}) {
			newPod, ok := newObj.(*v1.Pod)
			if !ok {
				// Not a pod; nothing to inspect, and an unchecked assertion would panic.
				return
			}
			for _, cs := range newPod.Status.ContainerStatuses {
				// Only the sdk-client-test container waiting in CrashLoopBackOff counts as a failure.
				if cs.Name != "sdk-client-test" || cs.State.Waiting == nil || cs.State.Waiting.Reason != "CrashLoopBackOff" {
					continue
				}
				gsVersion := newPod.Labels["agonesVersion"]
				sdkVersion := newPod.Annotations["agones.dev/sdk-version"]
				log.Printf("%s for pod: %s with game server binary version %s, and SDK version %s", cs.State.Waiting.Reason, newPod.Name, gsVersion, sdkVersion)
				// Keying by pod name means repeated updates for the same pod count once.
				failedPods[newPod.Name] = podLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion}
				if len(failedPods) > acceptedFailures {
					log.Fatalf("Too many Game Server pods in CrashLoopBackOff: %v", failedPods)
				}
			}
		},
	})
	if err != nil {
		log.Fatalf("Not able to create AddEventHandler: %s", err)
	}

	go podInformer.Run(stopCh)
	if !cache.WaitForCacheSync(stopCh, podInformer.HasSynced) {
		log.Fatal("Timed out waiting for caches to sync")
	}
}

// cleanUpResources deletes any remaining sdk-client-test Game Servers, uninstalls the Agones Helm
// release, and deletes the agones-system namespace. Failures are logged and do not stop the
// remaining clean up steps.
func cleanUpResources() {
	if _, err := runExecCommand(KubectlCmd, "delete", "gs", "-l", "app=sdk-client-test"); err != nil {
		log.Println("Could not delete game servers", err)
	}

	if _, err := runExecCommand(HelmCmd, "uninstall", "agones", "-n", "agones-system"); err != nil {
		log.Println("Could not Helm uninstall Agones", err)
	}

	// Apiservice v1.allocation.agones.dev, which is part of Service agones-system/agones-controller-service,
	// does not always get cleaned up on Helm uninstall, and needs to be deleted (if it exists) before
	// the agones-system namespace can be removed.
	// The error is ignored, because an "error" here means Helm already uninstalled the apiservice.
	if out, err := runExecCommand(KubectlCmd, "delete", "apiservice", "v1.allocation.agones.dev"); err == nil {
		fmt.Println(string(out))
	}

	if _, err := runExecCommand(KubectlCmd, "delete", "ns", "agones-system"); err != nil {
		log.Println("Could not delete agones-system namespace", err)
	}
}
Loading

0 comments on commit 6a7d2af

Please sign in to comment.