From 6a7d2af14d3fb7873c029bb641acd3de28ed8417 Mon Sep 17 00:00:00 2001 From: igooch Date: Sun, 22 Sep 2024 21:06:11 -0700 Subject: [PATCH] Test in place upgrades run tests (#3991) * Creates game servers every 5 seconds after an Agones version install * Adds a watch for pods in a crashbackoff loop * Updates CountsAndLists to a template value in game server template The sdk-client-test requires pre-existing Counter and List values when CountsAndLists is enabled, and a game server cannot be applied with a Counter or List value when CountsAndLists is not enabled. * Adds visual studio .sln files to the gitignore file * Adding fleet code here so I can come back to it later if necessary. * Adds failure state to upgradeTest for too many pods in CrashBackOff loop and cleans up resources on delete * Allow more game server creation per version * Updates game server watch to store crash looping pods in a map --------- Co-authored-by: Mengye (Max) Gong <8364575+gongmax@users.noreply.github.com> --- .gitignore | 1 + test/sdk/go/Makefile | 4 +- test/sdk/go/sdk-client-test.go | 3 + test/upgrade/Dockerfile | 1 + test/upgrade/Makefile | 4 +- test/upgrade/gameserverTemplate.yaml | 18 +++ test/upgrade/main.go | 177 +++++++++++++++++++++++---- test/upgrade/permissions.yaml | 26 ++-- test/upgrade/upgradeTest.yaml | 4 +- test/upgrade/versionMap.yaml | 8 +- 10 files changed, 203 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index b71f162fd7..bb27de2141 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ *.iml bin *.o +*.sln tmp terraform.tfvars terraform.tfstate* diff --git a/test/sdk/go/Makefile b/test/sdk/go/Makefile index 162b2e29c4..3c3e60551b 100644 --- a/test/sdk/go/Makefile +++ b/test/sdk/go/Makefile @@ -29,8 +29,8 @@ project_path := $(dir $(mkfile_path)) root_path = $(realpath $(project_path)/) # Because go mod init in the Dockerfile installs the most recently released version of Agones, this # will need to be built and pushed post-release. 
During DEV it will be built at DEV - 1. -base_version = 1.43.0 -server_tag := $(REGISTRY)/sdk-client-test:$(base_version) +release_version = 1.43.0 +server_tag := $(REGISTRY)/sdk-client-test:$(release_version) # _____ _ # |_ _|_ _ _ __ __ _ ___| |_ ___ diff --git a/test/sdk/go/sdk-client-test.go b/test/sdk/go/sdk-client-test.go index a5c022e9f8..699c382f7f 100644 --- a/test/sdk/go/sdk-client-test.go +++ b/test/sdk/go/sdk-client-test.go @@ -105,6 +105,9 @@ func main() { testLists(sdk) } + // Delay before shutdown to prevent Game Servers from churning too quickly on a running cluster + time.Sleep(8 * time.Second) + err = sdk.Shutdown() if err != nil { log.Fatalf("Could not shutdown GameServer: %s", err) diff --git a/test/upgrade/Dockerfile b/test/upgrade/Dockerfile index 5383df1b4a..52aa6a5e3f 100644 --- a/test/upgrade/Dockerfile +++ b/test/upgrade/Dockerfile @@ -15,6 +15,7 @@ FROM gcr.io/cloud-builders/gcloud AS builder RUN apt-get update && \ + apt-get install -y curl && \ apt-get clean WORKDIR /usr/local diff --git a/test/upgrade/Makefile b/test/upgrade/Makefile index e181b6b04c..4b6bfe5a0a 100644 --- a/test/upgrade/Makefile +++ b/test/upgrade/Makefile @@ -27,8 +27,8 @@ REGISTRY ?= mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) project_path := $(dir $(mkfile_path)) root_path = $(realpath $(project_path)/) -version := 0.1 -server_tag := $(REGISTRY)/upgrade-test-controller:$(version) +dev_version = 1.44.0-dev +server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version) # _____ _ # |_ _|_ _ _ __ __ _ ___| |_ ___ diff --git a/test/upgrade/gameserverTemplate.yaml b/test/upgrade/gameserverTemplate.yaml index ae9bbc7043..407ed218aa 100644 --- a/test/upgrade/gameserverTemplate.yaml +++ b/test/upgrade/gameserverTemplate.yaml @@ -26,11 +26,27 @@ data: generateName: sdk-client-test- labels: agonesVersion: {{ .AgonesVersion }} + app: sdk-client-test spec: ports: - name: default portPolicy: Dynamic containerPort: 7654 + sdkServer: + logLevel: Debug + {{ if 
.CountsAndLists }} + counters: + rooms: + count: 1 + capacity: 10 + lists: + players: + capacity: 100 + values: + - test0 + - test1 + - test2 + {{ end }} template: metadata: labels: @@ -39,6 +55,7 @@ data: containers: - name: sdk-client-test image: "{{ .Registry }}:{{ .AgonesVersion }}" + imagePullPolicy: Always resources: requests: memory: 64Mi @@ -46,3 +63,4 @@ data: limits: memory: 64Mi cpu: 20m + serviceAccountName: agones-sa diff --git a/test/upgrade/main.go b/test/upgrade/main.go index da922ec7a1..6ce62924ae 100644 --- a/test/upgrade/main.go +++ b/test/upgrade/main.go @@ -24,14 +24,17 @@ import ( "os" "os/exec" "regexp" + "strconv" "strings" "time" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" ) const ( @@ -50,13 +53,11 @@ const ( // AgonesRegistry is the public registry for Agones releases AgonesRegistry = "us-docker.pkg.dev/agones-images/release" // TestRegistry is the public registry for upgrade test container files - // TODO: Create Test Registry in agones-images/ci TestRegistry = "us-docker.pkg.dev/agones-images/ci/sdk-client-test" ) var ( // Dev is the current development version of Agones - // TODO: Get the build version of dev (i.e. 1.44.0-dev-b765f49) Dev = os.Getenv("Dev") // ReleaseVersion is the latest released version of Agones (DEV - 1). 
ReleaseVersion = os.Getenv("ReleaseVersion") @@ -70,10 +71,21 @@ var ( func main() { ctx := context.Background() + cfg, err := rest.InClusterConfig() + if err != nil { + log.Fatal("Could not create in cluster config", cfg) + } - validConfigs := configTestSetup(ctx) + kubeClient, err := kubernetes.NewForConfig(cfg) + if err != nil { + log.Fatal("Could not create the kubernetes api clientset", err) + } + + validConfigs := configTestSetup(ctx, kubeClient) + go watchGameServerPods(kubeClient, make(chan struct{}), make(map[string]podLog), len(validConfigs)*2) addAgonesRepo() runConfigWalker(ctx, validConfigs) + cleanUpResources() } type versionMappings struct { @@ -92,9 +104,16 @@ type configTest struct { gameServerPath string } +// CountsAndLists can be removed from the template once CountsAndLists is GA in all tested versions type gameServerTemplate struct { - AgonesVersion string - Registry string + AgonesVersion string + Registry string + CountsAndLists bool +} + +type podLog struct { + SdkVersion string + GameServerVersion string } type helmStatuses []struct { @@ -108,11 +127,11 @@ type helmStatuses []struct { } // Determine test scenario to run -func configTestSetup(ctx context.Context) []*configTest { +func configTestSetup(ctx context.Context, kubeClient *kubernetes.Clientset) []*configTest { versionMap := versionMappings{} // Find the Kubernetes version of the node that this test is running on. - k8sVersion := findK8sVersion(ctx) + k8sVersion := findK8sVersion(ctx, kubeClient) // Get the mappings of valid Kubernetes, Agones, and Feature Gate versions from the configmap. err := json.Unmarshal([]byte(VersionMappings), &versionMap) @@ -124,38 +143,48 @@ func configTestSetup(ctx context.Context) []*configTest { configTests := []*configTest{} for _, agonesVersion := range versionMap.K8sToAgonesVersions[k8sVersion] { ct := configTest{} + // TODO: create different valid config based off of available feature gates. 
+ // containsCountsAndLists will need to be updated to return true for when CountsAndLists=true. + countsAndLists := containsCountsAndLists(agonesVersion) ct.agonesVersion = agonesVersion - if agonesVersion == "DEV" { + if agonesVersion == "Dev" { ct.agonesVersion = Dev // Game server container cannot be created at DEV version due to go.mod only able to access // published Agones versions. Use N-1 for DEV. - ct.gameServerPath = createGameServerFile(ReleaseVersion) + ct.gameServerPath = createGameServerFile(ReleaseVersion, countsAndLists) } else { - ct.gameServerPath = createGameServerFile(agonesVersion) + ct.gameServerPath = createGameServerFile(agonesVersion, countsAndLists) } - // TODO: create different valid config based off of available feature gates configTests = append(configTests, &ct) } return configTests } -// Finds the Kubernetes version of the Kubelet on the node that the current pod is running on. -// The Kubelet version is the same version as the node. -func findK8sVersion(ctx context.Context) string { - cfg, err := rest.InClusterConfig() - if err != nil { - log.Fatal("Could not create in cluster config", cfg) +// containsCountsAndLists returns true if the agonesVersion >= 1.41.0 when the CountsAndLists +// feature entered Beta (on by default) +func containsCountsAndLists(agonesVersion string) bool { + if agonesVersion == "Dev" { + return true } - - kubeClient, err := kubernetes.NewForConfig(cfg) + r := regexp.MustCompile(`\d+\.\d+`) + strVersion := r.FindString(agonesVersion) + floatVersion, err := strconv.ParseFloat(strVersion, 64) if err != nil { - log.Fatal("Could not create the kubernetes api clientset", err) + log.Fatalf("Could not convert agonesVersion %s to float: %s", agonesVersion, err) } + if floatVersion > 1.40 { + return true + } + return false +} +// Finds the Kubernetes version of the Kubelet on the node that the current pod is running on. +// The Kubelet version is the same version as the node. 
+func findK8sVersion(ctx context.Context, kubeClient *kubernetes.Clientset) string { // Wait to get pod and node as these may take a while to start on a new Autopilot cluster. var pod *v1.Pod - err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 7*time.Minute, true, func(ctx context.Context) (done bool, err error) { + err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 7*time.Minute, true, func(ctx context.Context) (done bool, err error) { pod, err = kubeClient.CoreV1().Pods(PodNamespace).Get(ctx, PodName, metav1.GetOptions{}) if err != nil { return false, nil @@ -251,6 +280,8 @@ func installAgonesRelease(version, registry, featureGates, imagePullPolicy, side } func runConfigWalker(ctx context.Context, validConfigs []*configTest) { + cancelCtx, cancel := context.WithCancel(ctx) + for _, config := range validConfigs { registry := AgonesRegistry chart := HelmChart @@ -277,7 +308,14 @@ func runConfigWalker(ctx context.Context, validConfigs []*configTest) { log.Fatalf("PollUntilContextTimeout timed out while attempting upgrade to Agones version %s. Helm Status %s", config.agonesVersion, helmStatus) } + + go createGameServers(cancelCtx, config.gameServerPath) + // Allow some soak time at the Agones version before next upgrade + time.Sleep(1 * time.Minute) } + cancel() + // TODO: Replace sleep with wait for the existing healthy Game Servers to finish naturally by reaching their shutdown phase. + time.Sleep(30 * time.Second) } // checkHelmStatus returns the status of the Helm release at a specified agonesVersion if it exists. @@ -304,8 +342,8 @@ func checkHelmStatus(agonesVersion string) string { // Creates a gameserver yaml file from the mounted gameserver.yaml template. The name of the new // gameserver yaml is based on the Agones version, i.e. 
gs1440.yaml for Agones version 1.44.0 -func createGameServerFile(agonesVersion string) string { - gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion} +func createGameServerFile(agonesVersion string, countsAndLists bool) string { + gsTmpl := gameServerTemplate{Registry: TestRegistry, AgonesVersion: agonesVersion, CountsAndLists: countsAndLists} gsTemplate, err := template.ParseFiles("gameserver.yaml") if err != nil { @@ -337,3 +375,96 @@ func createGameServerFile(agonesVersion string) string { return gsPath } + +// Create a game server every five seconds until the context is cancelled. The game server container +// will be the same binary version as the game server file. The SDK version is always the same as the +// version of the Agones controller that created it. The Game Server shuts itself down after the +// tests have run as part of the `sdk-client-test` logic. +func createGameServers(ctx context.Context, gsPath string) { + args := []string{"create", "-f", gsPath} + ticker := time.NewTicker(5 * time.Second) + + for { + select { + case <-ctx.Done(): + ticker.Stop() + return + case <-ticker.C: + _, err := runExecCommand(KubectlCmd, args...) + // TODO: Do not ignore error if unable to create due to something other than cluster scale up + if err != nil { + log.Printf("Could not create Gameserver %s: %s", gsPath, err) + } + } + } +} + +// watchGameServerPods watches all game server pods for CrashLoopBackOff. Errors if the number of +// CrashLoopBackOff pods exceeds the number of acceptedFailures. 
+func watchGameServerPods(kubeClient *kubernetes.Clientset, stopCh chan struct{}, failedPods map[string]podLog, acceptedFailures int) { + // Filter by label agones.dev/role=gameserver to only game server pods + labelOptions := informers.WithTweakListOptions(func(opts *metav1.ListOptions) { + opts.LabelSelector = "agones.dev/role=gameserver" + }) + kubeInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, 5*time.Second, + informers.WithNamespace("default"), labelOptions) + podInformer := kubeInformerFactory.Core().V1().Pods().Informer() + + _, err := podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + UpdateFunc: func(_, newObj interface{}) { + newPod := newObj.(*v1.Pod) + for _, cs := range newPod.Status.ContainerStatuses { + if cs.Name != "sdk-client-test" || cs.State.Waiting == nil || cs.State.Waiting.Reason != "CrashLoopBackOff" { + continue + } + gsVersion := newPod.Labels["agonesVersion"] + sdkVersion := newPod.Annotations["agones.dev/sdk-version"] + log.Printf("%s for pod: %s with game server binary version %s, and SDK version %s", cs.State.Waiting.Reason, newPod.Name, gsVersion, sdkVersion) + // Put failed pods into the map until it reaches capacity. + failedPods[newPod.Name] = podLog{GameServerVersion: gsVersion, SdkVersion: sdkVersion} + if len(failedPods) > acceptedFailures { + log.Fatalf("Too many Game Server pods in CrashLoopBackOff: %v", failedPods) + } + } + }, + }) + if err != nil { + log.Fatal("Not able to create AddEventHandler", err) + } + + go podInformer.Run(stopCh) + if !cache.WaitForCacheSync(stopCh, podInformer.HasSynced) { + log.Fatal("Timed out waiting for caches to sync") + } +} + +// Deletes any remaining Game Servers, Uninstalls Agones, and Deletes agones-system namespace. +func cleanUpResources() { + args := []string{"delete", "gs", "-l", "app=sdk-client-test"} + _, err := runExecCommand(KubectlCmd, args...) 
+ if err != nil { + log.Println("Could not delete game servers", err) + } + + args = []string{"uninstall", "agones", "-n", "agones-system"} + _, err = runExecCommand(HelmCmd, args...) + if err != nil { + log.Println("Could not Helm uninstall Agones", err) + } + + // Apiservice v1.allocation.agones.dev, which is part of Service agones-system/agones-controller-service, + // does not always get cleaned up on Helm uninstall, and needs to be deleted (if it exists) before + // the agones-system namespace can be removed. + // Ignore the error, because an "error" means Helm already uninstalled the apiservice. + args = []string{"delete", "apiservice", "v1.allocation.agones.dev"} + out, err := runExecCommand(KubectlCmd, args...) + if err == nil { + fmt.Println(string(out)) + } + + args = []string{"delete", "ns", "agones-system"} + _, err = runExecCommand(KubectlCmd, args...) + if err != nil { + log.Println("Could not delete agones-system namespace", err) + } +} diff --git a/test/upgrade/permissions.yaml b/test/upgrade/permissions.yaml index 2de3ae82f1..1f4a96005b 100644 --- a/test/upgrade/permissions.yaml +++ b/test/upgrade/permissions.yaml @@ -83,47 +83,47 @@ description: "This priority class should be used for upgrade-test-runner pods on apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: namespace-creator + name: namespace-manager rules: - apiGroups: [""] resources: ["namespaces"] - verbs: ["get", "list", "watch", "create"] + verbs: ["get", "list", "watch", "create", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: create-namespaces + name: manage-namespaces subjects: - kind: ServiceAccount name: agones-sa namespace: default roleRef: kind: ClusterRole - name: namespace-creator + name: namespace-manager apiGroup: rbac.authorization.k8s.io --- -# Helm needs to be able to create secrets +# Helm needs to be able to perform CRUD operations on secrets apiVersion: rbac.authorization.k8s.io/v1 kind: 
ClusterRole metadata: namespace: agones-system - name: secret-creator + name: secret-manager rules: - apiGroups: [""] resources: ["secrets"] - verbs: ["get", "watch", "list", "create", "patch", "update"] + verbs: ["get", "watch", "list", "create", "patch", "update", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: create-secrets + name: manage-secrets subjects: - kind: ServiceAccount name: agones-sa namespace: default roleRef: kind: ClusterRole - name: secret-creator + name: secret-manager apiGroup: rbac.authorization.k8s.io --- # Helm needs to be able to create priorityclasses @@ -376,7 +376,7 @@ rules: verbs: ["patch"] - apiGroups: ["agones.dev"] resources: ["fleets"] - verbs: ["get", "list", "update", "watch"] + verbs: ["create", "delete", "get", "list", "update", "watch"] - apiGroups: ["agones.dev"] resources: ["fleets/status", "gameserversets/status"] verbs: ["update"] @@ -439,6 +439,9 @@ subjects: - kind: User name: system:serviceaccount:default:agones-sa apiGroup: rbac.authorization.k8s.io + - kind: ServiceAccount + name: agones-sa + namespace: default roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole @@ -488,6 +491,9 @@ subjects: - kind: User name: system:serviceaccount:default:agones-sa apiGroup: rbac.authorization.k8s.io + - kind: ServiceAccount + name: agones-sa + namespace: default roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole diff --git a/test/upgrade/upgradeTest.yaml b/test/upgrade/upgradeTest.yaml index c02bdd3863..ebb301953c 100644 --- a/test/upgrade/upgradeTest.yaml +++ b/test/upgrade/upgradeTest.yaml @@ -26,8 +26,8 @@ spec: spec: containers: - name: upgrade-test-controller - # TODO(igooch): Update image name to use a templated value - image: $(REGISTRY)/upgrade-test-controller:0.1 + # TODO: Update image name to use a templated value for current Dev version + image: us-docker.pkg.dev/agones-images/ci/upgrade-test-controller:1.44.0-dev imagePullPolicy: Always env: - name: 
PodName diff --git a/test/upgrade/versionMap.yaml b/test/upgrade/versionMap.yaml index cb2295cfe7..b0a7499de0 100644 --- a/test/upgrade/versionMap.yaml +++ b/test/upgrade/versionMap.yaml @@ -55,18 +55,18 @@ data: "1.41.0", "1.42.0", "1.43.0", - "DEV" + "Dev" ], "1.29": [ "1.40.0", "1.41.0", "1.42.0", "1.43.0", - "DEV" + "Dev" ], "1.30": [ "1.43.0", - "DEV" + "Dev" ] }, "agonesVersionFeatureGates": { @@ -110,7 +110,7 @@ data: "alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix"], "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"] }, - "DEV": { + "Dev": { "alphaGates": ["GKEAutopilotExtendedDurationPods", "PlayerAllocationFilter", "PlayerTracking", "PortPolicyNone", "PortRanges", "RollingUpdateFix", "ScheduledAutoscaler"], "betaGates": ["AutopilotPassthroughPort", "CountsAndLists", "DisableResyncOnSDKServer"] }