diff --git a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java index 19c66c1ba0b8..4bd6ec65fc3a 100644 --- a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java +++ b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java @@ -1647,6 +1647,7 @@ public ConfigKey[] getConfigKeys() { KubernetesClusterStartTimeout, KubernetesClusterScaleTimeout, KubernetesClusterUpgradeTimeout, + KubernetesClusterUpgradeRetries, KubernetesClusterExperimentalFeaturesEnabled, KubernetesMaxClusterSize }; diff --git a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java index 420f35527207..12240a47ecac 100644 --- a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java +++ b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java @@ -65,6 +65,12 @@ public interface KubernetesClusterService extends PluggableService, Configurable "Timeout interval (in seconds) in which upgrade operation for a Kubernetes cluster should be completed. Not strictly obeyed while upgrade is in progress on a node", true, KubernetesServiceEnabled.key()); + static final ConfigKey KubernetesClusterUpgradeRetries = new ConfigKey("Advanced", Integer.class, + "cloud.kubernetes.cluster.upgrade.retries", + "3", + "The number of retries if fail to upgrade kubernetes cluster due to some reasons (e.g. drain node, etcdserver leader changed)", + true, + KubernetesServiceEnabled.key()); static final ConfigKey KubernetesClusterExperimentalFeaturesEnabled = new ConfigKey("Advanced", Boolean.class, "cloud.kubernetes.cluster.experimental.features.enabled", "false", diff --git a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java index 14f5760d5ae6..d418e20f58f7 100644 --- a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java +++ b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java @@ -77,39 +77,62 @@ private Pair runInstallScriptOnVM(final UserVm vm, final int in } private void upgradeKubernetesClusterNodes() { - Pair result = null; for (int i = 0; i < clusterVMs.size(); ++i) { UserVm vm = clusterVMs.get(i); String hostName = vm.getHostName(); if (StringUtils.isNotEmpty(hostName)) { hostName = hostName.toLowerCase(); } - result = null; + Pair result; if (LOGGER.isInfoEnabled()) { LOGGER.info(String.format("Upgrading node on VM %s in Kubernetes cluster %s with Kubernetes version(%s) ID: %s", vm.getDisplayName(), kubernetesCluster.getName(), upgradeVersion.getSemanticVersion(), upgradeVersion.getUuid())); } - try { - result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null, - String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName), - 10000, 10000, 60000); - } catch (Exception e) { - logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e); - } - if (!result.first()) { - logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null); + String errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()); + for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) { + try { + result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null, + String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName), + 10000, 10000, 60000); + if (result.first()) { + break; + } + if (retry > 0) { + LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry)); + } else { + logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null); + } + } catch (Exception e) { + if (retry > 0) { + LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry)); + } else { + logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e); + } + } } if (System.currentTimeMillis() > upgradeTimeoutTime) { logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null); } - try { - deployProvider(); - result = runInstallScriptOnVM(vm, i); - } catch (Exception e) { - logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e); - } - if (!result.first()) { - logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null); + errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()); + for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) { + try { + deployProvider(); + result = runInstallScriptOnVM(vm, i); + if (result.first()) { + break; + } + if (retry > 0) { + LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry)); + } else { + logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null); + } + } catch (Exception e) { + if (retry > 0) { + LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry)); + } else { + logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e); + } + } } if (System.currentTimeMillis() > upgradeTimeoutTime) { logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null); diff --git a/test/integration/smoke/test_kubernetes_clusters.py b/test/integration/smoke/test_kubernetes_clusters.py index dc8d42f444ca..0c5b96a77122 100644 --- a/test/integration/smoke/test_kubernetes_clusters.py +++ b/test/integration/smoke/test_kubernetes_clusters.py @@ -278,13 +278,15 @@ def deleteKubernetesSupportedVersion(cls, version_id): cls.apiclient.deleteKubernetesSupportedVersion(deleteKubernetesSupportedVersionCmd) @classmethod - def listKubernetesCluster(cls, cluster_id = None): + def listKubernetesCluster(cls, cluster_id = None, cluster_name = None): listKubernetesClustersCmd = listKubernetesClusters.listKubernetesClustersCmd() listKubernetesClustersCmd.listall = True if cluster_id != None: listKubernetesClustersCmd.id = cluster_id + if cluster_name != None: + listKubernetesClustersCmd.name = cluster_name clusterResponse = cls.apiclient.listKubernetesClusters(listKubernetesClustersCmd) - if cluster_id != None and clusterResponse != None: + if (cluster_id != None or cluster_name != None) and clusterResponse != None: return clusterResponse[0] return clusterResponse @@ -523,24 +525,6 @@ def test_06_delete_kubernetes_cluster(self): return - @attr(tags=["advanced", "smoke"], required_hardware="true") - @skipTestIf("hypervisorNotSupported") - def test_07_deploy_kubernetes_ha_cluster(self): - """Test to deploy a new HA Kubernetes cluster - - # Validate the following: - # 1. createKubernetesCluster should return valid info for new cluster - # 2. The Cloud Database contains the valid information - """ - if self.setup_failed == True: - self.fail("Setup incomplete") - if self.default_network: - self.skipTest("HA cluster on shared network requires external ip address, skipping it") - global k8s_cluster - k8s_cluster = self.getValidKubernetesCluster(1, 3) - self.debug("HA Kubernetes cluster with ID: %s successfully deployed" % k8s_cluster.id) - return - @attr(tags=["advanced", "smoke"], required_hardware="true") @skipTestIf("hypervisorNotSupported") def test_08_upgrade_kubernetes_ha_cluster(self): @@ -568,24 +552,6 @@ def test_08_upgrade_kubernetes_ha_cluster(self): self.debug("Kubernetes cluster with ID: %s successfully upgraded" % k8s_cluster.id) return - @attr(tags=["advanced", "smoke"], required_hardware="true") - @skipTestIf("hypervisorNotSupported") - def test_09_delete_kubernetes_ha_cluster(self): - """Test to delete a HA Kubernetes cluster - - # Validate the following: - # 1. deleteKubernetesCluster should delete an existing HA Kubernetes cluster - """ - if self.setup_failed == True: - self.fail("Setup incomplete") - if self.default_network: - self.skipTest("HA cluster on shared network requires external ip address, skipping it") - global k8s_cluster - k8s_cluster = self.getValidKubernetesCluster(1, 3) - - self.debug("Deleting Kubernetes cluster with ID: %s" % k8s_cluster.id) - return - @attr(tags=["advanced", "smoke"], required_hardware="true") @skipTestIf("hypervisorNotSupported") def test_10_vpc_tier_kubernetes_cluster(self): @@ -739,8 +705,14 @@ def createNewKubernetesCluster(self, version, size, control_nodes) : cluster = self.createKubernetesCluster(name, version.id, size, control_nodes) self.verifyKubernetesCluster(cluster, name, version.id, size, control_nodes) except Exception as ex: + cluster = self.listKubernetesCluster(cluster_name = name) + if cluster != None: + self.deleteKubernetesClusterAndVerify(cluster.id, False, True) self.fail("Kubernetes cluster deployment failed: %s" % ex) except AssertionError as err: + cluster = self.listKubernetesCluster(cluster_name = name) + if cluster != None: + self.deleteKubernetesClusterAndVerify(cluster.id, False, True) self.fail("Kubernetes cluster deployment failed during cluster verification: %s" % err) return cluster