diff --git a/deployments/deploy-ssh.sh b/deployments/deploy-ssh.sh
index ae8e318e9..a60971c20 100755
--- a/deployments/deploy-ssh.sh
+++ b/deployments/deploy-ssh.sh
@@ -600,6 +600,8 @@ spec:
         value: "${K8S_VENDOR}"
       - name: TORPEDO_SSH_USER
         value: "${TORPEDO_SSH_USER}"
+      - name: LB_SUBNET_KEY
+        value: "${LB_SUBNET_KEY}"
       - name: TORPEDO_SSH_PASSWORD
         value: "${TORPEDO_SSH_PASSWORD}"
       - name: TORPEDO_SSH_KEY
@@ -806,6 +808,10 @@ spec:
         value: "${EKS_CLUSTER_REGION}"
       - name: EKS_PX_NODEGROUP_NAME
         value: "${EKS_PX_NODEGROUP_NAME}"
+      - name: EKS_MAX_UNAVAILABLE_NODES_VALUE
+        value: "${EKS_MAX_UNAVAILABLE_NODES_VALUE}"
+      - name: EKS_MAX_UNAVAILABLE_PERCENTAGE_VALUE
+        value: "${EKS_MAX_UNAVAILABLE_PERCENTAGE_VALUE}"
       - name: IKS_CLUSTER_NAME
         value: "${IKS_CLUSTER_NAME}"
       - name: IKS_PX_WORKERPOOL_NAME
diff --git a/drivers/node/vsphere/vsphere.go b/drivers/node/vsphere/vsphere.go
index ca424e22e..df8bb6371 100644
--- a/drivers/node/vsphere/vsphere.go
+++ b/drivers/node/vsphere/vsphere.go
@@ -1018,8 +1018,13 @@ func filterTargetDatastores(ctx context.Context, sourceDatastore *object.Datastore
     availableSpace := dsProps.Summary.FreeSpace / 1024
     spaceAfterMove := availableSpace - totalDiskSizeToMove
     maxAllowedUsage := (dsProps.Summary.Capacity / 1024) * 90 / 100
+    log.Infof("Available Space is: %v", availableSpace)
+    log.Infof("Total Disk size to move is: %v", totalDiskSizeToMove)
+    log.Infof("Space after move is: %v", spaceAfterMove)
+    log.Infof("Max Allowed Usage is: %v", maxAllowedUsage)
 
-    if spaceAfterMove < 0 || spaceAfterMove > maxAllowedUsage {
+    usedSpace := (dsProps.Summary.Capacity / 1024) - availableSpace
+    if spaceAfterMove < 0 || (usedSpace+totalDiskSizeToMove > maxAllowedUsage) {
         log.Infof("Datastore %v does not have enough space or will exceed 90 percent capacity.", ds.Name())
         continue
     }
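For readability, here is a worked version of the revised datastore filter: a datastore is now rejected either when the disks being moved do not fit at all, or when total usage after the move would exceed 90 percent of capacity (the previous check compared free-space-after-move against the 90 percent threshold, which is a different quantity). The snippet below is a standalone sketch with illustrative values, not the driver code itself.

```go
package main

import "fmt"

// Standalone sketch of the revised datastore filter check; values are
// illustrative and expressed in KB, matching the /1024 conversions above.
func main() {
	capacity := int64(1_000_000)          // total datastore capacity
	freeSpace := int64(300_000)           // currently free
	totalDiskSizeToMove := int64(250_000) // disks that would be moved here

	usedSpace := capacity - freeSpace                 // 700,000
	spaceAfterMove := freeSpace - totalDiskSizeToMove // 50,000
	maxAllowedUsage := capacity * 90 / 100            // 900,000

	// Skip the datastore if the disks do not fit, or if usage after the
	// move would cross the 90 percent threshold (700,000 + 250,000 > 900,000).
	if spaceAfterMove < 0 || usedSpace+totalDiskSizeToMove > maxAllowedUsage {
		fmt.Println("skip datastore: not enough space or would exceed 90 percent capacity")
		return
	}
	fmt.Println("datastore accepted")
}
```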
diff --git a/drivers/scheduler/eks/eks.go b/drivers/scheduler/eks/eks.go
index 282072162..b44524116 100644
--- a/drivers/scheduler/eks/eks.go
+++ b/drivers/scheduler/eks/eks.go
@@ -3,6 +3,11 @@ package eks
 import (
     "context"
     "fmt"
+    "os"
+    "strconv"
+    "strings"
+    "time"
+
     "github.com/aws/aws-sdk-go-v2/aws"
     "github.com/aws/aws-sdk-go-v2/config"
     "github.com/aws/aws-sdk-go-v2/service/autoscaling"
@@ -19,9 +24,6 @@ import (
     "github.com/portworx/torpedo/drivers/scheduler"
     kube "github.com/portworx/torpedo/drivers/scheduler/k8s"
     "github.com/portworx/torpedo/pkg/log"
-    "os"
-    "strings"
-    "time"
 )
 
 const (
@@ -143,6 +145,96 @@ func (e *EKS) UpgradeNodeGroup(nodeGroupName string, version string) error {
     return nil
 }
 
+// SetMaxUnavailableForUpgradeInNodeGroup updates the Max Unavailable values used during upgrades for a given node group
+func (e *EKS) SetMaxUnavailableForUpgradeInNodeGroup(nodeGroupName string) error {
+    updateNodeGroup := false
+
+    log.Info("Checking if we need to configure EKS Node Group Max Unavailable values for surge upgrade...")
+    // Get Max Unavailable values from ENV vars, if present
+    maxUnavailableNodes := os.Getenv("EKS_MAX_UNAVAILABLE_NODES_VALUE")
+    maxUnavailablePercentage := os.Getenv("EKS_MAX_UNAVAILABLE_PERCENTAGE_VALUE")
+
+    // MaxUnavailable: aws.Int32(1) sets the maximum number of nodes that can be unavailable during the update
+    // MaxUnavailablePercentage: aws.Int32(25) sets the maximum percentage of nodes that can be unavailable during the update
+    updateConfig := &types.NodegroupUpdateConfig{}
+
+    if maxUnavailableNodes != "" {
+        log.Infof("Setting MaxUnavailable to [%s]", maxUnavailableNodes)
+        maxUnavailableNodesInt, err := strconv.Atoi(maxUnavailableNodes)
+        if err != nil {
+            return fmt.Errorf("failed to convert maxUnavailableNodes string [%s] to int, Err: %v", maxUnavailableNodes, err)
+        }
+        updateConfig.MaxUnavailable = aws.Int32(int32(maxUnavailableNodesInt))
+        updateNodeGroup = true
+    }
+
+    if maxUnavailablePercentage != "" {
+        log.Infof("Setting MaxUnavailablePercentage to [%s]", maxUnavailablePercentage)
+        maxUnavailablePercentageInt, err := strconv.Atoi(maxUnavailablePercentage)
+        if err != nil {
+            return fmt.Errorf("failed to convert maxUnavailablePercentage string [%s] to int, Err: %v", maxUnavailablePercentage, err)
+        }
+        updateConfig.MaxUnavailablePercentage = aws.Int32(int32(maxUnavailablePercentageInt))
+        updateNodeGroup = true
+    }
+
+    if !updateNodeGroup {
+        log.Info("Skipping updating Node Group Max Unavailable values as none were passed")
+        return nil
+    }
+
+    log.Infof("Updating Node Group [%s] with Max Unavailable values for surge upgrade for EKS cluster [%s]", nodeGroupName, e.clusterName)
+    _, err := e.eksClient.UpdateNodegroupConfig(context.TODO(), &eks.UpdateNodegroupConfigInput{
+        ClusterName:   aws.String(e.clusterName),
+        NodegroupName: aws.String(nodeGroupName),
+        UpdateConfig:  updateConfig,
+    })
+    if err != nil {
+        return fmt.Errorf("failed to configure Max Unavailable for EKS cluster [%s] node group [%s], Err: [%v]", e.clusterName, nodeGroupName, err)
+    }
+
+    if err := e.WaitForNodeGroupUpdate(nodeGroupName); err != nil {
+        return err
+    }
+
+    log.Infof("Successfully configured EKS cluster [%s] node group [%s] with Max Unavailable values for surge upgrade", e.clusterName, nodeGroupName)
+    return nil
+}
+
+// WaitForNodeGroupUpdate waits for the Node Group to be updated and reach the expected status
+func (e *EKS) WaitForNodeGroupUpdate(nodeGroupName string) error {
+    log.Infof("Waiting for EKS cluster [%s] Node Group [%s] to be updated", e.clusterName, nodeGroupName)
+    expectedUpgradeStatus := types.NodegroupStatusActive
+    t := func() (interface{}, bool, error) {
+        eksDescribeNodegroupOutput, err := e.eksClient.DescribeNodegroup(
+            context.TODO(),
+            &eks.DescribeNodegroupInput{
+                ClusterName:   aws.String(e.clusterName),
+                NodegroupName: aws.String(nodeGroupName),
+            },
+        )
+        if err != nil {
+            return nil, false, err
+        }
+        if eksDescribeNodegroupOutput.Nodegroup == nil {
+            return nil, false, fmt.Errorf("failed to describe EKS cluster [%s] Node Group [%s], node group not found", e.clusterName, nodeGroupName)
+        }
+        status := eksDescribeNodegroupOutput.Nodegroup.Status
+        if status == expectedUpgradeStatus {
+            return nil, false, nil
+        } else {
+            return nil, true, fmt.Errorf("waiting for EKS cluster [%s] Node Group [%s] update to complete, expected status [%s], actual status [%s]", e.clusterName, nodeGroupName, expectedUpgradeStatus, status)
+        }
+    }
+    _, err := task.DoRetryWithTimeout(t, defaultEKSUpgradeTimeout, defaultEKSUpgradeRetryInterval)
+    if err != nil {
+        return fmt.Errorf("failed to update EKS cluster [%s] Node Group [%s], Err: [%v]", e.clusterName, nodeGroupName, err)
+    }
+
+    log.Infof("Successfully updated EKS cluster [%s] Node Group [%s]", e.clusterName, nodeGroupName)
+    return nil
+}
+
 // WaitForNodeGroupToUpgrade waits for the EKS node group to be upgraded to the specified version
 func (e *EKS) WaitForNodeGroupToUpgrade(nodeGroupName string, version string) error {
     log.Infof("Waiting for EKS cluster [%s] node group [%s] to be upgraded to [%s]", e.clusterName, nodeGroupName, version)
@@ -325,6 +417,12 @@ func (e *EKS) UpgradeScheduler(version string) error {
         return fmt.Errorf("failed to wait for EKS cluster [%s] control plane to be upgraded to [%s], Err: [%v]", e.clusterName, version, err)
     }
 
+    // Update Max Unavailable values for Node Group
+    err = e.SetMaxUnavailableForUpgradeInNodeGroup(e.pxNodeGroupName)
+    if err != nil {
+        return fmt.Errorf("failed to configure EKS cluster [%s] node group [%s] Max Unavailable values, Err: [%v]", e.clusterName, e.pxNodeGroupName, err)
+    }
+
     // Upgrade Node Group
     err = e.UpgradeNodeGroup(e.pxNodeGroupName, version)
     if err != nil {
@@ -659,4 +757,4 @@ func (e *EKS) connect(n node.Node, listCmdsInput *ssm.ListCommandInvocationsInput
 func init() {
     e := &EKS{}
     scheduler.Register(SchedName, e)
-}
\ No newline at end of file
+}
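The new surge-upgrade knobs are driven entirely by the two environment variables that deploy-ssh.sh now forwards into the torpedo pod. Below is a minimal, hypothetical usage sketch; `exampleSurgeConfig` and the node group name are illustrative and not part of this change. AWS documents MaxUnavailable and MaxUnavailablePercentage as mutually exclusive, so typically only one of the two variables should be exported per run.

```go
package eks

import "os"

// exampleSurgeConfig is a hypothetical helper (not part of this change) showing
// how SetMaxUnavailableForUpgradeInNodeGroup is driven purely by environment
// variables; "px-nodegroup" is an illustrative node group name.
func exampleSurgeConfig(e *EKS) error {
	// Upgrade at most 2 nodes at a time. Leave EKS_MAX_UNAVAILABLE_PERCENTAGE_VALUE
	// unset in this case, since the EKS API accepts only one of the two settings.
	if err := os.Setenv("EKS_MAX_UNAVAILABLE_NODES_VALUE", "2"); err != nil {
		return err
	}
	return e.SetMaxUnavailableForUpgradeInNodeGroup("px-nodegroup")
}
```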
diff --git a/drivers/scheduler/k8s/k8s.go b/drivers/scheduler/k8s/k8s.go
index ca2ededd3..c8ead3dc2 100644
--- a/drivers/scheduler/k8s/k8s.go
+++ b/drivers/scheduler/k8s/k8s.go
@@ -5593,10 +5593,17 @@ func (k *K8s) createVirtualMachineObjects(
     if err != nil {
         return nil, fmt.Errorf("failed to retrieve VM after creating/waiting for DataVolumes: %v", err)
     }
-    // Check if the VM is in 'Running' phase.
-    if !vm.Status.Ready {
-        return nil, fmt.Errorf("VM is not in the expected 'Running' state")
+    // Poll until the VM reports Ready instead of failing on the first check.
+    t := func() (interface{}, bool, error) {
+        vm, err = k8sKubevirt.GetVirtualMachine(obj.Name, obj.Namespace)
+        if err != nil {
+            return nil, true, err
+        }
+        if vm.Status.Ready {
+            return nil, false, nil
+        }
+        return nil, true, fmt.Errorf("waiting for VM [%s] in namespace [%s] to be ready", obj.Name, obj.Namespace)
     }
+    if _, err = task.DoRetryWithTimeout(t, cdiImageImportTimeout, cdiImageImportRetry); err != nil {
+        return nil, err
+    }
     return vm, nil
 }
 }
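Both the node-group wait above and this VM readiness check rely on the same polling helper from sched-ops. For readers unfamiliar with its contract, here is a small self-contained sketch (timeout values are illustrative): the task function returns (result, retry, error), and returning retry=true asks the helper to call it again after the retry interval until it succeeds or the overall timeout expires.

```go
package main

import (
	"fmt"
	"time"

	"github.com/portworx/sched-ops/task"
)

func main() {
	attempts := 0
	// The task func returns (result, retry, error). retry=true asks
	// DoRetryWithTimeout to call it again after the retry interval,
	// until it succeeds (nil error) or the overall timeout expires.
	poll := func() (interface{}, bool, error) {
		attempts++
		if attempts < 3 {
			return nil, true, fmt.Errorf("not ready yet (attempt %d)", attempts)
		}
		return "ready", false, nil
	}

	result, err := task.DoRetryWithTimeout(poll, 30*time.Second, 2*time.Second)
	if err != nil {
		fmt.Println("gave up:", err)
		return
	}
	fmt.Println("final state:", result)
}
```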
"github.com/portworx/torpedo/tests" "golang.org/x/sync/errgroup" + "strings" ) // This testcase creates Azure cloud account with mandatory and non-mandatory fields and take backup & restore @@ -51,7 +53,7 @@ var _ = Describe("{AzureCloudAccountCreationWithMandatoryAndNonMandatoryFields}" ) JustBeforeEach(func() { - StartPxBackupTorpedoTest("AzureCloudAccountCreationWithMandatoryAndNonMandatoryFields", "Azure cloud account with mandatory and non mandatory fields", nil, 31661, Sagrawal, Q2FY25) + StartPxBackupTorpedoTest("AzureCloudAccountCreationWithMandatoryAndNonMandatoryFields", "Azure cloud account with mandatory and non mandatory fields", nil, 300036, Sagrawal, Q2FY25) backupLocationMap1 = make(map[string]string) cloudCredentialMap1 = make(map[string]string) backupLocationMap2 = make(map[string]string) @@ -229,3 +231,77 @@ var _ = Describe("{AzureCloudAccountCreationWithMandatoryAndNonMandatoryFields}" log.FailOnError(err, "Data validations failed") }) }) + +// This testcase verifies the error message while creating immutable backup location with few mandatory parameters missing for immutable backup location while creating cloud cred +var _ = Describe("{AzureCloudAccountForLockedBucket}", Label(TestCaseLabelsMap[AzureCloudAccountForLockedBucket]...), func() { + + var ( + credUidWithMandatoryFields string + azureCredNameWithMandatoryFields string + azureBackupLocationNameWithMandatoryFields string + backupLocationMandatoryFieldsUID string + containerLevelStorageAccount string + containerLevelStorageAccountKey string + azureImmutableBucket string + backupLocationMap map[string]string + cloudCredentialMap map[string]string + azureConfigFields *api.AzureConfig + scheduledAppContexts []*scheduler.Context + ) + + JustBeforeEach(func() { + StartPxBackupTorpedoTest("AzureCloudAccountForLockedBucket", "Azure cloud account for immutable bucket", nil, 300037, Sagrawal, Q2FY25) + backupLocationMap = make(map[string]string) + cloudCredentialMap = make(map[string]string) + }) + + It("Azure cloud account with mandatory and non mandatory fields", func() { + ctx, err := backup.GetAdminCtxFromSecret() + log.FailOnError(err, "Fetching px-central-admin ctx") + _, containerLevelStorageAccount, containerLevelStorageAccountKey, _, _, _, _ = GetAzureImmutabilityCredsFromEnv() + + Step("Creating azure cloud account with only mandatory fields", func() { + log.InfoD("Creating azure cloud account with only mandatory fields") + credUidWithMandatoryFields = uuid.New() + azureConfigFields = &api.AzureConfig{ + AccountName: containerLevelStorageAccount, + AccountKey: containerLevelStorageAccountKey, + } + azureCredNameWithMandatoryFields = fmt.Sprintf("%s-azure-cred-with-mandatory-fields-immutable", RandomString(5)) + err = CreateAzureCloudCredential(azureCredNameWithMandatoryFields, credUidWithMandatoryFields, BackupOrgID, azureConfigFields, ctx) + dash.VerifyFatal(err, nil, fmt.Sprintf("Verifying creation of azure cloud credential named [%s] for org [%s] having only mandatory fields for immutable bucket", azureCredNameWithMandatoryFields, BackupOrgID)) + cloudCredentialMap[azureCredNameWithMandatoryFields] = credUidWithMandatoryFields + + log.InfoD("Creating azure immutable bucket") + azureImmutableBucket = RandomString(5) + "azure-immutable-bucket" + CreateAzureBucket(azureImmutableBucket, true, Container_level, 2, true) + + log.InfoD("Creating azure immutable backup location with mandatory fields in azure credentials") + azureBackupLocationNameWithMandatoryFields = 
+            backupLocationMandatoryFieldsUID = uuid.New()
+            backupLocationMap[backupLocationMandatoryFieldsUID] = azureBackupLocationNameWithMandatoryFields
+            err = CreateBackupLocation("azure", azureBackupLocationNameWithMandatoryFields, backupLocationMandatoryFieldsUID, azureCredNameWithMandatoryFields, credUidWithMandatoryFields, azureImmutableBucket, BackupOrgID, "", true)
+            dash.VerifyFatal(strings.Contains(err.Error(), fmt.Sprintf("error in obtaining object-lock info for the bucket %v: secret can't be empty string", azureBackupLocationNameWithMandatoryFields)), true, fmt.Sprintf("Verifying the error message while creating immutable backup location %v when mandatory immutability parameters are missing from the cloud credential", azureBackupLocationNameWithMandatoryFields))
+        })
+    })
+
+    JustAfterEach(func() {
+        defer EndPxBackupTorpedoTest(scheduledAppContexts)
+        ctx, err := backup.GetAdminCtxFromSecret()
+        log.FailOnError(err, "Fetching px-central-admin ctx")
+        err = DeleteCloudCredentialWithContext(azureCredNameWithMandatoryFields, BackupOrgID, credUidWithMandatoryFields, ctx)
+        Inst().Dash.VerifyFatal(err, nil, fmt.Sprintf("Verifying deletion of cloud cred [%s]", azureCredNameWithMandatoryFields))
+        cloudCredDeleteStatus := func() (interface{}, bool, error) {
+            status, err := IsCloudCredPresent(azureCredNameWithMandatoryFields, ctx, BackupOrgID)
+            if err != nil {
+                return "", true, fmt.Errorf("failed to check if cloud cred %s is present, Err: %v", azureCredNameWithMandatoryFields, err)
+            }
+            if status {
+                return "", true, fmt.Errorf("cloud cred %s is not deleted yet", azureCredNameWithMandatoryFields)
+            }
+            return "", false, nil
+        }
+        _, err = task.DoRetryWithTimeout(cloudCredDeleteStatus, CloudAccountDeleteTimeout, CloudAccountDeleteRetryTime)
+        Inst().Dash.VerifySafely(err, nil, fmt.Sprintf("Deleting cloud cred %s", azureCredNameWithMandatoryFields))
+    })
+})
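One optional hardening for the new assertion (a sketch only, not part of the change): if CreateBackupLocation ever returned nil here, calling err.Error() would panic; asserting first that an error occurred keeps the failure readable. The fragment below reuses the variables from the Step above.

```go
// Hypothetical variant of the assertion above with a nil-error guard.
expectedErr := fmt.Sprintf("error in obtaining object-lock info for the bucket %v: secret can't be empty string", azureBackupLocationNameWithMandatoryFields)
dash.VerifyFatal(err != nil, true, "Expecting immutable backup location creation to fail when immutability credentials are incomplete")
if err != nil {
	dash.VerifyFatal(strings.Contains(err.Error(), expectedErr), true, fmt.Sprintf("Verifying the error message for backup location %s", azureBackupLocationNameWithMandatoryFields))
}
```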
diff --git a/tests/backup/backup_test_labels.go b/tests/backup/backup_test_labels.go
index 0362187d7..2513aeef3 100644
--- a/tests/backup/backup_test_labels.go
+++ b/tests/backup/backup_test_labels.go
@@ -135,6 +135,7 @@ const (
     PartialBackupSuccessWithAzureEndpoint                        TestCaseName = "PartialBackupSuccessWithAzureEndpoint"
     PSALowerPrivilegeToHigherPrivilegeWithProjectMapping         TestCaseName = "PSALowerPrivilegeToHigherPrivilegeWithProjectMapping"
     AzureCloudAccountCreationWithMandatoryAndNonMandatoryFields  TestCaseName = "AzureCloudAccountCreationWithMandatoryAndNonMandatoryFields"
+    AzureCloudAccountForLockedBucket                             TestCaseName = "AzureCloudAccountForLockedBucket"
 )
 
 // Test case labels
@@ -269,6 +270,7 @@ const (
     PsaTakeBackupInLowerPrevilegeRestoreInHigherPrivilege             TestCaseLabel = "PsaTakeBackupInLowerPrevilegeRestoreInHigherPrivilege"
     PSALowerPrivilegeToHigherPrivilegeWithProjectMappingLabel         TestCaseLabel = "PSALowerPrivilegeToHigherPrivilegeWithProjectMapping"
     AzureCloudAccountCreationWithMandatoryAndNonMandatoryFieldsLabel  TestCaseLabel = "AzureCloudAccountCreationWithMandatoryAndNonMandatoryFields"
+    AzureCloudAccountForLockedBucketLabel                             TestCaseLabel = "AzureCloudAccountForLockedBucket"
 )
 
 // Common Labels
diff --git a/tests/common.go b/tests/common.go
index 3b667f1c6..ea8e29fcc 100644
--- a/tests/common.go
+++ b/tests/common.go
@@ -608,6 +608,7 @@ var (
     contextsCreated          []*scheduler.Context
     CurrentClusterConfigPath = ""
     clusterProvider          = "aws"
+    ClusterPrefix            = "default"
 )
 
 var (
@@ -5202,6 +5203,110 @@ func CreateApplicationClusters(orgID string, cloudName string, uid string, ctx context1.Context
     return nil
 }
 
+// CreateDuplicateApplicationClusters adds n cluster objects to Px-Backup using the supplied kubeconfig path
+func CreateDuplicateApplicationClusters(orgID string, cloudName string, uid string, ctx context1.Context, configPath string, n int, clusterPrefix string) error {
+    var clusterCredName string
+    var clusterCredUid string
+
+    clusterCreation := func(clusterCredName string, clusterCredUid string, clusterName string, configPath string) error {
+        err := CreateCluster(clusterName, configPath, orgID, clusterCredName, clusterCredUid, ctx)
+        if err != nil && !strings.Contains(err.Error(), "already exists with status: Online") {
+            return err
+        }
+        // Check cluster status
+        clusterStatus, err := Inst().Backup.GetClusterStatus(BackupOrgID, clusterName, ctx)
+        if err != nil {
+            log.FailOnError(err, fmt.Sprintf("Fetching [%s] cluster status", clusterName))
+        }
+        dash.VerifyFatal(clusterStatus, api.ClusterInfo_StatusInfo_Online, fmt.Sprintf("Verifying if [%s] cluster is online", clusterName))
+        return nil
+    }
+
+    clusterProvider := GetClusterProviders()
+    for _, provider := range clusterProvider {
+        switch provider {
+        case drivers.ProviderAzure, drivers.ProviderAws, drivers.ProviderGke, drivers.ProviderIbm:
+            for i := 0; i < n; i++ {
+                clusterCredName = fmt.Sprintf("%v-%v-cloud-cred-%v", provider, cloudName, RandomString(5))
+                clusterCredUid = uuid.New()
+                log.Infof("Creating cloud credential for cluster")
+                err := CreateCloudCredential(provider, clusterCredName, clusterCredUid, orgID, ctx, cloudName)
+                if err != nil {
+                    if strings.Contains(err.Error(), CreateCloudCredentialError) {
+                        log.Infof("The error is - %v", err.Error())
+                        adminCtx, err := backup.GetAdminCtxFromSecret()
+                        if err != nil {
+                            return fmt.Errorf("failed to fetch px-central-admin ctx with error %v", err)
+                        }
+                        log.Infof("Creating cloud credential %s from admin context and sharing with all the users", clusterCredName)
+                        err = CreateCloudCredential(provider, clusterCredName, clusterCredUid, orgID, adminCtx, cloudName)
+                        if err != nil {
+                            return fmt.Errorf("failed to create cloud cred %s with error %v", clusterCredName, err)
+                        }
+                        err = AddCloudCredentialOwnership(clusterCredName, clusterCredUid, nil, nil, 0, Read, adminCtx, orgID)
+                        if err != nil {
+                            return fmt.Errorf("failed to share the cloud cred with error %v", err)
+                        }
+                    } else {
+                        return fmt.Errorf("failed to create cloud cred with error: %v", err)
+                    }
+                }
+                clusterName := fmt.Sprintf("cluster-%s-%d", clusterPrefix, i+1)
+                err = clusterCreation(clusterCredName, clusterCredUid, clusterName, configPath)
+                if err != nil {
+                    return err
+                }
+                ClusterConfigPathMap[clusterName] = configPath // Store the cluster name and its kubeconfig path
+            }
+
+        case drivers.ProviderRke:
+            for i := 0; i < n; i++ {
+                clusterCredName = fmt.Sprintf("%v-%v-cloud-cred-%v", provider, cloudName, RandomString(5))
+                clusterCredUid = uuid.New()
+                log.Infof("Creating cloud credential for cluster")
+                err := CreateCloudCredential(provider, clusterCredName, clusterCredUid, orgID, ctx, cloudName)
+                if err != nil {
+                    if strings.Contains(err.Error(), CreateCloudCredentialError) {
+                        log.Infof("The error is - %v", err.Error())
+                        adminCtx, err := backup.GetAdminCtxFromSecret()
+                        if err != nil {
+                            return fmt.Errorf("failed to fetch px-central-admin ctx with error %v", err)
+                        }
+                        log.Infof("Creating cloud credential %s from admin context and sharing with all the users", clusterCredName)
+                        err = CreateCloudCredential(provider, clusterCredName, clusterCredUid, orgID, adminCtx, cloudName)
+                        if err != nil {
+                            return fmt.Errorf("failed to create cloud cred %s with error %v", clusterCredName, err)
+                        }
+                        err = AddCloudCredentialOwnership(clusterCredName, clusterCredUid, nil, nil, Invalid, Read, adminCtx, orgID)
+                        if err != nil {
+                            return fmt.Errorf("failed to share the cloud cred with error %v", err)
+                        }
+                    } else {
+                        return fmt.Errorf("failed to create cloud cred with error: %v", err)
+                    }
+                }
+                clusterName := fmt.Sprintf("cluster-%s-%d", clusterPrefix, i+1)
+                err = clusterCreation(clusterCredName, clusterCredUid, clusterName, configPath)
+                if err != nil {
+                    return err
+                }
+                ClusterConfigPathMap[clusterName] = configPath // Store the cluster name and its kubeconfig path
+            }
+
+        default:
+            for i := 0; i < n; i++ {
+                clusterName := fmt.Sprintf("cluster-%s-%d", clusterPrefix, i+1)
+                err := clusterCreation(clusterCredName, clusterCredUid, clusterName, configPath)
+                if err != nil {
+                    return err
+                }
+                ClusterConfigPathMap[clusterName] = configPath // Store the cluster name and its kubeconfig path
+            }
+        }
+    }
+    return nil
+}
+
 // AddAzureApplicationClusters adds azure application cluster using the given cloud credential
 func AddAzureApplicationClusters(orgID string, clusterCredName string, clusterCredUid string, ctx context1.Context) error {
     kubeconfigs := os.Getenv("KUBECONFIGS")
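A minimal sketch of how a test might drive the new helper; the kubeconfig path, cloud name, and prefix below are illustrative, and the uid argument is not referenced by the helper as written.

```go
// Hypothetical call site, e.g. inside a Ginkgo It block.
ctx, err := backup.GetAdminCtxFromSecret()
log.FailOnError(err, "Fetching px-central-admin ctx")

// Registers cluster-dup-1 .. cluster-dup-3 in Px-Backup, all pointing at the
// same (illustrative) kubeconfig, and records each name in ClusterConfigPathMap.
err = CreateDuplicateApplicationClusters(BackupOrgID, "aws", "", ctx, "/tmp/source-kubeconfig", 3, "dup")
log.FailOnError(err, "Creating duplicate application clusters")
```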