Skip to content

Commit

Permalink
[PLAT-16631] Destroying a manual-onprem universe which failed in pre…
Browse files Browse the repository at this point in the history
…check leaves them in DECOMMISSIONED state

Summary: Check IP before decommissioning an on-prem node.

Test Plan: Manually created 2 universes - one was made to fail in precheck (connection), the other was created successfully. Both were deleted cleanly, with their nodes freed.

Reviewers: yshchetinin, hzare, anijhawan, muthu

Reviewed By: yshchetinin

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D41670
  • Loading branch information
nkhogen committed Feb 5, 2025
1 parent f86f9d6 commit 752170c
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 231 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
import com.yugabyte.yw.commissioner.tasks.subtasks.DeleteClusterFromUniverse;
import com.yugabyte.yw.commissioner.tasks.subtasks.InstanceActions;
import com.yugabyte.yw.commissioner.tasks.subtasks.InstanceExistCheck;
import com.yugabyte.yw.commissioner.tasks.subtasks.PrecheckNode;
import com.yugabyte.yw.commissioner.tasks.subtasks.PreflightNodeCheck;
import com.yugabyte.yw.commissioner.tasks.subtasks.SetupYNP;
import com.yugabyte.yw.commissioner.tasks.subtasks.UniverseSetTlsParams;
Expand Down Expand Up @@ -1346,34 +1345,6 @@ public SubTaskGroup createWaitForMasterLeaderTask() {
return subTaskGroup;
}

/**
* Creates a task that will always fail. Utility task to display preflight error messages.
*
* <p>Convenience overload: delegates to {@link #createFailedPrecheckTask(Map, boolean)} with
* {@code reserveNodes} set to {@code false}.
*
* @param failedNodes : map of nodeName to associated error message
* @return the subtask group returned by the two-argument overload
*/
public SubTaskGroup createFailedPrecheckTask(Map<String, String> failedNodes) {
return createFailedPrecheckTask(failedNodes, false);
}

/**
 * Creates a task that will always fail. Utility task to display preflight error messages.
 *
 * <p>A single {@code PrecheckNode} subtask is initialized with the supplied arguments, placed in
 * a subtask group named "PrecheckNode", and that group is added to the current runnable task.
 *
 * @param failedNodes : map of nodeName to associated error message
 * @param reserveNodes : whether to reserve nodes for this universe for future use
 * @return the subtask group that was added to the runnable task
 */
public SubTaskGroup createFailedPrecheckTask(
    Map<String, String> failedNodes, boolean reserveNodes) {
  final SubTaskGroup precheckGroup = createSubTaskGroup("PrecheckNode");

  // Carry the per-node errors and the reservation flag into the subtask parameters.
  final PrecheckNode.Params precheckParams = new PrecheckNode.Params();
  precheckParams.reserveNodes = reserveNodes;
  precheckParams.failedNodeNamesToError = failedNodes;

  final PrecheckNode precheckTask = createTask(PrecheckNode.class);
  precheckTask.initialize(precheckParams);
  precheckGroup.addSubTask(precheckTask);

  // Register the group on the runnable task before handing it back to the caller.
  getRunnableTask().addSubTaskGroup(precheckGroup);
  return precheckGroup;
}

protected void fillSetupParamsForNode(
AnsibleSetupServer.Params params, UserIntent userIntent, NodeDetails node) {
CloudSpecificInfo cloudInfo = node.cloudInfo;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,6 @@
import com.yugabyte.yw.models.DrConfig;
import com.yugabyte.yw.models.HighAvailabilityConfig;
import com.yugabyte.yw.models.NodeAgent;
import com.yugabyte.yw.models.NodeInstance;
import com.yugabyte.yw.models.PitrConfig;
import com.yugabyte.yw.models.Provider;
import com.yugabyte.yw.models.Restore;
Expand Down Expand Up @@ -1967,28 +1966,6 @@ public SubTaskGroup createDestroyServerTasks(
}

for (NodeDetails node : nodes) {
// Check if the private ip for the node is set. If not, that means we don't have
// a clean state to delete the node. Log it, free up the onprem node
// so that the client can use the node instance to create another universe.
if (node.cloudInfo.private_ip == null) {
log.warn(
String.format(
"Node %s doesn't have a private IP. Skipping node delete.", node.nodeName));
if (node.cloudInfo.cloud.equals(
com.yugabyte.yw.commissioner.Common.CloudType.onprem.name())) {
try {
NodeInstance providerNode = NodeInstance.getByName(node.nodeName);
providerNode.setToFailedCleanup(universe, node);
} catch (Exception ex) {
log.warn("On-prem node {} doesn't have a linked instance ", node.nodeName);
}
continue;
}
if (node.nodeUuid == null) {
// No other way to identify the node.
continue;
}
}
Cluster cluster = universe.getCluster(node.placementUuid);
AnsibleDestroyServer.Params params = new AnsibleDestroyServer.Params();
// Set the device information (numVolumes, volumeSize, etc.)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.Optional;
import javax.inject.Inject;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

@Slf4j
public class AnsibleClusterServerCtl extends NodeTaskBase {
Expand Down Expand Up @@ -72,6 +73,14 @@ public void run() {
taskParams().command);
return;
}
if (nodeDetails.cloudInfo == null || StringUtils.isEmpty(nodeDetails.cloudInfo.private_ip)) {
log.warn(
"Node {} has no IP in the universe {}. Skipping server control command - {}",
taskParams().nodeName,
universeOpt.get().getUniverseUUID(),
taskParams().command);
return;
}
if (ServerType.MASTER.name().equalsIgnoreCase(taskParams().process)
&& "start".equalsIgnoreCase(taskParams().command)
&& nodeDetails.masterState != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@
import com.yugabyte.yw.commissioner.Common;
import com.yugabyte.yw.commissioner.tasks.params.NodeTaskParams;
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.UserIntent;
import com.yugabyte.yw.models.NodeInstance;
import com.yugabyte.yw.models.Universe;
import com.yugabyte.yw.models.Universe.UniverseUpdater;
import com.yugabyte.yw.models.helpers.NodeDetails;
import java.util.Optional;
import javax.inject.Inject;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

@Slf4j
public class AnsibleDestroyServer extends NodeTaskBase {
Expand Down Expand Up @@ -60,10 +61,9 @@ private void removeNodeFromUniverse(final String nodeName) {
new UniverseUpdater() {
@Override
public void run(Universe universe) {
UniverseDefinitionTaskParams universeDetails = universe.getUniverseDetails();
universeDetails.removeNode(nodeName);
log.debug(
"Removing node {} from universe {}", nodeName, taskParams().getUniverseUUID());
universe.getUniverseDetails().removeNode(nodeName);
}
};

Expand All @@ -75,37 +75,44 @@ public void run() {
Universe universe = Universe.getOrBadRequest(taskParams().getUniverseUUID());
NodeDetails nodeDetails = universe.getNode(taskParams().nodeName);
if (nodeDetails == null) {
// Nothing can be done.
log.warn(
"Node {} is not found in the universe {}",
taskParams().nodeName,
universe.getUniverseUUID());
return;
}
boolean cleanupFailed = false;

// Execute the ansible command.
UserIntent userIntent =
universe.getUniverseDetails().getClusterByUuid(nodeDetails.placementUuid).userIntent;
if (userIntent.providerType == Common.CloudType.onprem
&& (nodeDetails.cloudInfo == null
|| StringUtils.isEmpty(nodeDetails.cloudInfo.private_ip))) {
// Node IP was never updated, nothing was changed. For onprem, it can just be cleared.
// For CSPs, the instance needs to be terminated.
log.warn(
"Onprem node {} has no IP in the universe {}",
taskParams().nodeName,
universe.getUniverseUUID());
NodeInstance.maybeGetByName(taskParams().nodeName).ifPresent(n -> n.clearNodeDetails());
return;
}
boolean cleanupFailed = true;
try {
getNodeManager()
.nodeCommand(NodeManager.NodeCommandType.Destroy, taskParams())
.processErrors();
cleanupFailed = false;
} catch (Exception e) {
if (!taskParams().isForceDelete) {
throw e;
} else {
cleanupFailed = true;
log.debug(
"Ignoring error deleting instance {} due to isForceDelete being set.",
taskParams().nodeName,
e);
}
}

UserIntent userIntent =
universe.getUniverseDetails().getClusterByUuid(nodeDetails.placementUuid).userIntent;

if (taskParams().deleteRootVolumes
&& !userIntent.providerType.equals(Common.CloudType.onprem)) {
if (taskParams().deleteRootVolumes && userIntent.providerType != Common.CloudType.onprem) {
try {
getNodeManager()
.nodeCommand(NodeManager.NodeCommandType.Delete_Root_Volumes, taskParams())
Expand Down Expand Up @@ -135,24 +142,19 @@ public void run() {
}
}

if (userIntent.providerType.equals(Common.CloudType.onprem)
if (userIntent.providerType == Common.CloudType.onprem
&& nodeDetails.state != NodeDetails.NodeState.Decommissioned) {
// Free up the node.
try {
NodeInstance providerNode = NodeInstance.getByName(taskParams().nodeName);
Optional<NodeInstance> nodeInstanceOpt = NodeInstance.maybeGetByName(taskParams().nodeName);
if (nodeInstanceOpt.isPresent()) {
if (cleanupFailed) {
log.info(
"Failed to clean node instance {}. Setting to decommissioned state",
taskParams().nodeName);
providerNode.setToFailedCleanup(universe, nodeDetails);
nodeInstanceOpt.get().setToFailedCleanup(universe, nodeDetails);
} else {
providerNode.clearNodeDetails();
nodeInstanceOpt.get().clearNodeDetails();
log.info("Marked node instance {} as available", taskParams().nodeName);
}
} catch (Exception e) {
if (!taskParams().isForceDelete) {
throw e;
}
}
}
// Update the node state to Terminated to mark that instance has been terminated. This is a
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -687,8 +687,6 @@ public enum TaskType {

AnsibleCreateServer(com.yugabyte.yw.commissioner.tasks.subtasks.AnsibleCreateServer.class),

PrecheckNode(com.yugabyte.yw.commissioner.tasks.subtasks.PrecheckNode.class),

PrecheckNodeDetached(com.yugabyte.yw.commissioner.tasks.subtasks.PrecheckNodeDetached.class),

AnsibleUpdateNodeInfo(com.yugabyte.yw.commissioner.tasks.subtasks.AnsibleUpdateNodeInfo.class),
Expand Down
Loading

0 comments on commit 752170c

Please sign in to comment.