
Commit

Prevent successful containers from overwriting failed containers

- Added logic to ensure that successful containers no longer overwrite failed containers, preserving the correct pod status.
- Improved handling of init containers:
  - If a pod has init containers, the pod's status now checks their completion first.
  - On the plugin side, init containers must be executed correctly: they must run sequentially and complete before the main containers start (see the sketch below).
Bianco95 committed Jan 13, 2025
1 parent 7b6addf commit 87d9c59
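
The plugin-side requirement above is not part of this diff, since it lives in each plugin. A minimal sketch of what it implies, assuming hypothetical plugin hooks runInitContainer and runMainContainers (neither is an interLink API):

package plugin

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// executePod runs init containers one at a time, in spec order; a non-zero
// exit code aborts the pod before any main container starts.
func executePod(ctx context.Context, pod *v1.Pod) error {
	for _, initCt := range pod.Spec.InitContainers {
		// Run sequentially: each init container must terminate
		// successfully before the next one starts.
		exitCode, err := runInitContainer(ctx, pod, initCt)
		if err != nil {
			return err
		}
		if exitCode != 0 {
			return fmt.Errorf("init container %s exited with code %d", initCt.Name, exitCode)
		}
	}
	// Main containers start only after every init container has completed.
	return runMainContainers(ctx, pod)
}

// Hypothetical hooks a concrete plugin would provide; bodies are stubs here.
func runInitContainer(ctx context.Context, pod *v1.Pod, ct v1.Container) (int32, error) {
	return 0, nil // plugin-specific: submit, wait, return exit code
}

func runMainContainers(ctx context.Context, pod *v1.Pod) error {
	return nil // plugin-specific: start the pod's main containers
}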
Showing 3 changed files with 130 additions and 43 deletions.
2 changes: 0 additions & 2 deletions pkg/interlink/api/status.go
@@ -80,7 +80,6 @@ func (h *InterLinkHandler) StatusHandler(w http.ResponseWriter, r *http.Request)

log.G(h.Ctx).Info("InterLink: forwarding GetStatus call to sidecar")
req.Header.Set("Content-Type", "application/json")
log.G(h.Ctx).Debug("Interlink get status request content:", req)

sessionContext := GetSessionContext(r)
bodyBytes, err = ReqWithError(h.Ctx, req, w, start, span, false, true, sessionContext, h.ClientHTTP)
@@ -127,7 +126,6 @@ func (h *InterLinkHandler) StatusHandler(w http.ResponseWriter, r *http.Request)
log.G(h.Ctx).Error(err)
return
}
log.G(h.Ctx).Debug("InterLink: status " + string(returnValue))

w.WriteHeader(statusCode)
_, err = w.Write(returnValue)
11 changes: 6 additions & 5 deletions pkg/interlink/types.go
@@ -15,11 +15,12 @@ type PodCreateRequests struct {

// PodStatus is a simplified v1.Pod struct, holding only the variables needed to uniquely identify a job/service in the sidecar. It is used to request and report the status of a pod.
type PodStatus struct {
PodName string `json:"name"`
PodUID string `json:"UID"`
PodNamespace string `json:"namespace"`
JobID string `json:"JID"`
Containers []v1.ContainerStatus `json:"containers"`
PodName string `json:"name"`
PodUID string `json:"UID"`
PodNamespace string `json:"namespace"`
JobID string `json:"JID"`
Containers []v1.ContainerStatus `json:"containers"`
InitContainers []v1.ContainerStatus `json:"initContainers"`
}

// CreateStruct is the response to be received from interLink whenever asked to create a pod. It allows mapping the remote ID to the pod UUID
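For illustration, a sidecar building its status reply would now fill the new field alongside Containers. A hedged sketch with invented values (buildStatus, jobID, and the container names are placeholders, not from this repo):

import (
	v1 "k8s.io/api/core/v1"
	// plus: types "<module>/pkg/interlink", as aliased in execute.go
)

// buildStatus is a hypothetical helper showing how a plugin might populate
// the new InitContainers field; the values are invented for the example.
func buildStatus(pod *v1.Pod, jobID string) types.PodStatus {
	return types.PodStatus{
		PodName:      pod.Name,
		PodUID:       string(pod.UID),
		PodNamespace: pod.Namespace,
		JobID:        jobID,
		Containers: []v1.ContainerStatus{
			{Name: "main", State: v1.ContainerState{
				Running: &v1.ContainerStateRunning{},
			}},
		},
		// New: init container states, which checkPodsStatus now inspects first.
		InitContainers: []v1.ContainerStatus{
			{Name: "init-db", State: v1.ContainerState{
				Terminated: &v1.ContainerStateTerminated{ExitCode: 0, Reason: "Completed"},
			}},
		},
	}
}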
160 changes: 124 additions & 36 deletions pkg/virtualkubelet/execute.go
@@ -387,6 +387,11 @@ func LogRetrieval(ctx context.Context, config Config, logsRequest types.LogStruc
AddSessionContext(req, sessionContext)

logTransport := http.DefaultTransport.(*http.Transport).Clone()

// logTransport.TLSClientConfig = &tls.Config{
// InsecureSkipVerify: true,
// }

// logTransport.DisableKeepAlives = true
// logTransport.MaxIdleConnsPerHost = -1
var logHTTPClient = &http.Client{Transport: logTransport}
@@ -539,8 +544,6 @@ func RemoteExecution(ctx context.Context, config Config, p *Provider, pod *v1.Po
// After the statusRequest returns a response, this function uses that response to update every Pod and Container status.
func checkPodsStatus(ctx context.Context, p *Provider, podsList []*v1.Pod, token string, config Config) ([]types.PodStatus, error) {
var ret []types.PodStatus
// commented out because it's too verbose. uncomment to see all registered pods
// log.G(ctx).Debug(p.pods)

// retrieve pod status from remote interlink
returnVal, err := statusRequest(ctx, config, podsList, token)
@@ -574,12 +577,71 @@

// if the PodUID matches the one in etcd, we are talking about the same pod. GOOD
if podRemoteStatus.PodUID == string(podRefInCluster.UID) {
podRunning := false
podErrored := false
podCompleted := false
podInit := false // if an init container is running, the other containers' phase is PodInitializing
podRunning := false // if a normal container is running, the phase is PodRunning
podErrored := false // if a container is in error, the phase is PodFailed
podCompleted := false // if all containers are terminated, the phase is PodSucceeded, but if one is in error, the phase is PodFailed
podWaitingForInitContainers := false // if init containers are waiting, the phase is PodPending
failedReason := ""

// For each container of the pod we check if there is a previous state known by K8s
nContainersInPod := len(podRemoteStatus.Containers)
counterOfTerminatedContainers := 0

nInitContainersInPod := len(podRemoteStatus.InitContainers)
counterOfTerminatedInitContainers := 0

log.G(ctx).Debug("Number of containers in POD: " + strconv.Itoa(nContainersInPod))
log.G(ctx).Debug("Number of init containers in POD: " + strconv.Itoa(nContainersInPod))

// if there are init containers, we need to check them first
if nInitContainersInPod > 0 {

log.G(ctx).Debug("Init containers detected, going to check them first")

for _, containerRemoteStatus := range podRemoteStatus.InitContainers {
index := 0
foundCt := false

for i, checkedContainer := range podRefInCluster.Status.InitContainerStatuses {
if checkedContainer.Name == containerRemoteStatus.Name {
foundCt = true
index = i
}
}

if !foundCt {
podRefInCluster.Status.InitContainerStatuses = append(podRefInCluster.Status.InitContainerStatuses, containerRemoteStatus)
index = len(podRefInCluster.Status.InitContainerStatuses) - 1 // point index at the appended entry so the writes below update it
} else {
podRefInCluster.Status.InitContainerStatuses[index] = containerRemoteStatus
}

switch {
case containerRemoteStatus.State.Terminated != nil:
counterOfTerminatedInitContainers = counterOfTerminatedInitContainers + 1
podRefInCluster.Status.InitContainerStatuses[index].State.Terminated.ExitCode = containerRemoteStatus.State.Terminated.ExitCode
podRefInCluster.Status.InitContainerStatuses[index].State.Terminated.Reason = "Completed"
if containerRemoteStatus.State.Terminated.ExitCode != 0 {
podErrored = true
failedReason = "Error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode))
podRefInCluster.Status.InitContainerStatuses[index].State.Terminated.Reason = failedReason
log.G(ctx).Error("Container " + containerRemoteStatus.Name + " exited with error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode)))
}
case containerRemoteStatus.State.Waiting != nil:
log.G(ctx).Info("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is setting up on Sidecar")
podWaitingForInitContainers = true
podRefInCluster.Status.InitContainerStatuses[index].State.Waiting = containerRemoteStatus.State.Waiting
case containerRemoteStatus.State.Running != nil:
podInit = true
log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is running on Sidecar")
podRefInCluster.Status.InitContainerStatuses[index].State.Running = containerRemoteStatus.State.Running
podRefInCluster.Status.InitContainerStatuses[index].State.Waiting = nil
}
}
if counterOfTerminatedInitContainers == nInitContainersInPod {
podWaitingForInitContainers = false
}
}

for _, containerRemoteStatus := range podRemoteStatus.Containers {
index := 0
foundCt := false
Expand All @@ -598,47 +660,73 @@ func checkPodsStatus(ctx context.Context, p *Provider, podsList []*v1.Pod, token
podRefInCluster.Status.ContainerStatuses[index] = containerRemoteStatus
}

log.G(ctx).Debug(containerRemoteStatus.State.Running)

// if the plugin does not return any non-terminated container, set the status to terminated
// if the exit code is != 0 get the error and set error reason + remember to set pod to failed
switch {
case containerRemoteStatus.State.Terminated != nil:
log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is not running on Plugin side")
podCompleted = true
podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = "Completed"
if containerRemoteStatus.State.Terminated.ExitCode != 0 {
podErrored = true
failedReason = "Error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode))
podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = failedReason
log.G(ctx).Error("Container " + containerRemoteStatus.Name + " exited with error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode)))
// if the pod is waiting for the init containers to start, or some of them are still running,
// all the other containers are in the waiting state
if podWaitingForInitContainers || podInit {
podRefInCluster.Status.ContainerStatuses[index].State.Waiting = &v1.ContainerStateWaiting{Reason: "Waiting for init containers"}
podRefInCluster.Status.ContainerStatuses[index].State.Running = nil
podRefInCluster.Status.ContainerStatuses[index].State.Terminated = nil
if podInit {
podRefInCluster.Status.ContainerStatuses[index].State.Waiting.Reason = "Init:" + strconv.Itoa(counterOfTerminatedInitContainers) + "/" + strconv.Itoa(nInitContainersInPod)
} else {
podRefInCluster.Status.ContainerStatuses[index].State.Waiting.Reason = "PodInitializing"
}
} else {
// if the plugin does not return any non-terminated container, set the status to terminated
// if the exit code is != 0 get the error and set error reason + remember to set pod to failed
switch {
case containerRemoteStatus.State.Terminated != nil:
log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is not running on Plugin side")
counterOfTerminatedContainers = counterOfTerminatedContainers + 1
podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = "Completed"
if containerRemoteStatus.State.Terminated.ExitCode != 0 {
podErrored = true
failedReason = "Error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode))
podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = failedReason
log.G(ctx).Error("Container " + containerRemoteStatus.Name + " exited with error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode)))
}
case containerRemoteStatus.State.Waiting != nil:
log.G(ctx).Info("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is setting up on Sidecar")
podRunning = true
case containerRemoteStatus.State.Running != nil:
podRunning = true
log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is running on Sidecar")
}
case containerRemoteStatus.State.Waiting != nil:
log.G(ctx).Info("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is setting up on Sidecar")
podRunning = true
case containerRemoteStatus.State.Running != nil:
podRunning = true
log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is running on Sidecar")
}
}
if counterOfTerminatedContainers == nContainersInPod {
podCompleted = true
}

// if this is the first time you see a container running/errored/completed, update the status of the pod.
switch {
case podRunning && podRefInCluster.Status.Phase != v1.PodRunning:
podRefInCluster.Status.Phase = v1.PodRunning
podRefInCluster.Status.Conditions = append(podRefInCluster.Status.Conditions, v1.PodCondition{Type: v1.PodReady, Status: v1.ConditionTrue})
case podErrored && podRefInCluster.Status.Phase != v1.PodFailed:
if podCompleted {
// it means that all containers are terminated, check if some of them are errored
if podErrored {
podRefInCluster.Status.Phase = v1.PodFailed
podRefInCluster.Status.Reason = failedReason
case podCompleted && podRefInCluster.Status.Phase != v1.PodSucceeded:
// override all the ContainerStatuses to set Reason to failedReason
for i := range podRefInCluster.Status.ContainerStatuses {
podRefInCluster.Status.ContainerStatuses[i].State.Terminated.Reason = failedReason
}
} else {
podRefInCluster.Status.Conditions = append(podRefInCluster.Status.Conditions, v1.PodCondition{Type: v1.PodReady, Status: v1.ConditionFalse})
podRefInCluster.Status.Phase = v1.PodSucceeded
podRefInCluster.Status.Reason = "Completed"
}

} else {
if podInit {
podRefInCluster.Status.Phase = v1.PodPending
podRefInCluster.Status.Reason = "Init"
}
if podWaitingForInitContainers {
podRefInCluster.Status.Phase = v1.PodPending
podRefInCluster.Status.Reason = "Waiting for init containers"
}
if podRunning && podRefInCluster.Status.Phase != v1.PodRunning { // do not update the status if it is already running
podRefInCluster.Status.Phase = v1.PodRunning
podRefInCluster.Status.Conditions = append(podRefInCluster.Status.Conditions, v1.PodCondition{Type: v1.PodReady, Status: v1.ConditionTrue})
}
}
} else {

// if you don't know the UID yet, collect the status and update the status cache
list, err := p.clientSet.CoreV1().Pods(podRemoteStatus.PodNamespace).List(ctx, metav1.ListOptions{})
if err != nil {
log.G(ctx).Error(err)
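
The net effect of the new counters is that the phase is derived from all container states at once: Succeeded only when every container terminated cleanly, Failed as soon as any exit code is non-zero, so a later successful container can no longer overwrite a failure. Condensed into a standalone helper for clarity (a restatement of the logic above, not code from this commit):

package virtualkubelet

import v1 "k8s.io/api/core/v1"

// decidePhase restates the counter logic: Failed beats Succeeded, and
// Succeeded requires every container to have terminated with exit code 0.
func decidePhase(statuses []v1.ContainerStatus) v1.PodPhase {
	if len(statuses) == 0 {
		return v1.PodPending
	}
	terminated, errored := 0, false
	for _, ct := range statuses {
		if ct.State.Terminated != nil {
			terminated++
			if ct.State.Terminated.ExitCode != 0 {
				errored = true
			}
		}
	}
	if terminated == len(statuses) {
		if errored {
			return v1.PodFailed
		}
		return v1.PodSucceeded
	}
	return v1.PodRunning
}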
