From 87d9c5935c8e148917c57f9a484a1919ef8d7570 Mon Sep 17 00:00:00 2001
From: Bianco95
Date: Mon, 13 Jan 2025 08:23:35 +0100
Subject: [PATCH] Prevent successful containers from overwriting failed
 containers.

- Added logic to ensure that successful containers no longer overwrite failed
  containers, so a pod keeps its failed status.
- Improved handling of init containers:
  - If a pod has init containers, the pod status is now derived from their
    completion first.
  - On the plugin side, init containers must run sequentially and complete
    before the main containers start.
---
 pkg/interlink/api/status.go   |   2 -
 pkg/interlink/types.go        |  11 +--
 pkg/virtualkubelet/execute.go | 160 ++++++++++++++++++++++++++--------
 3 files changed, 130 insertions(+), 43 deletions(-)

diff --git a/pkg/interlink/api/status.go b/pkg/interlink/api/status.go
index 4eeaa0fe..1c3f8a79 100644
--- a/pkg/interlink/api/status.go
+++ b/pkg/interlink/api/status.go
@@ -80,7 +80,6 @@ func (h *InterLinkHandler) StatusHandler(w http.ResponseWriter, r *http.Request)
     log.G(h.Ctx).Info("InterLink: forwarding GetStatus call to sidecar")
     req.Header.Set("Content-Type", "application/json")
 
-    log.G(h.Ctx).Debug("Interlink get status request content:", req)
     sessionContext := GetSessionContext(r)
     bodyBytes, err = ReqWithError(h.Ctx, req, w, start, span, false, true, sessionContext, h.ClientHTTP)
@@ -127,7 +126,6 @@
         log.G(h.Ctx).Error(err)
         return
     }
-    log.G(h.Ctx).Debug("InterLink: status " + string(returnValue))
 
     w.WriteHeader(statusCode)
     _, err = w.Write(returnValue)
diff --git a/pkg/interlink/types.go b/pkg/interlink/types.go
index 6ba742f4..70981687 100644
--- a/pkg/interlink/types.go
+++ b/pkg/interlink/types.go
@@ -15,11 +15,12 @@ type PodCreateRequests struct {
 
 // PodStatus is a simplified v1.Pod struct, holding only necessary variables to uniquely identify a job/service in the sidecar. It is used to request
 type PodStatus struct {
-    PodName      string               `json:"name"`
-    PodUID       string               `json:"UID"`
-    PodNamespace string               `json:"namespace"`
-    JobID        string               `json:"JID"`
-    Containers   []v1.ContainerStatus `json:"containers"`
+    PodName        string               `json:"name"`
+    PodUID         string               `json:"UID"`
+    PodNamespace   string               `json:"namespace"`
+    JobID          string               `json:"JID"`
+    Containers     []v1.ContainerStatus `json:"containers"`
+    InitContainers []v1.ContainerStatus `json:"initContainers"`
 }
 
 // CreateStruct is the response to be received from interLink whenever asked to create a pod. It will allow for mapping remote ID with the pod UUID
diff --git a/pkg/virtualkubelet/execute.go b/pkg/virtualkubelet/execute.go
index b5de887c..b6b38c97 100644
--- a/pkg/virtualkubelet/execute.go
+++ b/pkg/virtualkubelet/execute.go
@@ -387,6 +387,11 @@ func LogRetrieval(ctx context.Context, config Config, logsRequest types.LogStruc
     AddSessionContext(req, sessionContext)
 
     logTransport := http.DefaultTransport.(*http.Transport).Clone()
+
+    // logTransport.TLSClientConfig = &tls.Config{
+    //     InsecureSkipVerify: true,
+    // }
+
     // logTransport.DisableKeepAlives = true
     // logTransport.MaxIdleConnsPerHost = -1
     var logHTTPClient = &http.Client{Transport: logTransport}
@@ -539,8 +544,6 @@ func RemoteExecution(ctx context.Context, config Config, p *Provider, pod *v1.Po
 // After the statusRequest returns a response, this function uses that response to update every Pod and Container status.
 func checkPodsStatus(ctx context.Context, p *Provider, podsList []*v1.Pod, token string, config Config) ([]types.PodStatus, error) {
     var ret []types.PodStatus
-    // commented out because it's too verbose. uncomment to see all registered pods
-    // log.G(ctx).Debug(p.pods)
 
     // retrieve pod status from remote interlink
     returnVal, err := statusRequest(ctx, config, podsList, token)
@@ -574,12 +577,71 @@ func checkPodsStatus(ctx context.Context, p *Provider, podsList []*v1.Pod, token
 
             // if the PodUID match with the one in etcd we are talking of the same thing. GOOD
             if podRemoteStatus.PodUID == string(podRefInCluster.UID) {
-                podRunning := false
-                podErrored := false
-                podCompleted := false
+                podInit := false                     // if an init container is running, the other containers' phase is PodInitializing
+                podRunning := false                  // if a normal container is running, the phase is PodRunning
+                podErrored := false                  // if a container is in error, the phase is PodFailed
+                podCompleted := false                // if all containers are terminated, the phase is PodSucceeded, but if one is in error, the phase is PodFailed
+                podWaitingForInitContainers := false // if init containers are waiting, the phase is PodPending
                 failedReason := ""
 
-                // For each container of the pod we check if there is a previous state known by K8s
+                nContainersInPod := len(podRemoteStatus.Containers)
+                counterOfTerminatedContainers := 0
+
+                nInitContainersInPod := len(podRemoteStatus.InitContainers)
+                counterOfTerminatedInitContainers := 0
+
+                log.G(ctx).Debug("Number of containers in POD: " + strconv.Itoa(nContainersInPod))
+                log.G(ctx).Debug("Number of init containers in POD: " + strconv.Itoa(nInitContainersInPod))
+
+                // if there are init containers, we need to check them first
+                if nInitContainersInPod > 0 {
+
+                    log.G(ctx).Debug("Init containers detected, going to check them first")
+
+                    for _, containerRemoteStatus := range podRemoteStatus.InitContainers {
+                        index := 0
+                        foundCt := false
+
+                        for i, checkedContainer := range podRefInCluster.Status.InitContainerStatuses {
+                            if checkedContainer.Name == containerRemoteStatus.Name {
+                                foundCt = true
+                                index = i
+                            }
+                        }
+
+                        if !foundCt {
+                            podRefInCluster.Status.InitContainerStatuses = append(podRefInCluster.Status.InitContainerStatuses, containerRemoteStatus)
+                        } else {
+                            podRefInCluster.Status.InitContainerStatuses[index] = containerRemoteStatus
+                        }
+
+                        switch {
+                        case containerRemoteStatus.State.Terminated != nil:
+                            counterOfTerminatedInitContainers = counterOfTerminatedInitContainers + 1
+                            podRefInCluster.Status.InitContainerStatuses[index].State.Terminated.ExitCode = containerRemoteStatus.State.Terminated.ExitCode
+                            podRefInCluster.Status.InitContainerStatuses[index].State.Terminated.Reason = "Completed"
+                            if containerRemoteStatus.State.Terminated.ExitCode != 0 {
+                                podErrored = true
+                                failedReason = "Error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode))
+                                podRefInCluster.Status.InitContainerStatuses[index].State.Terminated.Reason = failedReason
+                                log.G(ctx).Error("Container " + containerRemoteStatus.Name + " exited with error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode)))
+                            }
+                        case containerRemoteStatus.State.Waiting != nil:
+                            log.G(ctx).Info("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is setting up on Sidecar")
+                            podWaitingForInitContainers = true
+                            podRefInCluster.Status.InitContainerStatuses[index].State.Waiting = containerRemoteStatus.State.Waiting
+                        case containerRemoteStatus.State.Running != nil:
+                            podInit = true
log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is running on Sidecar") + podRefInCluster.Status.InitContainerStatuses[index].State.Running = containerRemoteStatus.State.Running + podRefInCluster.Status.InitContainerStatuses[index].State.Waiting = nil + } + } + if counterOfTerminatedInitContainers == nInitContainersInPod { + podWaitingForInitContainers = false + } + } + for _, containerRemoteStatus := range podRemoteStatus.Containers { index := 0 foundCt := false @@ -598,47 +660,73 @@ func checkPodsStatus(ctx context.Context, p *Provider, podsList []*v1.Pod, token podRefInCluster.Status.ContainerStatuses[index] = containerRemoteStatus } - log.G(ctx).Debug(containerRemoteStatus.State.Running) - - // if plugin cannot return any non-terminated container set the status to terminated - // if the exit code is != 0 get the error and set error reason + rememeber to set pod to failed - switch { - case containerRemoteStatus.State.Terminated != nil: - log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is not running on Plugin side") - podCompleted = true - podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = "Completed" - if containerRemoteStatus.State.Terminated.ExitCode != 0 { - podErrored = true - failedReason = "Error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode)) - podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = failedReason - log.G(ctx).Error("Container " + containerRemoteStatus.Name + " exited with error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode))) + // if the pod is waiting for the starting of the init containers or some of them are still running + // all the other containers are in waiting state + if podWaitingForInitContainers || podInit { + podRefInCluster.Status.ContainerStatuses[index].State.Waiting = &v1.ContainerStateWaiting{Reason: "Waiting for init containers"} + podRefInCluster.Status.ContainerStatuses[index].State.Running = nil + podRefInCluster.Status.ContainerStatuses[index].State.Terminated = nil + if podInit { + podRefInCluster.Status.ContainerStatuses[index].State.Waiting.Reason = "Init:" + strconv.Itoa(counterOfTerminatedInitContainers) + "/" + strconv.Itoa(nInitContainersInPod) + } else { + podRefInCluster.Status.ContainerStatuses[index].State.Waiting.Reason = "PodInitializing" + } + } else { + // if plugin cannot return any non-terminated container set the status to terminated + // if the exit code is != 0 get the error and set error reason + rememeber to set pod to failed + switch { + case containerRemoteStatus.State.Terminated != nil: + log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is not running on Plugin side") + counterOfTerminatedContainers = counterOfTerminatedContainers + 1 + podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = "Completed" + if containerRemoteStatus.State.Terminated.ExitCode != 0 { + podErrored = true + failedReason = "Error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode)) + podRefInCluster.Status.ContainerStatuses[index].State.Terminated.Reason = failedReason + log.G(ctx).Error("Container " + containerRemoteStatus.Name + " exited with error: " + strconv.Itoa(int(containerRemoteStatus.State.Terminated.ExitCode))) + } + case containerRemoteStatus.State.Waiting != nil: + log.G(ctx).Info("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + 
" is setting up on Sidecar") + podRunning = true + case containerRemoteStatus.State.Running != nil: + podRunning = true + log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is running on Sidecar") } - case containerRemoteStatus.State.Waiting != nil: - log.G(ctx).Info("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is setting up on Sidecar") - podRunning = true - case containerRemoteStatus.State.Running != nil: - podRunning = true - log.G(ctx).Debug("Pod " + podRemoteStatus.PodName + ": Service " + containerRemoteStatus.Name + " is running on Sidecar") } + } + if counterOfTerminatedContainers == nContainersInPod { + podCompleted = true + } - // if this is the first time you see a container running/errored/completed, update the status of the pod. - switch { - case podRunning && podRefInCluster.Status.Phase != v1.PodRunning: - podRefInCluster.Status.Phase = v1.PodRunning - podRefInCluster.Status.Conditions = append(podRefInCluster.Status.Conditions, v1.PodCondition{Type: v1.PodReady, Status: v1.ConditionTrue}) - case podErrored && podRefInCluster.Status.Phase != v1.PodFailed: + if podCompleted { + // it means that all containers are terminated, check if some of them are errored + if podErrored { podRefInCluster.Status.Phase = v1.PodFailed podRefInCluster.Status.Reason = failedReason - case podCompleted && podRefInCluster.Status.Phase != v1.PodSucceeded: + // override all the ContainerStatuses to set Reason to failedReason + for i := range podRefInCluster.Status.ContainerStatuses { + podRefInCluster.Status.ContainerStatuses[i].State.Terminated.Reason = failedReason + } + } else { podRefInCluster.Status.Conditions = append(podRefInCluster.Status.Conditions, v1.PodCondition{Type: v1.PodReady, Status: v1.ConditionFalse}) podRefInCluster.Status.Phase = v1.PodSucceeded podRefInCluster.Status.Reason = "Completed" } - + } else { + if podInit { + podRefInCluster.Status.Phase = v1.PodPending + podRefInCluster.Status.Reason = "Init" + } + if podWaitingForInitContainers { + podRefInCluster.Status.Phase = v1.PodPending + podRefInCluster.Status.Reason = "Waiting for init containers" + } + if podRunning && podRefInCluster.Status.Phase != v1.PodRunning { // do not update the status if it is already running + podRefInCluster.Status.Phase = v1.PodRunning + podRefInCluster.Status.Conditions = append(podRefInCluster.Status.Conditions, v1.PodCondition{Type: v1.PodReady, Status: v1.ConditionTrue}) + } } } else { - - // if you don't now any UID yet, collect the status and updated the status cache list, err := p.clientSet.CoreV1().Pods(podRemoteStatus.PodNamespace).List(ctx, metav1.ListOptions{}) if err != nil { log.G(ctx).Error(err)