🧹 Add e2e OOM tests
This adds e2e tests for OOM handling of the Scan API and the node scan. Together they cover the two different cases we currently have: a Deployment (Scan API) and a CronJob (node scan).

Fixes #940

Signed-off-by: Christian Zunker <[email protected]>
czunker committed Nov 20, 2023
1 parent 4d7bf13 commit 26c717c
Showing 7 changed files with 243 additions and 38 deletions.
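All of the condition changes below share one detection rule: a container counts as OOM-killed when either its last termination state or its current state reports exit code 137 (SIGKILL from the kernel OOM killer). A minimal standalone sketch of that check, pulled out of the diffs for readability (the helper name is ours, not part of this commit):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// isOOMKilled reports whether a container was terminated with exit code 137,
// either during its previous run (LastTerminationState) or in its current state.
func isOOMKilled(cs corev1.ContainerStatus) bool {
	if cs.LastTerminationState.Terminated != nil && cs.LastTerminationState.Terminated.ExitCode == 137 {
		return true
	}
	return cs.State.Terminated != nil && cs.State.Terminated.ExitCode == 137
}

func main() {
	// Example: the previous run of this container was OOM-killed.
	cs := corev1.ContainerStatus{
		Name: "cnspec",
		LastTerminationState: corev1.ContainerState{
			Terminated: &corev1.ContainerStateTerminated{ExitCode: 137},
		},
	}
	fmt.Println(isOOMKilled(cs)) // true
}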
11 changes: 7 additions & 4 deletions controllers/admission/conditions.go
@@ -23,12 +23,15 @@ func updateAdmissionConditions(config *mondoov1alpha2.MondooAuditConfig, degrade
 	} else if degradedStatus {
 		msg = "Admission controller is unavailable"
 		for _, pod := range pods.Items {
-			for _, status := range pod.Status.ContainerStatuses {
-				if status.LastTerminationState.Terminated != nil && status.LastTerminationState.Terminated.ExitCode == 137 {
-					// TODO: double check container name?
+			for i, containerStatus := range pod.Status.ContainerStatuses {
+				if containerStatus.Name != "webhook" {
+					continue
+				}
+				if (containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137) ||
+					(containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode == 137) {
 					msg = "Admission controller is unavailable due to OOM"
 					affectedPods = append(affectedPods, pod.Name)
-					memoryLimit = pod.Spec.Containers[0].Resources.Limits.Memory().String()
+					memoryLimit = pod.Spec.Containers[i].Resources.Limits.Memory().String()
 					break
 				}
 			}
23 changes: 13 additions & 10 deletions controllers/container_image/conditions.go
@@ -5,6 +5,7 @@ package container_image

 import (
 	"go.mondoo.com/mondoo-operator/api/v1alpha2"
+	"go.mondoo.com/mondoo-operator/pkg/utils/k8s"
 	"go.mondoo.com/mondoo-operator/pkg/utils/mondoo"
 	corev1 "k8s.io/api/core/v1"
 )
@@ -26,16 +27,18 @@ func updateImageScanningConditions(config *v1alpha2.MondooAuditConfig, degradedS
 		status = corev1.ConditionTrue
 	}

-	for _, pod := range pods.Items {
-		for _, containerStatus := range pod.Status.ContainerStatuses {
-			if containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137 {
-				// TODO: double check container name?
-				msg = "Kubernetes Container Image Scanning is unavailable due to OOM"
-				affectedPods = append(affectedPods, pod.Name)
-				memoryLimit = pod.Spec.Containers[0].Resources.Limits.Memory().String()
-				reason = "KubernetesContainerImageScanningUnavailable"
-				status = corev1.ConditionTrue
-			}
+	currentPod := k8s.GetNewestPodFromList(pods)
+	for i, containerStatus := range currentPod.Status.ContainerStatuses {
+		if containerStatus.Name != "mondoo-containers-scan" {
+			continue
+		}
+		if (containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137) ||
+			(containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode == 137) {
+			msg = "Kubernetes Container Image Scanning is unavailable due to OOM"
+			affectedPods = append(affectedPods, currentPod.Name)
+			memoryLimit = currentPod.Spec.Containers[i].Resources.Limits.Memory().String()
+			reason = "KubernetesContainerImageScanningUnavailable"
+			status = corev1.ConditionTrue
+		}
 		}
 	}

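The image and Kubernetes resources scans above now inspect only the newest pod via k8s.GetNewestPodFromList, presumably so that leftover pods from earlier Jobs or ReplicaSets cannot keep an OOM condition alive. A rough sketch of what such a helper could look like (illustrative only; the operator's actual implementation may differ, and this one assumes a non-empty list):

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// newestPod returns the pod with the latest creation timestamp.
// Illustrative stand-in for k8s.GetNewestPodFromList; assumes len(pods.Items) > 0.
func newestPod(pods *corev1.PodList) corev1.Pod {
	newest := pods.Items[0]
	for _, p := range pods.Items[1:] {
		if p.CreationTimestamp.Time.After(newest.CreationTimestamp.Time) {
			newest = p
		}
	}
	return newest
}

func main() {
	older := corev1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:              "scan-old",
		CreationTimestamp: metav1.NewTime(time.Now().Add(-time.Hour)),
	}}
	newer := corev1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:              "scan-new",
		CreationTimestamp: metav1.NewTime(time.Now()),
	}}
	fmt.Println(newestPod(&corev1.PodList{Items: []corev1.Pod{older, newer}}).Name) // scan-new
}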
23 changes: 13 additions & 10 deletions controllers/k8s_scan/conditions.go
@@ -5,6 +5,7 @@ package k8s_scan

 import (
 	"go.mondoo.com/mondoo-operator/api/v1alpha2"
+	"go.mondoo.com/mondoo-operator/pkg/utils/k8s"
 	"go.mondoo.com/mondoo-operator/pkg/utils/mondoo"
 	corev1 "k8s.io/api/core/v1"
 )
@@ -26,16 +27,18 @@ func updateWorkloadsConditions(config *v1alpha2.MondooAuditConfig, degradedStatu
 		status = corev1.ConditionTrue
 	}

-	for _, pod := range pods.Items {
-		for _, containerStatus := range pod.Status.ContainerStatuses {
-			if containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137 {
-				// TODO: double check container name?
-				msg = "Kubernetes Resources Scanning is unavailable due to OOM"
-				affectedPods = append(affectedPods, pod.Name)
-				memoryLimit = pod.Spec.Containers[0].Resources.Limits.Memory().String()
-				reason = "KubernetesResourcesScanningUnavailable"
-				status = corev1.ConditionTrue
-			}
+	currentPod := k8s.GetNewestPodFromList(pods)
+	for i, containerStatus := range currentPod.Status.ContainerStatuses {
+		if containerStatus.Name != "mondoo-k8s-scan" {
+			continue
+		}
+		if (containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137) ||
+			(containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode == 137) {
+			msg = "Kubernetes Resources Scanning is unavailable due to OOM"
+			affectedPods = append(affectedPods, currentPod.Name)
+			memoryLimit = currentPod.Spec.Containers[i].Resources.Limits.Memory().String()
+			reason = "KubernetesResourcesScanningUnavailable"
+			status = corev1.ConditionTrue
+		}
 		}
 	}

30 changes: 20 additions & 10 deletions controllers/nodes/conditions.go
@@ -5,6 +5,7 @@ package nodes

 import (
 	"go.mondoo.com/mondoo-operator/api/v1alpha2"
+	"go.mondoo.com/mondoo-operator/pkg/utils/k8s"
 	"go.mondoo.com/mondoo-operator/pkg/utils/mondoo"

 	corev1 "k8s.io/api/core/v1"
@@ -27,16 +28,25 @@ func updateNodeConditions(config *v1alpha2.MondooAuditConfig, degradedStatus boo
 		status = corev1.ConditionTrue
 	}

-	for _, pod := range pods.Items {
-		for _, containerStatus := range pod.Status.ContainerStatuses {
-			if containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137 {
-				// TODO: double check container name?
-				msg = "Node Scanning is unavailable due to OOM"
-				affectedPods = append(affectedPods, pod.Name)
-				memoryLimit = pod.Spec.Containers[0].Resources.Limits.Memory().String()
-				reason = "NodeScanningUnavailable"
-				status = corev1.ConditionTrue
-			}
+	currentPod := k8s.GetNewestPodFromList(pods)
+	var managerContainer *corev1.Container
+	for _, container := range currentPod.Spec.Containers {
+		if container.Name == "manager" {
+			managerContainer = &container
+			break
+		}
+	}
+	for _, containerStatus := range currentPod.Status.ContainerStatuses {
+		if containerStatus.Name != "cnspec" {
+			continue
+		}
+		if (containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137) ||
+			(containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode == 137) {
+			msg = "Node Scanning is unavailable due to OOM"
+			affectedPods = append(affectedPods, currentPod.Name)
+			memoryLimit = managerContainer.Resources.Limits.Memory().String()
+			reason = "NodeScanningUnavailable"
+			status = corev1.ConditionTrue
 		}
 	}

11 changes: 7 additions & 4 deletions controllers/scanapi/conditions.go
@@ -36,12 +36,15 @@ func updateScanAPIConditions(config *mondoov1alpha2.MondooAuditConfig, degradedS
 	}

 	for _, pod := range pods.Items {
-		for _, status := range pod.Status.ContainerStatuses {
-			if status.LastTerminationState.Terminated != nil && status.LastTerminationState.Terminated.ExitCode == 137 {
-				// TODO: double check container name?
+		for i, containerStatus := range pod.Status.ContainerStatuses {
+			if containerStatus.Name != "cnspec" {
+				continue
+			}
+			if (containerStatus.LastTerminationState.Terminated != nil && containerStatus.LastTerminationState.Terminated.ExitCode == 137) ||
+				(containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode == 137) {
 				msg = "ScanAPI controller is unavailable due to OOM"
 				affectedPods = append(affectedPods, pod.Name)
-				memoryLimit = pod.Spec.Containers[0].Resources.Limits.Memory().String()
+				memoryLimit = pod.Spec.Containers[i].Resources.Limits.Memory().String()
 			}
 		}
 	}
173 changes: 173 additions & 0 deletions tests/integration/audit_config_base_suite.go
@@ -471,6 +471,179 @@ func (s *AuditConfigBaseSuite) testMondooAuditConfigNodes(auditConfig mondoov2.M
 	s.Equal("ACTIVE", status)
 }

+func (s *AuditConfigBaseSuite) testOOMScanAPI(auditConfig mondoov2.MondooAuditConfig) {
+	s.auditConfig = auditConfig
+
+	// Disable container image resolution to be able to run the k8s resources scan CronJob with a local image.
+	cleanup := s.disableContainerImageResolution()
+	defer cleanup()
+
+	zap.S().Info("Create an audit config that enables only workloads scanning.")
+	s.NoErrorf(
+		s.testCluster.K8sHelper.Clientset.Create(s.ctx, &auditConfig),
+		"Failed to create Mondoo audit config.")
+
+	s.Require().True(s.testCluster.K8sHelper.WaitUntilMondooClientSecretExists(s.ctx, s.auditConfig.Namespace), "Mondoo SA not created")
+
+	// Verify scan API deployment and service
+	s.validateScanApiDeployment(auditConfig)
+
+	err := s.testCluster.K8sHelper.CheckForPodInStatus(&auditConfig, "client-k8s-scan")
+	s.NoErrorf(err, "Couldn't find k8s scan pod in Podlist of the MondooAuditConfig Status")
+
+	// Wait for first scan to finish
+	// It might interfere with this test
+	cronJobLabels := k8s_scan.CronJobLabels(auditConfig)
+	s.True(
+		s.testCluster.K8sHelper.WaitUntilCronJobsSuccessful(utils.LabelsToLabelSelector(cronJobLabels), auditConfig.Namespace),
+		"Kubernetes resources scan CronJob did not run successfully.")
+
+	// Verify scan API deployment and service
+	s.validateScanApiDeployment(auditConfig)
+
+	foundMondooAuditConfig, err := s.testCluster.K8sHelper.GetMondooAuditConfigFromCluster(auditConfig.Name, auditConfig.Namespace)
+	s.NoError(err, "Failed to find MondooAuditConfig")
+	foundMondooAuditConfig.Spec.Scanner.Resources.Limits = corev1.ResourceList{
+		corev1.ResourceMemory: resource.MustParse("10Mi"), // this should be low enough to trigger an OOMkilled
+	}
+
+	zap.S().Info("Reducing memory limit to trigger OOM.")
+	s.NoError(s.testCluster.K8sHelper.Clientset.Update(s.ctx, foundMondooAuditConfig))
+
+	// Wait some time for the new RS to come up
+	time.Sleep(10 * time.Second)
+
+	// This will take some time, because:
+	// reconcile needs to happen
+	// a new replicaset should be created
+	// the first Pod tries to start and gets killed
+	// on the 2nd start we should get an OOMkilled status update
+	err = s.testCluster.K8sHelper.CheckForDegradedCondition(&auditConfig, mondoov2.ScanAPIDegraded, corev1.ConditionTrue)
+	s.NoError(err, "Failed to find degraded condition")
+
+	foundMondooAuditConfig, err = s.testCluster.K8sHelper.GetMondooAuditConfigFromCluster(auditConfig.Name, auditConfig.Namespace)
+	s.NoError(err, "Failed to find MondooAuditConfig")
+	s.Contains(foundMondooAuditConfig.Status.Conditions[4].Message, "OOM", "Failed to find OOMKilled message in degraded condition")
+	s.Len(foundMondooAuditConfig.Status.Conditions[4].AffectedPods, 1, "Failed to find only one pod in degraded condition")
+
+	// Give the integration a chance to update
+	time.Sleep(2 * time.Second)
+
+	status, err := s.integration.GetStatus(s.ctx)
+	s.NoError(err, "Failed to get status")
+	s.Equal("ERROR", status)
+
+	foundMondooAuditConfig.Spec.Scanner.Resources.Limits = corev1.ResourceList{
+		corev1.ResourceMemory: resource.MustParse("200Mi"), // this should be enough to get the ScanAPI running again
+	}
+
+	zap.S().Info("Increasing memory limit to get ScanAPI running again.")
+	s.NoError(s.testCluster.K8sHelper.Clientset.Update(s.ctx, foundMondooAuditConfig))
+
+	err = s.testCluster.K8sHelper.CheckForDegradedCondition(&auditConfig, mondoov2.ScanAPIDegraded, corev1.ConditionFalse)
+	s.NoError(err, "Failed to find degraded condition")
+	foundMondooAuditConfig, err = s.testCluster.K8sHelper.GetMondooAuditConfigFromCluster(auditConfig.Name, auditConfig.Namespace)
+	s.NoError(err, "Failed to find MondooAuditConfig")
+	s.NotContains(foundMondooAuditConfig.Status.Conditions[4].Message, "OOM", "Found OOMKilled message in condition")
+	s.Len(foundMondooAuditConfig.Status.Conditions[4].AffectedPods, 0, "Found a pod in condition")
+
+	// Give the integration a chance to update
+	time.Sleep(2 * time.Second)
+
+	status, err = s.integration.GetStatus(s.ctx)
+	s.NoError(err, "Failed to get status")
+	s.Equal("ACTIVE", status)
+}
+
+func (s *AuditConfigBaseSuite) testOOMNodeScan(auditConfig mondoov2.MondooAuditConfig) {
+	s.auditConfig = auditConfig
+
+	auditConfig.Spec.Nodes.Resources.Limits = corev1.ResourceList{
+		corev1.ResourceMemory: resource.MustParse("10Mi"), // this should be low enough to trigger an OOMkilled
+	}
+
+	// Disable container image resolution to be able to run the k8s resources scan CronJob with a local image.
+	cleanup := s.disableContainerImageResolution()
+	defer cleanup()
+
+	zap.S().Info("Create an audit config that enables only nodes scanning.")
+	s.NoErrorf(
+		s.testCluster.K8sHelper.Clientset.Create(s.ctx, &auditConfig),
+		"Failed to create Mondoo audit config.")
+
+	s.Require().True(s.testCluster.K8sHelper.WaitUntilMondooClientSecretExists(s.ctx, s.auditConfig.Namespace), "Mondoo SA not created")
+
+	cronJobs := &batchv1.CronJobList{}
+	cronJobLabels := nodes.CronJobLabels(auditConfig)
+
+	// List only the CronJobs in the namespace of the MondooAuditConfig and only the ones that exactly match our labels.
+	listOpts := &client.ListOptions{Namespace: auditConfig.Namespace, LabelSelector: labels.SelectorFromSet(cronJobLabels)}
+
+	nodeList := &corev1.NodeList{}
+	s.NoError(s.testCluster.K8sHelper.Clientset.List(s.ctx, nodeList))
+
+	// Verify the amount of CronJobs created is equal to the amount of nodes
+	err := s.testCluster.K8sHelper.ExecuteWithRetries(func() (bool, error) {
+		s.NoError(s.testCluster.K8sHelper.Clientset.List(s.ctx, cronJobs, listOpts))
+		if len(nodeList.Items) == len(cronJobs.Items) {
+			return true, nil
+		}
+		return false, nil
+	})
+	s.NoErrorf(
+		err,
+		"The amount of node scanning CronJobs is not equal to the amount of cluster nodes. expected: %d; actual: %d",
+		len(nodeList.Items), len(cronJobs.Items))
+
+	// Wait some time for the CronJob to trigger
+	time.Sleep(30 * time.Second)
+
+	// This will take some time, because:
+	// reconcile needs to happen
+	// a new replicaset should be created
+	// the first Pod tries to start and gets killed
+	// on the 2nd start we should get an OOMkilled status update
+	err = s.testCluster.K8sHelper.CheckForDegradedCondition(&auditConfig, mondoov2.NodeScanningDegraded, corev1.ConditionTrue)
+	s.NoError(err, "Failed to find degraded condition")
+
+	foundMondooAuditConfig, err := s.testCluster.K8sHelper.GetMondooAuditConfigFromCluster(auditConfig.Name, auditConfig.Namespace)
+	s.NoError(err, "Failed to find MondooAuditConfig")
+	s.Contains(foundMondooAuditConfig.Status.Conditions[0].Message, "OOM", "Failed to find OOMKilled message in degraded condition")
+	s.Len(foundMondooAuditConfig.Status.Conditions[0].AffectedPods, 1, "Failed to find only one pod in degraded condition")
+
+	// Give the integration a chance to update
+	time.Sleep(2 * time.Second)
+
+	status, err := s.integration.GetStatus(s.ctx)
+	s.NoError(err, "Failed to get status")
+	s.Equal("ERROR", status)
+
+	foundMondooAuditConfig.Spec.Nodes.Resources.Limits = corev1.ResourceList{
+		corev1.ResourceMemory: resource.MustParse("200Mi"), // this should be enough to get the ScanAPI running again
+	}
+	foundMondooAuditConfig.Spec.Nodes.Schedule = "*/1 * * * *"
+
+	zap.S().Info("Increasing memory limit to get node Scans running again.")
+	s.NoError(s.testCluster.K8sHelper.Clientset.Update(s.ctx, foundMondooAuditConfig))
+
+	// Wait for the next run of the CronJob
+	time.Sleep(30 * time.Second)
+
+	err = s.testCluster.K8sHelper.CheckForDegradedCondition(&auditConfig, mondoov2.NodeScanningDegraded, corev1.ConditionFalse)
+	s.NoError(err, "Failed to find degraded condition")
+	foundMondooAuditConfig, err = s.testCluster.K8sHelper.GetMondooAuditConfigFromCluster(auditConfig.Name, auditConfig.Namespace)
+	s.NoError(err, "Failed to find MondooAuditConfig")
+	s.NotContains(foundMondooAuditConfig.Status.Conditions[0].Message, "OOM", "Found OOMKilled message in condition")
+	s.Len(foundMondooAuditConfig.Status.Conditions[0].AffectedPods, 0, "Found a pod in condition")
+
+	// Give the integration a chance to update
+	time.Sleep(2 * time.Second)
+
+	status, err = s.integration.GetStatus(s.ctx)
+	s.NoError(err, "Failed to get status")
+	s.Equal("ACTIVE", status)
+}
+
 func (s *AuditConfigBaseSuite) testMondooAuditConfigAdmission(auditConfig mondoov2.MondooAuditConfig) {
 	// Disable imageResolution for the webhook image to be runnable.
 	// Otherwise, mondoo-operator will try to resolve the locally-built mondoo-operator container
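Both new tests drive the failure purely through the memory limit on the audit config: 10Mi is assumed to be far below what the scanner needs, 200Mi enough for it to recover. The mechanics reduce to building a corev1.ResourceList with resource.MustParse, as in this small standalone illustration (the concrete values are simply the ones the tests use):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Far below what the scanner needs, so the container gets OOM-killed ...
	tooSmall := corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("10Mi")}
	// ... and enough for it to come back up again.
	enough := corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("200Mi")}

	limits := corev1.ResourceRequirements{Limits: tooSmall}
	fmt.Println(limits.Limits.Memory().String()) // 10Mi

	limits.Limits = enough
	fmt.Println(limits.Limits.Memory().String()) // 200Mi
}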
10 changes: 10 additions & 0 deletions tests/integration/audit_config_test.go
@@ -32,6 +32,16 @@ func (s *AuditConfigSuite) TestReconcile_KubernetesResources() {
 	s.testMondooAuditConfigKubernetesResources(auditConfig)
 }

+func (s *AuditConfigSuite) TestOOMScanAPI() {
+	auditConfig := utils.DefaultAuditConfigMinimal(s.testCluster.Settings.Namespace, true, false, false, false)
+	s.testOOMScanAPI(auditConfig)
+}
+
+func (s *AuditConfigSuite) TestOOMNodeScan() {
+	auditConfig := utils.DefaultAuditConfigMinimal(s.testCluster.Settings.Namespace, false, false, true, false)
+	s.testOOMNodeScan(auditConfig)
+}
+
 func (s *AuditConfigSuite) TestReconcile_Containers() {
 	auditConfig := utils.DefaultAuditConfigMinimal(s.testCluster.Settings.Namespace, false, true, false, false)

