Adding watch to see if components recovered

paigerube14 · paigerube14 · commit 0fc82090f2c5 · 2020-08-18T16:26:04.000-04:00
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,11 +1,15 @@
 kraken:
     kubeconfig_path: /root/.kube/config                    # Path to kubeconfig
+    exit_on_failure: False                                 # Exit when a post action scenario fails
     scenarios:                                             # List of policies/chaos scenarios to load
-        -    scenarios/etcd.yml
-        -    scenarios/openshift-kube-apiserver.yml
-        -    scenarios/openshift-apiserver.yml
-        -    scenarios/regex_openshift_pod_kill.yml
-
+        - -    scenarios/etcd.yml
+          -    scenarios/post_action_etcd_example.sh
+        - -    scenarios/openshift-kube-apiserver.yml
+          -    scenarios/post_action_openshift-kube-apiserver.yml
+        - -    scenarios/openshift-apiserver.yml
+          -    scenarios/post_action_openshift-apiserver.yml
+        - -    scenarios/regex_openshift_pod_kill.yml
+          -    scenarios/post_action_regex.py
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed
     cerberus_url:                                          # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal
diff --git a/kraken/invoke/command.py b/kraken/invoke/command.py
@@ -5,8 +5,10 @@
 # Invokes a given command and returns the stdout
 def invoke(command):
+    """Run ``command`` through the shell and return its combined stdout/stderr text.
+
+    Returns an empty string when the command could not be started, so callers
+    that strip or search the output keep working.
+    """
+    # Pre-bind the result: without this, a failure inside the try block left
+    # ``out`` undefined and the final return raised UnboundLocalError.
+    out = ""
     try:
-        output = subprocess.check_output(command, shell=True,
-                                         universal_newlines=True)
-    except Exception:
-        logging.error("Failed to run %s" % (command))
-    return output
+        process = subprocess.Popen(command, shell=True,
+                                   universal_newlines=True, stdout=subprocess.PIPE,
+                                   stderr=subprocess.STDOUT)
+        # stderr is merged into stdout above, so the second element is None.
+        out, _ = process.communicate()
+    except Exception as e:
+        logging.error("Failed to run %s, error: %s" % (command, e))
+    return out
diff --git a/kraken/kubernetes/client.py b/kraken/kubernetes/client.py
@@ -4,6 +4,7 @@
 import kraken.invoke.command as runcommand
 import json
 
+
 kraken_node_name = ""
 
 
diff --git a/run_kraken.py b/run_kraken.py
@@ -12,6 +12,109 @@
 import pyfiglet
 
 
+# Get cerberus status
+def cerberus_integration(config):
+    """Query Cerberus for its go/no-go signal when the integration is enabled.
+
+    Args:
+        config: Parsed kraken configuration; reads ``config["cerberus"]``.
+
+    Returns:
+        True when Cerberus is disabled or reports a go signal. A missing URL
+        or a no-go signal terminates the process with exit code 1 instead.
+    """
+    if not config["cerberus"]["cerberus_enabled"]:
+        # Integration is off: report healthy so callers carry on.
+        return True
+
+    cerberus_url = config["cerberus"]["cerberus_url"]
+    if not cerberus_url:
+        logging.error("url where Cerberus publishes True/False signal is not provided.")
+        sys.exit(1)
+
+    # Only the exact response body b'True' counts as a go signal.
+    cerberus_status = requests.get(cerberus_url).content == b'True'
+    if not cerberus_status:
+        logging.error("Received a no-go signal from Cerberus, looks like "
+                      "the cluster is unhealthy. Please check the Cerberus "
+                      "report for more details. Test failed.")
+        sys.exit(1)
+
+    logging.info("Received a go signal from Cerberus, the cluster is healthy. "
+                 "Test passed.")
+    return cerberus_status
+
+
+# Function to publish kraken status to cerberus
+def publish_kraken_status(config, failed_post_scenarios):
+    """Check Cerberus, then report (or abort on) post action scenarios that still fail.
+
+    Args:
+        config: Parsed kraken configuration. ``kraken.exit_on_failure`` is
+            optional and treated as False when absent.
+        failed_post_scenarios: Post action scenarios that have not recovered yet.
+
+    Exits with code 1 when scenarios are still failing and
+    ``exit_on_failure`` is set; otherwise it only logs.
+    """
+    # Always query Cerberus first; it may terminate the run on its own.
+    cerberus_status = cerberus_integration(config)
+    if not failed_post_scenarios:
+        return
+
+    # Both health outcomes are handled the same way; only the wording differs.
+    health = "healthy but" if cerberus_status else "not healthy and"
+    message = "Cerberus status is %s post action scenarios are still failing" % health
+    # .get(): configs written before this option existed must not raise KeyError.
+    if config['kraken'].get('exit_on_failure', False):
+        logging.info(message + ", exiting kraken run")
+        sys.exit(1)
+    logging.info(message)
+
+
+def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
+    """Run a post action check and report whether it passed.
+
+    Args:
+        kubeconfig_path: Kubeconfig handed to powerfulseal for YAML policies.
+        scenario: Path of the check. ``.yaml``/``.yml`` files run as powerfulseal
+            policies, ``.py`` files run with ``python3``, and anything else is
+            invoked directly as a command.
+        pre_action_output: Output captured from the same check before chaos was
+            injected. When non-empty, script output must match it exactly.
+
+    Returns:
+        False when the check failed, otherwise the output of the check.
+    """
+    if scenario.endswith((".yaml", ".yml")):
+        action_output = runcommand.invoke("powerfulseal autonomous "
+                                          "--use-pod-delete-instead-of-ssh-kill"
+                                          " --policy-file %s --kubeconfig %s --no-cloud"
+                                          " --inventory-kubernetes --headless"
+                                          % (scenario, kubeconfig_path))
+        # Any "ERROR" in the powerfulseal output counts as a failed check.
+        if "ERROR" in action_output:
+            # Text after the first "ERROR" up to the end of that line. This was
+            # previously computed and discarded, so the cause never reached the log.
+            error_detail = action_output.split("ERROR")[1].split('\n')[0]
+            if not pre_action_output:
+                logging.info("Powerful seal pre action check failed for " + str(scenario))
+            logging.error("%s reported: ERROR%s" % (scenario, error_detail))
+            return False
+        logging.info(scenario + " post action checks passed")
+        return action_output
+
+    # Python and shell checks differ only in how they are launched.
+    command = "python3 " + scenario if scenario.endswith(".py") else scenario
+    action_output = runcommand.invoke(command).strip()
+    # Without a baseline there is nothing to compare: hand back the output as is.
+    if pre_action_output and pre_action_output != action_output:
+        logging.info(scenario + ' post action response did not match pre check output')
+        return False
+    if pre_action_output:
+        logging.info(scenario + " post action checks passed")
+    return action_output
+
+
+# Perform the post scenario actions to see if components recovered
+def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
+    """Re-check earlier failures, then run the current scenario's post action.
+
+    Args:
+        kubeconfig_path: Kubeconfig forwarded to ``run_post_action``.
+        scenario: Sequence whose second element, when present, is the post
+            action to run for the scenario that was just injected.
+        failed_post_scenarios: List of ``[post_action, pre_action_output]``
+            pairs that failed earlier; updated in place.
+        pre_action_output: Output of the post action captured before injection.
+
+    Returns:
+        The same ``failed_post_scenarios`` list, holding only the checks that
+        are failing after this pass.
+    """
+    # Collect survivors separately: removing entries from the list while
+    # iterating over it skipped the element following each removal.
+    still_failing = []
+    for failed_scenario in failed_post_scenarios:
+        post_action_output = run_post_action(kubeconfig_path,
+                                             failed_scenario[0], failed_scenario[1])
+        if post_action_output is False:
+            logging.info('Post action scenario ' + str(failed_scenario) + " is still failing")
+            still_failing.append(failed_scenario)
+    # Slice-assign so the caller's list object is updated, as before.
+    failed_post_scenarios[:] = still_failing
+
+    # check post actions
+    if len(scenario) > 1:
+        post_action_output = run_post_action(kubeconfig_path, scenario[1], pre_action_output)
+        if post_action_output is False:
+            failed_post_scenarios.append([scenario[1], pre_action_output])
+
+    return failed_post_scenarios
+
+
 # Main function
 def main(cfg):
     # Start kraken
@@ -24,7 +127,6 @@ def main(cfg):
             config = yaml.full_load(f)
         kubeconfig_path = config["kraken"]["kubeconfig_path"]
         scenarios = config["kraken"]["scenarios"]
-        cerberus_enabled = config["cerberus"]["cerberus_enabled"]
         wait_duration = config["tunings"]["wait_duration"]
         iterations = config["tunings"]["iterations"]
         daemon_mode = config["tunings"]['daemon_mode']
@@ -59,41 +161,34 @@ def main(cfg):
                          % str(iterations))
             iterations = int(iterations)
 
+        failed_post_scenarios = []
         # Loop to run the chaos starts here
         while (int(iteration) < iterations):
             # Inject chaos scenarios specified in the config
+            logging.info("Executing scenarios for iteration " + str(iteration))
             try:
                 # Loop to run the scenarios starts here
                 for scenario in scenarios:
-                    logging.info("Injecting scenario: %s" % (scenario))
+                    pre_action_output = run_post_action(kubeconfig_path, scenario[1])
                     runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
                                       " --policy-file %s --kubeconfig %s --no-cloud"
                                       " --inventory-kubernetes --headless"
-                                      % (scenario, kubeconfig_path))
-                    logging.info("Scenario: %s has been successfully injected!" % (scenario))
-
-                    if cerberus_enabled:
-                        cerberus_url = config["cerberus"]["cerberus_url"]
-                        if not cerberus_url:
-                            logging.error("url where Cerberus publishes True/False signal "
-                                          "is not provided.")
-                            sys.exit(1)
-                        cerberus_status = requests.get(cerberus_url).content
-                        cerberus_status = True if cerberus_status == b'True' else False
-                        if not cerberus_status:
-                            logging.error("Received a no-go signal from Cerberus, looks like the"
-                                          " cluster is unhealthy. Please check the Cerberus report"
-                                          " for more details. Test failed.")
-                            sys.exit(1)
-                        else:
-                            logging.info("Received a go signal from Ceberus, the cluster is "
-                                         "healthy. Test passed.")
+                                      % (scenario[0], kubeconfig_path))
+
+                    logging.info("Scenario: %s has been successfully injected!" % (scenario[0]))
                     logging.info("Waiting for the specified duration: %s" % (wait_duration))
                     time.sleep(wait_duration)
+                    failed_post_scenarios = post_actions(kubeconfig_path, scenario,
+                                                         failed_post_scenarios, pre_action_output)
+                    publish_kraken_status(config, failed_post_scenarios)
             except Exception as e:
                 logging.error("Failed to run scenario: %s. Encountered the following exception: %s"
-                              % (scenario, e))
+                              % (scenario[0], e))
             iteration += 1
+            logging.info("")
+        if failed_post_scenarios:
+            logging.error("Post scenarios are still failing at the end of all iterations")
+            sys.exit(1)
     else:
         logging.error("Cannot find a config at %s, please check" % (cfg))
         sys.exit(1)
diff --git a/scenarios/etcd.yml b/scenarios/etcd.yml
@@ -11,12 +11,9 @@ scenarios:
           - labels:
               namespace: "openshift-etcd"
               selector: "k8s-app=etcd"
-
         filters:
           - randomSample:
               size: 1
-
-        # The actions will be executed in the order specified
         actions:
           - kill:
               probability: 1
diff --git a/scenarios/openshift-apiserver.yml b/scenarios/openshift-apiserver.yml
diff --git a/scenarios/openshift-kube-apiserver.yml b/scenarios/openshift-kube-apiserver.yml
@@ -11,7 +11,6 @@ scenarios:
           - labels:
               namespace: "openshift-kube-apiserver"
               selector: "app=openshift-kube-apiserver"
-
         filters:
           - randomSample:
               size: 1
diff --git a/scenarios/post_action_etcd.yml b/scenarios/post_action_etcd.yml
@@ -0,0 +1,21 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 10
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "check 3 pods are in namespace with selector: etcd"
+    steps:
+    - podAction:
+        matches:
+          - labels:
+              namespace: "openshift-etcd"
+              selector: "k8s-app=etcd"
+        filters:
+          - property:
+              name: "state"
+              value: "Running"
+        # The actions will be executed in the order specified
+        actions:
+          - checkPodCount:
+              count: 3
diff --git a/scenarios/post_action_etcd_example.sh b/scenarios/post_action_etcd_example.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Print how many lines of the openshift-etcd pod listing contain "Running".
+pods="$(oc get pods -n openshift-etcd | grep -c Running)"
+echo "$pods"
diff --git a/scenarios/post_action_etcd_example_py.py b/scenarios/post_action_etcd_example_py.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+import subprocess
+import logging
+
+
+def run(cmd):
+    """Run ``cmd`` through the shell and return its combined stdout/stderr text.
+
+    Returns an empty string if the command could not be started.
+    """
+    # Pre-bind so a failure in the try block cannot leave ``out`` undefined
+    # (the final return used to raise UnboundLocalError in that case).
+    out = ""
+    try:
+        process = subprocess.Popen(cmd, shell=True,
+                                   universal_newlines=True, stdout=subprocess.PIPE,
+                                   stderr=subprocess.STDOUT)
+        # stderr is merged into stdout above, so the second element is None.
+        out, _ = process.communicate()
+        logging.info("out " + str(out))
+    except Exception as e:
+        logging.error("Failed to run %s, error: %s" % (cmd, e))
+    return out
+
+
+# Count the lines of the openshift-etcd pod listing that contain "Running".
+pods_running = run("oc get pods -n openshift-etcd | grep -c Running").rstrip()
+
+# The count arrives as text, so compare against the string form of 3.
+if pods_running != "3":
+    print("ERROR there were " + pods_running + " pods running instead of 3")
+else:
+    print("There were 3 pods running properly")
diff --git a/scenarios/post_action_openshift-apiserver.yml b/scenarios/post_action_openshift-apiserver.yml
@@ -0,0 +1,23 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "check 3 pods are in namespace with selector: openshift-apiserver"
+    steps:
+    - podAction:
+        matches:
+          - labels:
+              namespace: "openshift-apiserver"
+              selector: "app=openshift-apiserver"
+
+        filters:
+          - property:
+              name: "state"
+              value: "Running"
+
+        # The actions will be executed in the order specified
+        actions:
+          - checkPodCount:
+              count: 3
diff --git a/scenarios/post_action_openshift-kube-apiserver.yml b/scenarios/post_action_openshift-kube-apiserver.yml
@@ -0,0 +1,21 @@
+config:
+  runStrategy:
+    runs: 1
+    maxSecondsBetweenRuns: 30
+    minSecondsBetweenRuns: 1
+scenarios:
+  - name: "check 3 pods are in namespace with selector: openshift-kube-apiserver"
+    steps:
+    - podAction:
+        matches:
+          - labels:
+              namespace: "openshift-kube-apiserver"
+              selector: "app=openshift-kube-apiserver"
+        filters:
+          - property:
+              name: "state"
+              value: "Running"
+        # The actions will be executed in the order specified
+        actions:
+          - checkPodCount:
+              count: 3
diff --git a/scenarios/post_action_regex.py b/scenarios/post_action_regex.py
diff --git a/scenarios/post_action_regex.sh b/scenarios/post_action_regex.sh
diff --git a/scenarios/post_action_regex_openshift_pod_kill.yml b/scenarios/post_action_regex_openshift_pod_kill.yml
diff --git a/scenarios/regex_openshift_pod_kill.yml b/scenarios/regex_openshift_pod_kill.yml

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+#!/bin/bash`
	`2`	`+pods="$(oc get pods -n openshift-etcd \| grep -c Running)"`
	`3`	`+echo "$pods"`