Skip to content

Commit 0fc8209

Browse files
committed
Adding watch to see if components recovered
1 parent c033aa4 commit 0fc8209

16 files changed

+322
-36
lines changed

config/config.yaml

+9-5
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
kraken:
22
kubeconfig_path: /root/.kube/config # Path to kubeconfig
3+
exit_on_failure: False # Exit when a post action scenario fails
34
scenarios: # List of policies/chaos scenarios to load
4-
- scenarios/etcd.yml
5-
- scenarios/openshift-kube-apiserver.yml
6-
- scenarios/openshift-apiserver.yml
7-
- scenarios/regex_openshift_pod_kill.yml
8-
5+
- - scenarios/etcd.yml
6+
- scenarios/post_action_etcd_example.sh
7+
- - scenarios/openshift-kube-apiserver.yml
8+
- scenarios/post_action_openshift-kube-apiserver.yml
9+
- - scenarios/openshift-apiserver.yml
10+
- scenarios/post_action_openshift-apiserver.yml
11+
- - scenarios/regex_openshift_pod_kill.yml
12+
- scenarios/post_action_regex.py
913
cerberus:
1014
cerberus_enabled: False # Enable it when cerberus is previously installed
1115
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal

kraken/invoke/command.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
# Invokes a given command and returns the stdout
def invoke(command):
    """Run *command* through the shell and return its captured output.

    stderr is redirected into stdout, so the returned text carries both
    streams.

    Args:
        command: Shell command line to execute.

    Returns:
        The captured output as text, or an empty string when the process
        could not be started (the failure is logged).
    """
    # Pre-set the result so a failure to spawn the process cannot leave it
    # unbound at the return statement below.
    out = ""
    try:
        process = subprocess.Popen(command, shell=True,
                                   universal_newlines=True, stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        # stderr is merged into stdout above, so the second item is unused.
        out, _ = process.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (command, e))
    return out

kraken/kubernetes/client.py

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import kraken.invoke.command as runcommand
55
import json
66

7+
78
kraken_node_name = ""
89

910

run_kraken.py

+117-22
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,109 @@
1212
import pyfiglet
1313

1414

15+
# Get cerberus status
def cerberus_integration(config):
    """Ask Cerberus for the cluster go/no-go signal when it is enabled.

    Args:
        config: Parsed kraken configuration. Reads
            config["cerberus"]["cerberus_enabled"] and, when that is truthy,
            config["cerberus"]["cerberus_url"].

    Returns:
        True when Cerberus is disabled or when it answered with a go signal.

    Raises:
        SystemExit: With status 1 when Cerberus is enabled but no url is
            configured, or when Cerberus answers with a no-go signal.
    """
    cerberus_status = True
    if config["cerberus"]["cerberus_enabled"]:
        cerberus_url = config["cerberus"]["cerberus_url"]
        if not cerberus_url:
            logging.error("url where Cerberus publishes True/False signal is not provided.")
            sys.exit(1)
        # Only a response body of exactly b'True' counts as a go signal.
        cerberus_status = requests.get(cerberus_url).content == b'True'
        if not cerberus_status:
            logging.error("Received a no-go signal from Cerberus, looks like "
                          "the cluster is unhealthy. Please check the Cerberus "
                          "report for more details. Test failed.")
            sys.exit(1)
        else:
            logging.info("Received a go signal from Cerberus, the cluster is healthy. "
                         "Test passed.")
    return cerberus_status
34+
35+
36+
# Function to publish kraken status to cerberus
def publish_kraken_status(config, failed_post_scenarios):
    """Report post action scenarios that are still failing.

    The Cerberus status is fetched first via cerberus_integration(). When
    failed_post_scenarios is non-empty, a message naming the Cerberus health
    is logged; if config['kraken']['exit_on_failure'] is truthy the process
    then exits with status 1.

    Args:
        config: Parsed kraken configuration.
        failed_post_scenarios: Collection of post action scenarios that have
            not recovered yet; only its truthiness is inspected here.
    """
    cluster_healthy = bool(cerberus_integration(config))

    # Nothing to report once every post action scenario has recovered.
    if not failed_post_scenarios:
        return

    if config['kraken']['exit_on_failure']:
        if cluster_healthy:
            logging.info("Cerberus status is healthy but post action scenarios "
                         "are still failing, exiting kraken run")
        else:
            logging.info("Cerberus status is not healthy and post action scenarios "
                         "are still failing, exiting kraken run")
        sys.exit(1)

    if cluster_healthy:
        logging.info("Cerberus status is healthy but post action scenarios "
                     "are still failing")
    else:
        logging.info("Cerberus status is not healthy and post action scenarios "
                     "are still failing")
58+
59+
60+
def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
    """Run a pre/post action check and report whether it passed.

    The kind of check is chosen from the file extension of *scenario*:
    ".yaml"/".yml" files are handed to powerfulseal as a policy file, ".py"
    files are run with python3, and anything else is invoked directly as a
    command.

    Args:
        kubeconfig_path: Path to the kubeconfig passed to powerfulseal.
        scenario: Path of the check to run.
        pre_action_output: Output captured by the same check before chaos was
            injected. Left empty for the pre action run itself.

    Returns:
        False when the check failed: the powerfulseal output contains
        "ERROR", or a script's output differs from a non-empty
        pre_action_output. Otherwise the output of the check is returned.
    """
    if scenario.endswith((".yaml", ".yml")):
        action_output = runcommand.invoke("powerfulseal autonomous "
                                          "--use-pod-delete-instead-of-ssh-kill"
                                          " --policy-file %s --kubeconfig %s --no-cloud"
                                          " --inventory-kubernetes --headless"
                                          % (scenario, kubeconfig_path))
        # read output to make sure no error
        if "ERROR" in action_output:
            # Remainder of the line following the first ERROR marker.
            error_detail = action_output.split("ERROR")[1].split('\n')[0]
            if not pre_action_output:
                logging.info("Powerful seal pre action check failed for " + str(scenario))
            else:
                logging.info("Powerful seal post action check failed for " + str(scenario))
            logging.info("Powerful seal reported: ERROR" + error_detail)
            return False
        logging.info(scenario + " post action checks passed")
        return action_output

    if scenario.endswith(".py"):
        command = "python3 " + scenario
    else:
        # invoke custom bash script
        command = scenario
    action_output = runcommand.invoke(command).strip()

    # Without a pre action output there is nothing to compare against, so the
    # captured output is simply handed back to the caller.
    if pre_action_output:
        if pre_action_output != action_output:
            logging.info(scenario + ' post action response did not match pre check output')
            return False
        logging.info(scenario + " post action checks passed")

    return action_output
96+
97+
98+
# Perform the post scenario actions to see if components recovered
def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):
    """Re-check earlier failures, then run the post action of *scenario*.

    Args:
        kubeconfig_path: Path to the kubeconfig forwarded to run_post_action.
        scenario: Sequence whose second item, when present, is the post
            action check belonging to the scenario that was just injected.
        failed_post_scenarios: List of [check, pre_action_output] pairs that
            failed earlier. It is updated in place.
        pre_action_output: Output of the check captured before the chaos
            scenario was injected.

    Returns:
        The updated failed_post_scenarios list (the same list object).
    """
    # Collect the survivors in a fresh list: removing items from the list
    # being iterated would skip the entry following each removal.
    still_failing = []
    for failed_scenario in failed_post_scenarios:
        post_action_output = run_post_action(kubeconfig_path,
                                             failed_scenario[0], failed_scenario[1])
        if post_action_output is False:
            logging.info('Post action scenario ' + str(failed_scenario) + " is still failing")
            still_failing.append(failed_scenario)
    # Slice assignment keeps the caller's list object up to date.
    failed_post_scenarios[:] = still_failing

    # check post actions
    if len(scenario) > 1:
        post_action_output = run_post_action(kubeconfig_path, scenario[1], pre_action_output)
        if post_action_output is False:
            failed_post_scenarios.append([scenario[1], pre_action_output])

    return failed_post_scenarios
116+
117+
15118
# Main function
16119
def main(cfg):
17120
# Start kraken
@@ -24,7 +127,6 @@ def main(cfg):
24127
config = yaml.full_load(f)
25128
kubeconfig_path = config["kraken"]["kubeconfig_path"]
26129
scenarios = config["kraken"]["scenarios"]
27-
cerberus_enabled = config["cerberus"]["cerberus_enabled"]
28130
wait_duration = config["tunings"]["wait_duration"]
29131
iterations = config["tunings"]["iterations"]
30132
daemon_mode = config["tunings"]['daemon_mode']
@@ -59,41 +161,34 @@ def main(cfg):
59161
% str(iterations))
60162
iterations = int(iterations)
61163

164+
failed_post_scenarios = []
62165
# Loop to run the chaos starts here
63166
while (int(iteration) < iterations):
64167
# Inject chaos scenarios specified in the config
168+
logging.info("Executing scenarios for iteration " + str(iteration))
65169
try:
66170
# Loop to run the scenarios starts here
67171
for scenario in scenarios:
68-
logging.info("Injecting scenario: %s" % (scenario))
172+
pre_action_output = run_post_action(kubeconfig_path, scenario[1])
69173
runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
70174
" --policy-file %s --kubeconfig %s --no-cloud"
71175
" --inventory-kubernetes --headless"
72-
% (scenario, kubeconfig_path))
73-
logging.info("Scenario: %s has been successfully injected!" % (scenario))
74-
75-
if cerberus_enabled:
76-
cerberus_url = config["cerberus"]["cerberus_url"]
77-
if not cerberus_url:
78-
logging.error("url where Cerberus publishes True/False signal "
79-
"is not provided.")
80-
sys.exit(1)
81-
cerberus_status = requests.get(cerberus_url).content
82-
cerberus_status = True if cerberus_status == b'True' else False
83-
if not cerberus_status:
84-
logging.error("Received a no-go signal from Cerberus, looks like the"
85-
" cluster is unhealthy. Please check the Cerberus report"
86-
" for more details. Test failed.")
87-
sys.exit(1)
88-
else:
89-
logging.info("Received a go signal from Ceberus, the cluster is "
90-
"healthy. Test passed.")
176+
% (scenario[0], kubeconfig_path))
177+
178+
logging.info("Scenario: %s has been successfully injected!" % (scenario[0]))
91179
logging.info("Waiting for the specified duration: %s" % (wait_duration))
92180
time.sleep(wait_duration)
181+
failed_post_scenarios = post_actions(kubeconfig_path, scenario,
182+
failed_post_scenarios, pre_action_output)
183+
publish_kraken_status(config, failed_post_scenarios)
93184
except Exception as e:
94185
logging.error("Failed to run scenario: %s. Encountered the following exception: %s"
95-
% (scenario, e))
186+
% (scenario[0], e))
96187
iteration += 1
188+
logging.info("")
189+
if failed_post_scenarios:
190+
logging.error("Post scenarios are still failing at the end of all iterations")
191+
sys.exit(1)
97192
else:
98193
logging.error("Cannot find a config at %s, please check" % (cfg))
99194
sys.exit(1)

scenarios/etcd.yml

100644100755
-3
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,9 @@ scenarios:
1111
- labels:
1212
namespace: "openshift-etcd"
1313
selector: "k8s-app=etcd"
14-
1514
filters:
1615
- randomSample:
1716
size: 1
18-
19-
# The actions will be executed in the order specified
2017
actions:
2118
- kill:
2219
probability: 1

scenarios/openshift-apiserver.yml

100644100755
File mode changed.

scenarios/openshift-kube-apiserver.yml

100644100755
-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ scenarios:
1111
- labels:
1212
namespace: "openshift-kube-apiserver"
1313
selector: "app=openshift-kube-apiserver"
14-
1514
filters:
1615
- randomSample:
1716
size: 1

scenarios/post_action_etcd.yml

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
config:
2+
runStrategy:
3+
runs: 1
4+
maxSecondsBetweenRuns: 10
5+
minSecondsBetweenRuns: 1
6+
scenarios:
7+
- name: "check 3 pods are in namespace with selector: etcd"
8+
steps:
9+
- podAction:
10+
matches:
11+
- labels:
12+
namespace: "openshift-etcd"
13+
selector: "k8s-app=etcd"
14+
filters:
15+
- property:
16+
name: "state"
17+
value: "Running"
18+
# The actions will be executed in the order specified
19+
actions:
20+
- checkPodCount:
21+
count: 3

scenarios/post_action_etcd_example.sh

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# Print the number of lines in the openshift-etcd pod listing that contain
# "Running" (grep -c emits the count only).
running_pods=$(oc get pods -n openshift-etcd | grep -c Running)
printf '%s\n' "$running_pods"
+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env python3
2+
import subprocess
3+
import logging
4+
5+
6+
def run(cmd):
    """Run *cmd* through the shell and return its captured output.

    stderr is redirected into stdout, so the returned text carries both
    streams.

    Args:
        cmd: Shell command line to execute.

    Returns:
        The captured output as text, or an empty string when the process
        could not be started (the failure is logged).
    """
    # Pre-set the result so a failure to spawn the process cannot leave it
    # unbound at the return statement below.
    out = ""
    try:
        process = subprocess.Popen(cmd, shell=True,
                                   universal_newlines=True, stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        # stderr is merged into stdout above, so the second item is unused.
        out, _ = process.communicate()
        logging.info("out " + str(out))
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
    return out
16+
17+
18+
# grep -c prints only the number of matching lines; strip the trailing newline
# so the value can be compared as plain text.
pods_running = run("oc get pods -n openshift-etcd | grep -c Running").rstrip()

if pods_running != "3":
    print("ERROR there were %s pods running instead of 3" % pods_running)
else:
    print("There were 3 pods running properly")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
config:
2+
runStrategy:
3+
runs: 1
4+
maxSecondsBetweenRuns: 30
5+
minSecondsBetweenRuns: 1
6+
scenarios:
7+
- name: "check 3 pods are in namespace with selector: openshift-apiserver"
8+
steps:
9+
- podAction:
10+
matches:
11+
- labels:
12+
namespace: "openshift-apiserver"
13+
selector: "app=openshift-apiserver"
14+
15+
filters:
16+
- property:
17+
name: "state"
18+
value: "Running"
19+
20+
# The actions will be executed in the order specified
21+
actions:
22+
- checkPodCount:
23+
count: 3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
config:
2+
runStrategy:
3+
runs: 1
4+
maxSecondsBetweenRuns: 30
5+
minSecondsBetweenRuns: 1
6+
scenarios:
7+
- name: "check 3 pods are in namespace with selector: openshift-kube-apiserver"
8+
steps:
9+
- podAction:
10+
matches:
11+
- labels:
12+
namespace: "openshift-kube-apiserver"
13+
selector: "app=openshift-kube-apiserver"
14+
filters:
15+
- property:
16+
name: "state"
17+
value: "Running"
18+
# The actions will be executed in the order specified
19+
actions:
20+
- checkPodCount:
21+
count: 3

0 commit comments

Comments
 (0)