-
Notifications
You must be signed in to change notification settings - Fork 9
/
check_puppet.rb
146 lines (119 loc) · 4.18 KB
/
check_puppet.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/ruby
# A simple nagios check that should be run as root
# perhaps under the mcollective NRPE plugin and
# can check when the last run was done of puppet.
# It can also check fail counts and skip machines
# that are not enabled
#
# The script will use the puppet last_run_summary.yaml
# file to determine when Puppet last ran, else the age
# of the statefile.
require 'optparse'
require 'yaml'
# Default locations of the Puppet agent files this check reads; each one
# can be overridden from the command line.
summaryfile = "/var/lib/puppet/state/last_run_summary.yaml"
statefile = "/var/lib/puppet/state/state.yaml"
lockfile = "/var/lib/puppet/state/puppetdlock"

# Agent state, filled in from the lock file further down.
enabled, running, lastrun_failed = true, false, false

# Epoch time of the last run and the number of failures it reported.
lastrun = failcount = 0

# Alert thresholds; zero means "not supplied by the user".
warn = crit = 0

# Mode switches set by --only-enabled and --check-failures.
enabled_only = failures = false
# Parse the command line. Each option overrides one of the script's
# default settings; both thresholds are mandatory and must be non-zero.
opt = OptionParser.new

opt.on("--critical [CRIT]", "-c", Integer, "Critical threshold, time or failed resources") do |f|
  crit = f.to_i
end

opt.on("--warn [WARN]", "-w", Integer, "Warning threshold, time or failed resources") do |f|
  warn = f.to_i
end

opt.on("--check-failures", "-f", "Check for failed resources instead of time since run") do
  failures = true
end

opt.on("--only-enabled", "-e", "Only alert if Puppet is enabled") do
  enabled_only = true
end

# The file options take an optional argument, so the block receives nil
# when the switch is given without a value; keep the default in that case.
opt.on("--lock-file [FILE]", "-l", "Location of the lock file, default #{lockfile}") do |f|
  lockfile = f if f
end

opt.on("--state-file [FILE]", "-t", "Location of the state file, default #{statefile}") do |f|
  statefile = f if f
end

opt.on("--summary-file [FILE]", "-s", "Location of the summary file, default #{summaryfile}") do |f|
  summaryfile = f if f
end

# OptionParser raises on unknown switches and on non-integer thresholds.
# Left unhandled, Ruby would exit with status 1, which Nagios interprets
# as WARNING; report UNKNOWN (3) instead.
begin
  opt.parse!
rescue OptionParser::ParseError => e
  puts "UNKNOWN: #{e.message}"
  exit 3
end

# A threshold of zero means the option was missing or given without a value.
if warn == 0 || crit == 0
  puts "Please specify a warning and critical level"
  exit 3
end
# Inspect the agent lock file. The script treats an empty lock file as
# "Puppet disabled" and a non-empty one as "a run is in progress".
# File.exist? is used because File.exists? was removed in Ruby 3.2.
if File.exist?(lockfile)
  if File.zero?(lockfile)
    enabled = false
  else
    running = true
  end
end
# Work out when Puppet last ran and how many failures that run reported.
#
# The state file's mtime is the fallback timestamp; when the run summary
# can be read, its last_run value replaces it and its counts are used.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
lastrun = File.stat(statefile).mtime.to_i if File.exist?(statefile)

if File.exist?(summaryfile)
  begin
    summary = YAML.load_file(summaryfile)

    # Keep the previously determined timestamp when the summary has no
    # last_run entry, so the age calculation never receives nil.
    lastrun = summary["time"]["last_run"] || lastrun

    if summary.include?("events")
      # unless there are failures, the events hash just wont have the
      # failure count, hence the zero defaults
      eventsfail = summary["events"]["failure"] || 0
      resourcesfail = summary["resources"]["failed"] || 0
      failcount = eventsfail + resourcesfail
    else
      # machines that outright failed to run like on missing dependencies
      # are treated as huge failures. The yaml file will be valid but
      # it wont have anything but last_run in it
      failcount = 99
    end
  rescue
    # Best effort: an unreadable or unexpectedly shaped summary counts as
    # zero failures rather than aborting the check.
    failcount = 0
    summary = nil
  end
end
# Seconds elapsed since the most recent run that could be determined.
time_since_last_run = Time.now.to_i - lastrun

# With --only-enabled a disabled agent never alerts, in either check mode.
if enabled_only && !enabled
  puts "OK: Puppet is currently disabled, not alerting. Last run #{time_since_last_run} seconds ago with #{failcount} failures"
  exit 0
end

if failures
  # --check-failures: the thresholds apply to the failure count.
  if failcount >= crit
    puts "CRITICAL: Puppet last ran had #{failcount} failures, expected < #{crit}"
    exit 2
  end

  if failcount >= warn
    puts "WARNING: Puppet last ran had #{failcount} failures, expected < #{warn}"
    exit 1
  end
else
  # Default mode: the thresholds apply to the age of the last run.
  if time_since_last_run >= crit
    puts "CRITICAL: Puppet last ran #{time_since_last_run} seconds ago, expected < #{crit}"
    exit 2
  end

  if time_since_last_run >= warn
    puts "WARNING: Puppet last ran #{time_since_last_run} seconds ago, expected < #{warn}"
    exit 1
  end
end

# Neither threshold was reached: report OK together with the agent state.
if enabled
  puts "OK: Puppet is currently enabled, last run #{time_since_last_run} seconds ago with #{failcount} failures"
else
  puts "OK: Puppet is currently disabled, last run #{time_since_last_run} seconds ago with #{failcount} failures"
end
exit 0