#!/bin/sh
# Nagios plugin to monitor Puppet agent state
#
# Copyright (c) 2011 Alexander Swen
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# Example configuration
#
# Typically this check is placed on a client and runs via nrpe.
# So add this to nrpe.cfg:
#   command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet
# or if you want to specify options (rather than have the script calculate key values and facts) then something like
#   command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w 3600 -c 7200 -s /var/lib/puppet/state/last_run_summary.yaml -d 0
# This should warn when the agent hasn't run for an hour and go critical after two hours
# if you have dont_blame_nrpe=1 set you can choose to
#   command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w $ARG1$ -c $ARG2$ -s $ARG3$ -d $ARG4$
#
# define service {
#   use generic-service
#   service_description Puppet agent
#   check_command check_nrpe!check_puppet_agent
# or
#   check_command check_nrpe!check_puppet_agent!3600!7200
#}
#
# Sudo required.
# The user running this script must be allowed using sudo to run puppet config print, e.g. in /etc/sudoers include the 3 lines
#   User_Alias NAGIOS=nagios
#   Cmnd_Alias PUPPETCHECK=/usr/bin/puppet config print all, \             # puppet 2
#                          /usr/bin/puppet config print, \                 # puppet 3
#                          /usr/bin/puppet config print --section agent    # other puppet version
#   NAGIOS ALL=NOPASSWD:PUPPETCHECK
#
# CHANGELOG:
# 20120126 A.Swen created.
# 20120214 trey85stang Modified, added getopts, usage, defaults.
# 20120220 A.Swen lastrunfile can be overridden.
# 20130717 A.Swen Moved finding lastrunfile to after getopts and made it conditional to param -s.
#          Added option to tell script if puppet agent is started from cron or as a daemon (-d).
#          Switched to use awk to filter values from lastrunfile and set them as params.
#          Updated some comments.
#          Removed bug in search for process (that would previously always find something because grep finds its own process line).
#          "puppet agent --configprint lastrunfile" has to be run as root. As normal user it yields ~/.puppet/var/state.
#          Based on feedback Михайло Масик updated:
#          - Puppet --configprint => puppet agent --configprint (version 3 has new way of printing config).
#          - Added new pattern to search for process.
#          - Added test kill -0 to see if process is still there.
# 20130725 A.Swen Based on feedback Михайло Масик updated a test (removed ! from test).
# 20130725 A.Swen Added sudo to puppet config print pidfile.
# 20131209 Mark Ruys Issue warning when last_run_report.yaml contain errors.
# 20141015 A.Swen Add show disabled status.
# 20141127 KissT Remove requirement to have sudo custom rule.
# 20150917 A.Swen Based on an idea of Daniel Lawrence check for major version to decide how to print config.
#          Based on idea of D.Stirling switched to sh.
#          Find out puppet executable location using which.
#          Based on an idea of D.Stirling updated daemon check.
#          Based on an idea of D.Stirling made BSD compatible.
#          Based on an idea of BTriller fix the getopts command to parse the agent_disabled_lockfile option.
# 20151201 Akomakom Add perf data option.
#          More reliable yaml parsing.
#          If $HOME not set: set it.
#          Fix PS command for Suse.
# 20151218 K.A. Gillow Calculate warn/crit based on runinterval and splay setting rather than use fixed settings.
#          Check system has been up longer than crit/warn time otherwise don't yet trigger normally relevant fault levels.
#          We never generally want puppet disabled so change to warning.
# 20151229 A.Swen Fix bug in PERF_DATA (replace compset by set).
#          Prettify $PERF_DATA output.
# 20160201 S. Sams Changes to PERF_DATA output format to increase compatibility with Nagios Plugin guidelines.
#          Add compatibility with Puppet 4.x
# 20160315 J. Yaworski Add -v, allowing to pass a version to compare
# 20160815 L. Buriola Add -E to show first error on output
# 20170426 benwtr Detect failure to retrieve catalog from server as a warning.
# 20180324 deric Discard puppet config error (logging) output

# FUNCTIONS

# Print the status line that belongs to an internal result code and exit.
#
# Arguments:
#   $1 - internal result code (0-14, see the case arms below)
#   $2 - detail text; only used by code 9
# Globals read (only those needed by the selected message):
#   version config last_run_human time_since_last WARN CRIT PERF_DATA
#   FIRST_ERROR agent_disabled_lockfile runinterval splay splaylimit
#   wanted_version
# Globals written:
#   rc - the exit status that is about to be used
# Exit status:
#   0 for the OK/INFO messages, 1 for WARNING/DISABLED, 2 for CRITICAL,
#   3 for UNKNOWN. A code that is not listed is reported as UNKNOWN instead of
#   silently exiting with the status of the previous command.
result () {
  case $1 in
    0) echo "OK: Puppet agent $version running catalog version $config, and executed at $last_run_human for last time. $PERF_DATA"; rc=0 ;;
    1) echo "UNKNOWN: last_run_summary.yaml not found, not readable or incomplete"; rc=3 ;;
    2) echo "WARNING: Last run was $time_since_last seconds ago. Warn is $WARN. $PERF_DATA"; rc=1 ;;
    3) echo "CRITICAL: Last run was $time_since_last seconds ago. Crit is $CRIT. $PERF_DATA"; rc=2 ;;
    4) echo "CRITICAL: Puppet daemon not running or something wrong with process"; rc=2 ;;
    5) echo "UNKNOWN: no WARN or CRIT parameters were sent to this check"; rc=3 ;;
    6) echo "CRITICAL: Last run had 1 or more errors. Check the logs. $FIRST_ERROR $PERF_DATA"; rc=2 ;;
    # The lock file holds {"disabled_message":"<reason>"}; sed strips the
    # wrapper. The path is quoted so that a path with spaces works and an
    # empty value makes sed fail instead of waiting on stdin.
    7) echo "DISABLED: Reason: $(sed -e 's/{"disabled_message":"//' -e 's/"}//' "$agent_disabled_lockfile"). $PERF_DATA"; rc=1 ;;
    8) echo "UNKNOWN: No Puppet executable found"; rc=3 ;;
    9) echo "UNKNOWN: Internal error: $2"; rc=3 ;;
    10) echo "OK (PROBABLY): Puppet agent last successful run $last_run_human (runinterval $runinterval, splay $splay, splaylimit $splaylimit) but system has not been up long enough to guarantee a fresh puppet run should have occurred"; rc=0 ;;
    11) echo "INFO: Puppet agent is version $version, but should be $wanted_version. $PERF_DATA"; rc=0 ;;
    12) echo "UNKNOWN: last_run_report.yaml not found, not readable or incomplete"; rc=3 ;;
    13) echo "WARNING: Failed to retrieve catalog on last run."; rc=1 ;;
    14) echo "UNKNOWN: No sudo executable found"; rc=3 ;;
    *) echo "UNKNOWN: Internal error: unexpected result code '$1'"; rc=3 ;;
  esac
  exit $rc
}

# Print the command line help on stdout and exit with status 1.
usage () {
  echo ""
  echo "USAGE: "
  echo " $0 [-c 7200] [-w 3600] [-d 0] [-l agent_disabled_lockfile] [-s lastrunfile] [-r lastrunreport] [-v wanted_version] [-PEh]"
  echo " -c Critical threshold (default 7200 seconds)"
  echo " -w Warning threshold (default 3600 seconds)"
  echo " -d 0|1: puppet agent should be a daemon(1) or not (0).(default 1)"
  echo " -h Show this help."
  echo " -l Agent_disabled_lockfile (default: /var/lib/puppet/state/agent_disabled.lock)"
  echo " -s Lastrunfile (default: /var/lib/puppet/state/last_run_summary.yaml)"
  echo " -r Lastrunreport (default: /var/lib/puppet/state/last_run_report.yaml)"
  echo " -P Enable perf_data in the output"
  echo " -E Show first error in the output"
  echo " -v The version of puppet that should be running"
  echo ""
  exit 1
}

# Get a flat representation of yaml without relying on external tools.
parse_yaml () { local prefix=$2 local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') sed -ne "s|^\($s\):|\1|" \ -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | awk -F$fs '{ indent = length($1)/2; vname[indent] = $2; for (i in vname) {if (i > indent) {delete vname[i]}} if (length($3) > 0) { vn=""; for (i=0; i/dev/null 2>&1 || result 14 # Find out Puppet major version to determine configprint syntax. puppet_major_version=$($PUPPET -V|cut -d. -f1) [ -z "$puppet_major_version" ] && result 9 "Puppet version unknown from $($PUPPET -V 2>&1)" # Set Puppet configprint syntax. case $puppet_major_version in 2) puppet_config_print="sudo $PUPPET config print all" ;; 3) puppet_config_print="sudo $PUPPET config print" ;; *) puppet_config_print="sudo $PUPPET config print --section agent" ;; esac puppet_config_output="$($puppet_config_print 2> /dev/null)" # construct WARN and CRIT times based on runinterval plus a safety buffer # if they have not already been explicitly set runinterval=$(parse_puppet_config "runinterval") splaylimit=0 splay=$(parse_puppet_config "splay") [ "$splay" != "false" ] && splaylimit=$(parse_puppet_config "splaylimit") [ -z "$WARN" ] && WARN=$(($runinterval + $splaylimit)) [ -z "$CRIT" ] && CRIT=$(($WARN + $runinterval)) #now check we finally have some sensible settings [ -z "$WARN" -o $WARN -lt 30 ] && result 5 [ -z "$CRIT" -o $CRIT -lt 60 ] && result 5 # If the disabled lockfile is not given as a param try to find it ourselves. [ -z "$agent_disabled_lockfile" ] && agent_disabled_lockfile=$(parse_puppet_config "agent_disabled_lockfile") # If there's a disabled.lock file don't look any further. [ -f "$agent_disabled_lockfile" ] && result 7 # If the lastrunfile is not given as a param try to find it ourselves. [ -z "$lastrunfile" ] && lastrunfile=$(parse_puppet_config "lastrunfile") # Check if state file exists. 
# The run summary must be a non-empty, readable file. Paths are quoted so the
# tests stay well-formed when a path is empty or contains whitespace.
[ -s "$lastrunfile" ] || result 1
[ -r "$lastrunfile" ] || result 1

# If the lastrunreport is not given as a param try to find it ourselves.
[ -z "$lastrunreport" ] && lastrunreport=$(parse_puppet_config "lastrunreport")

# Check if the lastrunreport is readable.
[ -r "$lastrunreport" ] || result 12

# When the first error has to be shown the report must also be non-empty.
# This runs in the current shell: inside a ( ... ) subshell the exit done by
# result would only leave the subshell and the script would carry on.
if [ -n "$SHOW_ERROR" ]; then
  [ -s "$lastrunreport" ] || result 12
fi

# Check if daemonized was set, else set default to 1.
[ -n "$daemonized" ] || daemonized=1

# If Puppet agent runs as a daemon there should be a process. We can't check
# so much when it is triggered by cron.
if [ "$daemonized" -eq 1 ]; then
  # Puppet version 4 changed several paths, determine correct ones.
  if [ "$puppet_major_version" -ge 4 ]; then
    puppet_daemon_rundir="puppetlabs"
    puppet_daemon_regex="/opt/puppetlabs/puppet/bin/ruby /opt/puppetlabs/puppet/bin/puppet"
  else
    puppet_daemon_rundir="puppet"
    puppet_daemon_regex="/usr(/local)?/bin/ruby[^ ]* /usr(/local)?/s?bin/puppetd?"
  fi

  # Check puppet daemon. The matcher's own command line contains the regex, so
  # it is filtered out again; it shows up in the process list as "grep -E".
  [ -n "$(ps axfww | grep -E "$puppet_daemon_regex" | grep -v 'grep -E')" ] || result 4

  # The default pidfile location differs on BSD.
  os_name=$(uname -a)
  case $os_name in
    *BSD*) default_pidfile=/var/$puppet_daemon_rundir/run/agent.pid ;;
    *)     default_pidfile=/var/run/$puppet_daemon_rundir/agent.pid ;;
  esac
  if [ -e "$default_pidfile" ]; then
    pidfile=$default_pidfile
  else
    pidfile=$(parse_puppet_config "pidfile")
  fi

  # If there is a pidfile tell me the pid, else fail. With the path quoted an
  # empty pidfile setting fails here instead of letting cat wait on stdin.
  [ -f "$pidfile" ] || result 4
  pid=$(cat "$pidfile") || result 4
  [ -n "$pid" ] || result 4

  # See if the process is running.
  ps -p "$pid" > /dev/null || result 4

  # On Linux test if the pid we found in the pidfile is puppet.
  case $os_name in
    *Linux*) grep -q puppet "/proc/$pid/cmdline" || result 4 ;;
  esac
fi

# Parse last run file.
# Puppet version 4 files have less indentation, add prefix to match parsed
# variables from older versions.
[ "$puppet_major_version" -ge 4 ] && yaml_prefix="_"

# This flattens the hierarchy to single-level name/value variables, eg:
#   _events_total="14"
#   _version_config="1448907293"
# The substitution is quoted so every assignment is evaluated on its own line
# and values are not subject to word splitting or globbing before eval.
eval "$(parse_yaml "$lastrunfile" "$yaml_prefix")"

# Construct perf data using anything that starts with _resources_ or _time_total.
if [ -n "$PERF" ]; then
  # Word splitting of the set output is intended: one name=value per word.
  for V in $(set | grep -e '^_resources_' -e '^_time_total'); do
    # Drop the leading underscore and any single quotes the shell put around
    # the value when listing it.
    PERF_DATA="$(printf '%s\n' "$V" | sed -e 's/^_//' -e "s/='/=/" -e "s/'\$//") $PERF_DATA"
  done
  PERF_DATA="| $PERF_DATA"
fi

# Construct FIRST_ERROR using last_run_report.yaml.
if [ -n "$SHOW_ERROR" ]; then
  FIRST_ERROR=$(get_first_error)
fi

# If the last run failed to retrieve the catalog from the server.
grep -q 'Could not retrieve catalog from remote server' "$lastrunreport" && result 13

# Check when last run happened. The timestamp is validated before it is used:
# an empty or non-numeric value would break date and the arithmetic below.
last_run=$_time_last_run
case $last_run in
  ''|*[!0-9]*) result 1 ;;
esac
# GNU date takes "-d @seconds"; BSD date takes "-r seconds".
last_run_human=$(date -d "@$last_run" +%c 2>/dev/null) || last_run_human=$(date -r "$last_run" +%c)
now=$(date +%s)
time_since_last=$((now - last_run))

# Check how long system been up in seconds.
uptime=
[ -r /proc/uptime ] && uptime=$(cut -f1 -d' ' /proc/uptime | cut -f1 -d.)
# Without a usable uptime treat the system as having been up for the whole
# period, so a stale run is still reported instead of every test failing.
case $uptime in
  ''|*[!0-9]*) uptime=$time_since_last ;;
esac

# Assess last run time relative to warn/crit values and system uptime.
# result exits, so the second call in each branch is only reached when the
# system has been up for less than the threshold.
if [ "$time_since_last" -ge "$CRIT" ]; then
  [ "$uptime" -ge "$CRIT" ] && result 3
  result 10
fi
if [ "$time_since_last" -ge "$WARN" ]; then
  [ "$uptime" -ge "$WARN" ] && result 2
  result 10
fi

# Get some more info from the yaml file.
config=$_version_config
version=$_version_puppet
failed=$_resources_failed
failure=$_events_failure
failed_to_restart=$_resources_failed_to_restart

# If any of the values above doesn't return raise an error.
for value in "$config" "$version" "$failed" "$failure" "$failed_to_restart"; do
  [ -n "$value" ] || result 1
done

# If anything went wrong last run => crit.
if [ "$failed" -gt 0 ] || [ "$failure" -gt 0 ] || [ "$failed_to_restart" -gt 0 ]; then
  result 6
fi

# If $wanted_version is set, compare it to the running version
# ($version is known to be non-empty at this point).
if [ -n "$wanted_version" ] && [ "$wanted_version" != "$version" ]; then
  result 11
fi

# If we reached here all is ok.
result 0

# END