Sometimes you have cronjobs that just hang for no reason and block the next cron run. Well, here is an easy way to keep an eye out for such a process.
Tell the script what to look for and what the time limits should be (in seconds) and away you go.
Version 1.0 Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)
check_longrunning_proc is a Nagios plugin to check a specific process via ps and report how long it has been running. You must provide a complete string for the process to be checked as well as the warning and critical time constraints (in seconds).
check_longrunning_proc -p 'sh /usr/local/somecommand' -w 120 -c 240
Options:
-p/--process)
A string matching the complete command line of the process to check.
-f/--file)
Name of a PID file containing the PID of the process to check.
-w/--warning)
Defines the warning level: the running time (in seconds) at or above which WARNING is reported.
-c/--critical)
Defines the critical level: the running time (in seconds) at or above which CRITICAL is reported.
-m/--missing)
Check state to be reported if the process is missing (not running): C, W, U or O. The default is O (OK).
1. Accept a PID file option to read the PID directly instead of having to rely on pgrep.
#!/bin/sh
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# Changes
#
# 1.1:
# Added the -f/--file option for PID file support
#
# 1.0:
# Initial version
#
# Plugin identity, shown in the usage and version output.
# ${0##*/} strips the directory part of $0 without forking basename and
# without word-splitting a script path that contains spaces.
PROGNAME=${0##*/}
VERSION="Version 1.1"
AUTHOR="Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)"

# Exit codes used for the OK, WARNING, CRITICAL and UNKNOWN results.
ST_OK=0
ST_WR=1
ST_CR=2
ST_UK=3

# State reported when the process is not running (C, W, U or O); default OK.
missing="O"
# Print the version string followed by the author/copyright line on stdout.
# Any arguments passed by callers are ignored.
print_version() {
  printf '%s %s\n' "$VERSION" "$AUTHOR"
}
# Print the version line and the usage text on stdout, then terminate the
# script with the UNKNOWN status ($ST_UK). This function never returns.
print_help() {
  print_version $PROGNAME $VERSION
  # Unquoted delimiter on purpose: $PROGNAME must be expanded in the text.
  cat <<EOF

$PROGNAME is a Nagios plugin to check a specific process via ps
and report how long it has been running.
You must provide a complete string for the process to be checked
as well as the warning and critical time constraints (in seconds).

$PROGNAME -p 'sh /usr/local/somecommand' -w 120 -c 240

Options:
 -p/--process)
 You need to provide a string containing the exact process
 -f/--file)
 Name PID file containing the PID of the process to check
 -w/--warning)
 Defines a warning level for a target which is explained
 below.
 -c/--critical)
 Defines a critical level for a target which is explained
 below.
 -m/--missing)
 Check state to be reported if the process is missing (not
 running). C,W,U,O - Default is OK
EOF
  exit $ST_UK
}
# Abort with UNKNOWN when the option in $1 is not followed by a value.
# Called as: _require_value "$@" from inside the parsing loop.
_require_value() {
  if [ $# -lt 2 ]; then
    echo "Option $1 requires an argument"
    exit $ST_UK
  fi
}

# Parse the command line into the globals the rest of the script reads:
#   process     - pattern given with -p/--process
#   pidfileval  - content of the PID file given with -f/--file
#   warning     - value given with -w/--warning
#   critical    - value given with -c/--critical
#   missing     - value given with -m/--missing
# Help, version, an unknown argument, a missing option value or a PID file
# that does not exist all terminate the script with $ST_UK.
parse_args() {
  # As before, parsing stops at the first empty argument.
  while test -n "$1"; do
    case "$1" in
      # "-help" is kept for backward compatibility; "--help" was missing.
      --help|-help|-h)
        print_help
        exit $ST_UK
        ;;
      --version|-v)
        print_version $PROGNAME $VERSION
        exit $ST_UK
        ;;
      --process|-p)
        _require_value "$@"
        process=$2
        shift
        ;;
      --file|-f)
        _require_value "$@"
        if [ -f "$2" ]; then
          pidfileval=$(cat "$2")
        else
          # Only a missing PID file is fatal; previously the script exited
          # here even after reading the file successfully.
          echo "The PID file provided can not be found!"
          exit $ST_UK
        fi
        shift
        ;;
      --warning|-w)
        _require_value "$@"
        warning=$2
        shift
        ;;
      --critical|-c)
        _require_value "$@"
        critical=$2
        shift
        ;;
      --missing|-m)
        _require_value "$@"
        missing=$2
        shift
        ;;
      *)
        echo "Unknown argument: $1"
        print_help
        exit $ST_UK
        ;;
    esac
    shift
  done
}

parse_args "$@"
# Make sure there is something to check: either a process pattern
# ($process) or a PID read from a PID file ($pidfileval). When both are
# empty, print an error plus the usage text and exit with $ST_UK.
get_proc() {
  if [ -n "$process" ] || [ -n "$pidfileval" ]; then
    return 0
  fi
  echo "Please provide a process or PID file to check!"
  print_help
  exit $ST_UK
}
# Classify the warning/critical thresholds ($warning, $critical).
# Results are left in globals:
#   wcdiff=2  - a threshold is missing or is not a non-negative integer
#   wcdiff=1  - both are integers but warning is greater than critical
#   wcdiff="" - thresholds are usable
#   wclvls=1  - set when both thresholds are present and numeric
# Non-numeric values used to make the integer comparison fail with a shell
# error, which silently skipped validation; they are now flagged as wcdiff=2.
get_wcdiff() {
  wcdiff=
  for _wc_value in "$warning" "$critical"; do
    case "$_wc_value" in
      # Empty, or contains any character that is not a digit.
      ''|*[!0-9]*)
        wcdiff=2
        return 0
        ;;
    esac
  done
  wclvls=1
  if [ "$warning" -gt "$critical" ]; then
    wcdiff=1
  fi
}
# Act on the threshold classification stored in $wcdiff:
#   1 - warning is above critical
#   2 - a threshold is missing
# For either value, print the matching message and the usage text, then
# exit with $ST_UK. Any other value means there is nothing to report.
val_wcdiff() {
  case "$wcdiff" in
    1)
      echo "Please adjust your warning/critical thresholds. The warning must be lower than the critical level!"
      ;;
    2)
      echo "Please also set a warning AND critical threshold!"
      ;;
    *)
      return 0
      ;;
  esac
  print_help
  exit $ST_UK
}
# Report the "process is not running" state selected with -m/--missing and
# exit with the matching status. Any value other than C, W or U is treated
# as the documented default (OK).
_report_missing() {
  case "$missing" in
    C)
      echo "CRITICAL - Process is not running!"
      exit $ST_CR
      ;;
    W)
      echo "WARNING - Process is not running."
      exit $ST_WR
      ;;
    U)
      echo "UNKNOWN - Process is not running."
      exit $ST_UK
      ;;
    *)
      echo "OK - Process is not running."
      exit $ST_OK
      ;;
  esac
}

# Print the number of whole seconds since process $1 was started.
# Returns non-zero (and prints nothing) when the process does not exist or
# its /proc entry cannot be read.
_proc_elapsed() {
  [ -r "/proc/$1/stat" ] || return 1
  # Integer part of the system uptime in seconds.
  _pe_uptime=$(awk -F. '{print $1}' /proc/uptime) || return 1
  # Field 22 of /proc/PID/stat is the start time in clock ticks since boot.
  # Everything up to the last ") " is dropped first so that a command name
  # containing spaces cannot shift the field positions (22 becomes 20).
  # The division by 100 is kept from the original and assumes 100 clock
  # ticks per second - TODO confirm with `getconf CLK_TCK` on exotic hosts.
  _pe_start=$(awk '{ sub(/^.*\) /, ""); printf "%d\n", $20 / 100 }' \
    "/proc/$1/stat" 2>/dev/null) || return 1
  if [ -z "$_pe_uptime" ] || [ -z "$_pe_start" ]; then
    return 1
  fi
  echo $((_pe_uptime - _pe_start))
}

# Find the process to report on and how long it has been running.
# Inputs (globals): process, pidfileval, missing.
# Outputs (globals): ps_pid  - PID of the longest-running matching process
#                    running - its elapsed time in seconds
#                    process - replaced by the command name in PID-file mode
# When no process is found, the state selected by $missing is reported and
# the script exits (see _report_missing).
get_vals() {
  tpid=0
  tseconds=0
  _gv_found=0
  if [ -z "$pidfileval" ]; then
    # Unquoted on purpose: pgrep prints one PID per line.
    for ps_pid in $(pgrep -f -- "${process}"); do
      # The plugin's own command line contains the pattern (it is the value
      # of -p), and so does a wrapper shell that launched it; skip both.
      if [ "$ps_pid" = "$$" ] || [ "$ps_pid" = "$PPID" ]; then
        continue
      fi
      # A process may exit between pgrep and the /proc read; ignore it.
      running=$(_proc_elapsed "$ps_pid") || continue
      if [ "$_gv_found" -eq 0 ] || [ "$running" -gt "$tseconds" ]; then
        tseconds=$running
        tpid=$ps_pid
        _gv_found=1
      fi
    done
  else
    ps_pid=$pidfileval
    if running=$(_proc_elapsed "$ps_pid"); then
      # Command name: the text between the first "(" and the last ")".
      process=$(awk '{ s = $0; sub(/^[^(]*\(/, "", s); sub(/\)[^)]*$/, "", s); print s }' \
        "/proc/$ps_pid/stat")
      tseconds=$running
      tpid=$ps_pid
      _gv_found=1
    fi
  fi
  if [ "$_gv_found" -eq 0 ]; then
    _report_missing
  fi
  ps_pid=$tpid
  running=$tseconds
}
# Main flow: validate the inputs, locate the process, then print a single
# status line (text | perfdata) and exit with the matching status code.
get_proc
get_wcdiff
val_wcdiff
get_vals

output="Process: ${process}, PID: ${ps_pid}, Elapsed Time: ${running}"
# NOTE(review): 'PROCESS' carries a non-numeric value, which perfdata
# consumers may reject - confirm against whatever graphs this output.
perfdata="'PROCESS'=${process} 'PID'=${ps_pid} 'ETIME'=${running}"

# Critical is tested first, so the warning branch only covers
# warning <= running < critical.
if [ "$running" -ge "$critical" ]; then
  echo "CRITICAL - ${output} | ${perfdata}"
  exit $ST_CR
elif [ "$running" -ge "$warning" ]; then
  echo "WARNING - ${output} | ${perfdata}"
  exit $ST_WR
else
  # The stray " ]" that used to trail the perfdata has been removed.
  echo "OK - ${output} | ${perfdata}"
  exit $ST_OK
fi