Sometimes you have cronjobs that just hang for no reason and block the next cron run. Well, here is an easy way to keep an eye out for such a process.
Tell the script what to look for and what the time limits should be (in seconds) and away you go.
Version 1.0 Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)
check_longrunning_proc is a Nagios plugin to check a specific process via ps and report how long it has been running. You must provide a complete string for the process to be checked as well as the warning and critical time constraints (in seconds).
check_longrunning_proc -p 'sh /usr/local/somecommand' -w 120 -c 240
Options:
-p/--process)
You need to provide a string containing the exact process
-f/--file)
Name of the PID file containing the PID of the process to check (used instead of -p/--process)
-w/--warning)
Warning threshold in seconds: a WARNING is reported once the process has been running at least this long.
-c/--critical)
Critical threshold in seconds: a CRITICAL is reported once the process has been running at least this long.
-m/--missing)
State to report if the process is missing (not running): C (critical), W (warning), U (unknown) or O (OK). Default is O (OK).
1. accept PID file option to read PID number directly instead of having to rely on pgrep.
#!/bin/sh
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# Changes
#
# 1.1:
# Added the -f/--file option for PID file support
#
# 1.0:
# Initial version
#
# Purpose:
#   Nagios plugin that reports how long a process has been running and
#   compares that elapsed time (in seconds) against warning/critical
#   thresholds. The process is selected either by a command-line pattern
#   (-p, matched with "pgrep -f") or by a PID file (-f). When several
#   processes match the pattern, the longest-running one is reported.
#
# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN (also used for
# usage errors, --help and --version).

PROGNAME=$(basename "$0")
VERSION="Version 1.1"
AUTHOR="Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)"

# Nagios plugin exit codes.
ST_OK=0
ST_WR=1
ST_CR=2
ST_UK=3

# Option values; filled in by the argument parser below.
process=""
pidfileval=""
warning=""
critical=""
# State to report when the process is not running: C, W, U or O.
missing="O"

# /proc/<pid>/stat reports the start time in clock ticks. Ask the system
# for the tick rate and fall back to the traditional 100 if unavailable.
CLK_TCK=$(getconf CLK_TCK 2>/dev/null)
case "$CLK_TCK" in
  ''|0|*[!0-9]*) CLK_TCK=100 ;;
esac

# Print the version and copyright line.
print_version() {
  echo "$VERSION $AUTHOR"
}

# Print the usage text and exit with the UNKNOWN state.
print_help() {
  print_version
  echo ""
  echo "$PROGNAME is a Nagios plugin to check a specific process via ps"
  echo "and report how long it has been running."
  echo "You must provide a complete string for the process to be checked"
  echo "as well as the warning and critical time constraints (in seconds)."
  echo ""
  echo "$PROGNAME -p 'sh /usr/local/somecommand' -w 120 -c 240"
  echo ""
  echo "Options:"
  echo " -p/--process)"
  echo " You need to provide a string containing the exact process"
  echo " -f/--file)"
  echo " Name PID file containing the PID of the process to check"
  echo " -w/--warning)"
  echo " Defines a warning level for a target which is explained"
  echo " below."
  echo " -c/--critical)"
  echo " Defines a critical level for a target which is explained"
  echo " below."
  echo " -m/--missing)"
  echo " Check state to be reported if the process is missing (not"
  echo " running). C,W,U,O - Default is OK"
  exit $ST_UK
}

# Abort with usage if the option in $1 is not followed by a value.
# Call as: need_value "$@"
need_value() {
  if [ $# -lt 2 ]; then
    echo "Option $1 requires a value!"
    print_help
  fi
}

# Succeed if $1 is a non-empty string of decimal digits.
is_uint() {
  case "$1" in
    ''|*[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Emit the "process is not running" result selected by -m and exit.
report_missing() {
  case "$missing" in
    C)
      echo "CRITICAL - Process is not running!"
      exit $ST_CR
      ;;
    W)
      echo "WARNING - Process is not running."
      exit $ST_WR
      ;;
    U)
      echo "UNKNOWN - Process is not running."
      exit $ST_UK
      ;;
    *)
      echo "OK - Process is not running."
      exit $ST_OK
      ;;
  esac
}

# Print the number of whole seconds the process with PID $1 has been
# running. Returns 1 (printing nothing) if the process does not exist or
# its stat file cannot be parsed.
proc_elapsed() {
  pe_stat="/proc/$1/stat"
  [ -r "$pe_stat" ] || return 1
  # Field 2 (the command name in parentheses) may itself contain spaces
  # or parentheses, so drop everything up to the last ") " first. In the
  # remainder the state is field 1 and the start time (field 22 of the
  # full line) is field 20.
  pe_start=$(sed 's/^.*) //' "$pe_stat" 2>/dev/null | awk '{print $20}')
  is_uint "$pe_start" || return 1
  pe_uptime=$(awk -F. '{print $1}' /proc/uptime)
  is_uint "$pe_uptime" || return 1
  echo $(( pe_uptime - pe_start / CLK_TCK ))
}

# Print the command name (without the surrounding parentheses) of PID $1.
proc_comm() {
  sed 's/^[0-9]* (\(.*\)) .*$/\1/' "/proc/$1/stat" 2>/dev/null
}

# Ensure a process pattern or a PID file was supplied.
get_proc() {
  if [ -z "$process" ] && [ -z "$pidfileval" ]; then
    echo "Please provide a process or PID file to check!"
    print_help
  fi
}

# Ensure both thresholds are present, numeric and correctly ordered.
check_thresholds() {
  if [ -z "$warning" ] || [ -z "$critical" ]; then
    echo "Please also set a warning AND critical threshold!"
    print_help
  fi
  if ! is_uint "$warning" || ! is_uint "$critical"; then
    echo "The warning and critical thresholds must be whole numbers of seconds!"
    print_help
  fi
  if [ "$warning" -gt "$critical" ]; then
    echo "Please adjust your warning/critical thresholds. The warning must be lower than the critical level!"
    print_help
  fi
}

# Determine the PID (ps_pid) and elapsed seconds (running) to report.
# Exits through report_missing when no matching process is running.
get_vals() {
  tpid=0
  tseconds=0
  found=0
  if [ -z "$pidfileval" ]; then
    # Collect the PIDs first so any helper subshell used for the command
    # substitution is gone before /proc is inspected.
    pids=$(pgrep -f -- "$process")
    for ps_pid in $pids; do
      # "pgrep -f" matches full command lines, and ours contains the
      # pattern as the -p argument: never report this plugin itself.
      [ "$ps_pid" = "$$" ] && continue
      # Processes that vanished since pgrep ran are skipped.
      running=$(proc_elapsed "$ps_pid") || continue
      if [ "$found" -eq 0 ] || [ "$running" -gt "$tseconds" ]; then
        tseconds=$running
        tpid=$ps_pid
      fi
      found=1
    done
    [ "$found" -eq 1 ] || report_missing
  else
    # A stale PID file (process gone) counts as "not running".
    tseconds=$(proc_elapsed "$pidfileval") || report_missing
    tpid=$pidfileval
    process=$(proc_comm "$pidfileval")
  fi
  ps_pid=$tpid
  running=$tseconds
}

# ---------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------
while [ $# -gt 0 ]; do
  case "$1" in
    --help|-help|-h)
      print_help
      ;;
    --version|-v)
      print_version
      exit $ST_UK
      ;;
    --process|-p)
      need_value "$@"
      process=$2
      shift
      ;;
    --file|-f)
      need_value "$@"
      if [ ! -f "$2" ]; then
        echo "The PID file provided can not be found!"
        exit $ST_UK
      fi
      # Take the first whitespace-delimited word of the first line;
      # "|| true" tolerates a file without a trailing newline.
      read -r pidfileval _rest < "$2" || true
      if ! is_uint "$pidfileval"; then
        echo "The PID file provided does not contain a valid PID!"
        exit $ST_UK
      fi
      shift
      ;;
    --warning|-w)
      need_value "$@"
      warning=$2
      shift
      ;;
    --critical|-c)
      need_value "$@"
      critical=$2
      shift
      ;;
    --missing|-m)
      need_value "$@"
      case "$2" in
        C|W|U|O)
          missing=$2
          ;;
        *)
          echo "Invalid value for $1: $2 (expected C, W, U or O)"
          print_help
          ;;
      esac
      shift
      ;;
    *)
      echo "Unknown argument: $1"
      print_help
      ;;
  esac
  shift
done

# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
get_proc
check_thresholds
get_vals

output="Process: ${process}, PID: ${ps_pid}, Elapsed Time: ${running}"
perfdata="'PROCESS'=${process} 'PID'=${ps_pid} 'ETIME'=${running}"

# check_thresholds guarantees warning <= critical, so testing the
# critical threshold first yields the same states as a range check.
if [ "$running" -ge "$critical" ]; then
  echo "CRITICAL - ${output} | ${perfdata}"
  exit $ST_CR
elif [ "$running" -ge "$warning" ]; then
  echo "WARNING - ${output} | ${perfdata}"
  exit $ST_WR
else
  echo "OK - ${output} | ${perfdata}"
  exit $ST_OK
fi