Table of Contents

check_longrunning_proc plugin

Sometimes you have cronjobs that just hang for no reason and block the next cron run. Well, here is an easy way to keep an eye out for such a process.

Tell the script what to look for and what the time limits should be (in seconds) and away you go.

Howto

Version 1.1 Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)

check_longrunning_proc is a Nagios plugin to check a specific process via ps and report how long it has been running. You must provide a complete string for the process to be checked as well as the warning and critical time constraints (in seconds).

check_longrunning_proc -p 'sh /usr/local/somecommand' -w 120 -c 240

Options:

-p/--process)

   String matching the full command line of the process to check
   (it is passed to pgrep -f)

-f/--file)

   Name of the PID file containing the PID of the process to check

-w/--warning)

   Warning threshold in seconds: WARNING is reported once the
   process has been running at least this long.

-c/--critical)

   Critical threshold in seconds: CRITICAL is reported once the
   process has been running at least this long.

-m/--missing)

   Check state to be reported if the process is missing (not
   running): C (critical), W (warning), U (unknown) or O (OK).
   Default is OK

ToDo

1. accept PID file option to read PID number directly instead of having to rely on pgrep. (Implemented in version 1.1 as the -f/--file option.)

Script

#!/bin/sh
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# Changes
#
# 1.1:
#   Added the -f/--file option for PID file support
#
# 1.0:
#   Initial version
#
# Script identity, shown by print_version/print_help.
# Parameter expansion strips the directory part of $0 without forking
# basename, and stays correct when the script path contains whitespace.
PROGNAME=${0##*/}
VERSION="Version 1.1"
AUTHOR="Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)"

# Nagios plugin exit codes: OK, WARNING, CRITICAL, UNKNOWN.
ST_OK=0
ST_WR=1
ST_CR=2
ST_UK=3

# State reported when the process is not running (C, W, U or O);
# overridden by the -m/--missing option.
missing="O"

# Print the plugin version followed by the author/copyright line.
# Any arguments passed by callers are ignored.
print_version() {
    printf '%s %s\n' "$VERSION" "$AUTHOR"
}

# Print the version banner and the usage text, then terminate the plugin
# with the UNKNOWN status ($ST_UK). This function never returns.
print_help() {
    print_version "$PROGNAME" "$VERSION"
    # One printf argument per output line; the format is reused for each.
    printf '%s\n' \
        "" \
        "$PROGNAME is a Nagios plugin to check a specific process via ps" \
        "and report how long it has been running." \
        "You must provide a complete string for the process to be checked" \
        "as well as the warning and critical time constraints (in seconds)." \
        "" \
        "$PROGNAME -p 'sh /usr/local/somecommand' -w 120 -c 240" \
        "" \
        "Options:" \
        "  -p/--process)" \
        "     You need to provide a string containing the exact process" \
        "  -f/--file)" \
        "     Name PID file containing the PID of the process to check" \
        "  -w/--warning)" \
        "     Defines a warning level for a target which is explained" \
        "     below." \
        "  -c/--critical)" \
        "     Defines a critical level for a target which is explained" \
        "     below." \
        "  -m/--missing)" \
        "     Check state to be reported if the process is missing (not" \
        "     running). C,W,U,O - Default is OK"
    exit $ST_UK
}

# Parse the command line.
# Options that take a value consume two positional parameters; anything
# unrecognised prints the usage text and ends with the UNKNOWN status.
# Looping on $# (rather than on "$1" being non-empty) keeps an empty
# argument from silently ending the parse.
while [ $# -gt 0 ]; do
    case "$1" in
        --help|-help|-h)
            # -help is still accepted for backward compatibility.
            print_help
            exit $ST_UK
            ;;
        --version|-v)
            print_version "$PROGNAME" "$VERSION"
            exit $ST_UK
            ;;
        --process|-p|--file|-f|--warning|-w|--critical|-c|--missing|-m)
            # Every option in this arm needs a value after it.
            if [ $# -lt 2 ]; then
                echo "Option $1 requires a value!"
                print_help
                exit $ST_UK
            fi
            case "$1" in
                --process|-p)
                    process=$2
                    ;;
                --file|-f)
                    # A missing or unreadable PID file is a usage error.
                    if [ ! -f "$2" ] || [ ! -r "$2" ]; then
                        echo "The PID file provided can not be found!"
                        exit $ST_UK
                    fi
                    # read keeps the first line only and trims surrounding
                    # whitespace, which is the usual PID file layout.
                    read -r pidfileval < "$2"
                    ;;
                --warning|-w)
                    warning=$2
                    ;;
                --critical|-c)
                    critical=$2
                    ;;
                --missing|-m)
                    missing=$2
                    ;;
            esac
            # Drop the option name here; the value is dropped below.
            shift
            ;;
        *)
            echo "Unknown argument: $1"
            print_help
            exit $ST_UK
            ;;
    esac
    shift
done

# Make sure the caller named something to check.
# Reads the globals $process and $pidfileval; when both are empty it prints
# an error plus the usage text and ends with the UNKNOWN status.
get_proc() {
    if [ -z "$process" ] && [ -z "$pidfileval" ]; then
        echo "Please provide a process or PID file to check!"
        print_help
        exit $ST_UK
    fi
}

# Classify the warning/critical thresholds held in $warning and $critical.
# Results are left in globals:
#   wclvls=1  both thresholds were supplied
#   wcdiff=1  warning is greater than critical
#   wcdiff=2  a threshold is missing or is not a non-negative integer
# wcdiff is left untouched when the thresholds are usable.
get_wcdiff() {
    if [ -z "$warning" ] || [ -z "$critical" ]; then
        wcdiff=2
        return 0
    fi
    wclvls=1

    # Both values are non-empty here, so the concatenation contains a
    # non-digit exactly when at least one of them does. Rejecting such
    # values keeps the later integer comparisons from failing, which
    # would otherwise make the plugin report OK.
    case "${warning}${critical}" in
        *[!0-9]*)
            wcdiff=2
            return 0
            ;;
    esac

    if [ "$warning" -gt "$critical" ]; then
        wcdiff=1
    fi
    return 0
}

# Act on the classification stored in $wcdiff:
#   1 - thresholds are in the wrong order
#   2 - a threshold was not supplied
# Either case prints a message plus the usage text and ends with the
# UNKNOWN status; any other value lets the plugin continue.
val_wcdiff() {
    case "$wcdiff" in
        1)
            echo "Please adjust your warning/critical thresholds. The warning must be lower than the critical level!"
            print_help
            exit $ST_UK
            ;;
        2)
            echo "Please also set a warning AND critical threshold!"
            print_help
            exit $ST_UK
            ;;
    esac
}

# Print how many whole seconds the process with PID $1 has been running.
# Returns 1 (printing nothing) when $1 is not a PID or the process has no
# readable /proc entry. Intended to be called via command substitution,
# so its working variables do not leak into the caller.
_proc_elapsed() {
    case "$1" in
        ''|*[!0-9]*) return 1 ;;
    esac
    [ -r "/proc/$1/stat" ] || return 1

    # Clock ticks per second; fall back to the common value of 100.
    _pe_hz=$(getconf CLK_TCK 2>/dev/null)
    case "$_pe_hz" in
        ''|*[!0-9]*|0) _pe_hz=100 ;;
    esac

    # The command name (field 2) may contain spaces and parentheses, so
    # drop everything up to the last ") " first; the start time (field 22
    # of the full line) is then field 20.
    _pe_start=$(awk -v hz="$_pe_hz" \
        '{ sub(/^.*\) /, ""); printf "%d\n", $20 / hz }' \
        "/proc/$1/stat" 2>/dev/null)
    _pe_up=$(awk -F. '{ print $1 }' /proc/uptime 2>/dev/null)

    # Either value empty or non-numeric means the data could not be read.
    case "${_pe_start}:${_pe_up}" in
        :*|*:|*[!0-9:]*) return 1 ;;
    esac
    echo $(($_pe_up - $_pe_start))
}

# Print the command name of PID $1: the text between the first "(" and the
# last ")" of /proc/<pid>/stat.
_proc_name() {
    awk '{ s = $0; sub(/^[^(]*\(/, "", s); sub(/\)[^)]*$/, "", s); print s }' \
        "/proc/$1/stat" 2>/dev/null
}

# Report that the process is not running and exit with the state selected
# by $missing (C, W, O). U and any unrecognised value report UNKNOWN.
_report_missing() {
    case "$missing" in
        C)
            echo "CRITICAL - Process is not running!"
            exit $ST_CR
            ;;
        W)
            echo "WARNING - Process is not running."
            exit $ST_WR
            ;;
        O)
            echo "OK - Process is not running."
            exit $ST_OK
            ;;
        *)
            echo "UNKNOWN - Process is not running."
            exit $ST_UK
            ;;
    esac
}

# Find the process to report on and how long it has been running.
# Without a PID file, every process whose command line matches $process
# (pgrep -f) is examined and the longest-running one is kept. With a PID
# file, the PID in $pidfileval is used and $process is replaced by that
# process's command name.
# On return, $ps_pid holds the chosen PID and $running its age in seconds.
# When no live process is found the function does not return: it exits
# with the state selected by -m/--missing.
get_vals() {
    tpid=0
    tseconds=0
    found=0

    if [ -z "$pidfileval" ]; then
        # Word splitting is intended: pgrep prints one PID per line.
        for ps_pid in $(pgrep -f -- "${process}"); do
            # The pattern also appears on this plugin's own command line.
            [ "$ps_pid" = "$$" ] && continue
            # Skip processes that ended since pgrep listed them.
            elapsed=$(_proc_elapsed "$ps_pid") || continue
            # Always keep the first hit so a process younger than one
            # second is still reported with its real PID.
            if [ "$found" -eq 0 ] || [ "$elapsed" -gt "$tseconds" ]; then
                tseconds=$elapsed
                tpid=$ps_pid
            fi
            found=1
        done
    else
        # A stale PID file (process gone) leaves found=0.
        if elapsed=$(_proc_elapsed "$pidfileval"); then
            found=1
            tseconds=$elapsed
            tpid=$pidfileval
            process=$(_proc_name "$pidfileval")
        fi
    fi

    [ "$found" -eq 1 ] || _report_missing

    ps_pid=$tpid
    running=$tseconds
}

# Main flow: validate the arguments, then measure the process.
# Each step exits on its own when it finds a problem.
get_proc
get_wcdiff
val_wcdiff
get_vals

output="Process: ${process}, PID: ${ps_pid}, Elapsed Time: ${running}"
# NOTE(review): 'PROCESS' carries a non-numeric value, which Nagios
# performance-data consumers may reject - confirm before changing it.
perfdata="'PROCESS'=${process} 'PID'=${ps_pid} 'ETIME'=${running}"

# Compare the elapsed time with the thresholds, most severe first.
if [ "$running" -ge "$critical" ]; then
    echo "CRITICAL - ${output} | ${perfdata}"
    exit $ST_CR
elif [ "$running" -ge "$warning" ]; then
    echo "WARNING - ${output} | ${perfdata}"
    exit $ST_WR
else
    # The original OK line ended with a stray " ]" after the perfdata.
    echo "OK - ${output} | ${perfdata}"
    exit $ST_OK
fi