js#vista.png msort nsort
js#vista.png msort nsort
Sometimes you have cronjobs that just hang for no reason and block the next cron run. Well, here is an easy way to keep an eye out for such a process.
Tell the script what to look for and what the time limits should be (in seconds) and away you go.
Version 1.0 Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)
check_longrunning_proc is a Nagios plugin to check a specific process via ps and report how long it has been running. You must provide a complete string for the process to be checked as well as the warning and critical time constraints (in seconds).
check_longrunning_proc -p 'sh /usr/local/somecommand' -w 120 -c 240
Options:
-p/--process)
You need to provide a string containing the exact process
-f/--file)
Name of the PID file containing the PID of the process to check
-w/--warning)
Defines a warning level for a target which is explained below.
-c/--critical)
Defines a critical level for a target which is explained below.
-m/--missing)
Check state to be reported if the process is missing (not running): C (critical), W (warning), U (unknown) or O (OK). The default is O (OK).
1. accept PID file option to read PID number directly instead of having to rely on pgrep.
#!/bin/sh
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# check_longrunning_proc - Nagios plugin that reports how long a process
# has been running and compares that against warning/critical thresholds
# given in seconds.
#
# The process is selected either by a command-line pattern (-p, matched
# with `pgrep -f`; the longest-running match is reported) or by a PID file
# (-f). Elapsed time is derived from /proc/uptime and the start time stored
# in /proc/<pid>/stat, so a Linux-style /proc is required.
#
# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN (also used for
# usage errors, --help and --version).
#
# Changes
#
# 1.1:
# Added the -f/--file option for PID file support
#
# 1.0:
# Initial version
#

PROGNAME=$(basename "$0")
VERSION="Version 1.1"
AUTHOR="Copyright 2013, Charles Williams (chuck@itadmins.net) (http://www.itadmins.net/)"

# Nagios plugin exit codes.
ST_OK=0
ST_WR=1
ST_CR=2
ST_UK=3

# Option values; initialised so nothing leaks in from the environment.
missing="O"
process=""
pidfileval=""
warning=""
critical=""

# Kernel clock ticks per second, needed to convert the start time in
# /proc/<pid>/stat to seconds. Falls back to 100 when getconf is
# unavailable or returns something unusable.
CLK_TCK=$(getconf CLK_TCK 2>/dev/null)
case "$CLK_TCK" in
  '' | *[!0-9]* | 0) CLK_TCK=100 ;;
esac

# Print the version and copyright line.
print_version() {
  printf '%s\n' "$VERSION $AUTHOR"
}

# Print the usage text and exit with UNKNOWN. Never returns.
print_help() {
  print_version
  cat <<EOF

$PROGNAME is a Nagios plugin to check a specific process via ps
and report how long it has been running.
You must provide a complete string for the process to be checked
as well as the warning and critical time constraints (in seconds).

$PROGNAME -p 'sh /usr/local/somecommand' -w 120 -c 240

Options:
  -p/--process)
     You need to provide a string containing the exact process
  -f/--file)
     Name PID file containing the PID of the process to check
  -w/--warning)
     Defines a warning level for a target which is explained
     below.
  -c/--critical)
     Defines a critical level for a target which is explained
     below.
  -m/--missing)
     Check state to be reported if the process is missing (not
     running). C,W,U,O - Default is OK
EOF
  exit "$ST_UK"
}

# Succeed when $1 is a non-empty string of decimal digits.
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# Abort with usage when option $1 has no value following it.
# $2 is the number of arguments remaining, including the option itself.
need_value() {
  if [ "$2" -lt 2 ]; then
    printf '%s\n' "Option $1 requires an argument!"
    print_help
  fi
}

# Load the PID stored in file $1 into pidfileval.
# Exits UNKNOWN when the file is missing or does not start with a number.
read_pidfile() {
  if [ ! -f "$1" ]; then
    printf '%s\n' "The PID file provided can not be found!"
    exit "$ST_UK"
  fi
  # Redirect rather than pass the name, so awk cannot misread an odd file
  # name as an option or a var=value assignment.
  pidfileval=$(awk 'NR == 1 { print $1; exit }' <"$1")
  if ! is_uint "$pidfileval"; then
    printf '%s\n' "The PID file provided does not contain a valid PID!"
    exit "$ST_UK"
  fi
}

# Report a missing process with the state chosen via -m/--missing and exit.
report_missing() {
  case "$missing" in
    C)
      printf '%s\n' "CRITICAL - Process is not running!"
      exit "$ST_CR"
      ;;
    W)
      printf '%s\n' "WARNING - Process is not running."
      exit "$ST_WR"
      ;;
    U)
      printf '%s\n' "UNKNOWN - Process is not running."
      exit "$ST_UK"
      ;;
    *)
      printf '%s\n' "OK - Process is not running."
      exit "$ST_OK"
      ;;
  esac
}

# Print the number of seconds PID $1 has been running.
# Returns non-zero (printing nothing) when the process does not exist or
# its stat file cannot be parsed, e.g. because it exited in the meantime.
proc_elapsed() {
  [ -r "/proc/$1/stat" ] || return 1
  # The command name (field 2) is wrapped in parentheses and may contain
  # spaces, so drop everything up to the last ") " first. What was field 22
  # (start time in clock ticks since boot) is then field 20.
  pe_start=$(awk -v hz="$CLK_TCK" \
    '{ sub(/^.*\) /, ""); printf "%d\n", $20 / hz; exit }' \
    "/proc/$1/stat" 2>/dev/null) || return 1
  pe_up=$(awk -F. '{ print $1; exit }' /proc/uptime 2>/dev/null) || return 1
  is_uint "$pe_start" || return 1
  is_uint "$pe_up" || return 1
  printf '%s\n' "$((${pe_up} - ${pe_start}))"
}

# ---------------------------------------------------------------------------
# Command line parsing
# ---------------------------------------------------------------------------
while [ -n "${1-}" ]; do
  case "$1" in
    --help | -help | -h)
      print_help
      ;;
    --version | -v)
      print_version
      exit "$ST_UK"
      ;;
    --process | -p)
      need_value "$1" "$#"
      process=$2
      shift
      ;;
    --file | -f)
      need_value "$1" "$#"
      read_pidfile "$2"
      shift
      ;;
    --warning | -w)
      need_value "$1" "$#"
      warning=$2
      shift
      ;;
    --critical | -c)
      need_value "$1" "$#"
      critical=$2
      shift
      ;;
    --missing | -m)
      need_value "$1" "$#"
      case "$2" in
        C | W | U | O) missing=$2 ;;
        *)
          printf '%s\n' "Invalid value for $1: $2 (expected C, W, U or O)"
          print_help
          ;;
      esac
      shift
      ;;
    *)
      printf '%s\n' "Unknown argument: $1"
      print_help
      ;;
  esac
  shift
done

# Require a process pattern or a PID file.
get_proc() {
  if [ -z "$process" ] && [ -z "$pidfileval" ]; then
    printf '%s\n' "Please provide a process or PID file to check!"
    print_help
  fi
}

# Classify the thresholds into wcdiff:
#   0 = fine, 1 = warning above critical, 2 = one or both missing,
#   3 = not a whole number.
get_wcdiff() {
  wcdiff=0
  if [ -z "$warning" ] || [ -z "$critical" ]; then
    wcdiff=2
  elif ! is_uint "$warning" || ! is_uint "$critical"; then
    wcdiff=3
  elif [ "$warning" -gt "$critical" ]; then
    wcdiff=1
  fi
}

# Turn a non-zero wcdiff into an error message plus usage (exits UNKNOWN).
val_wcdiff() {
  case "$wcdiff" in
    1)
      printf '%s\n' "Please adjust your warning/critical thresholds. The warning must be lower than the critical level!"
      print_help
      ;;
    2)
      printf '%s\n' "Please also set a warning AND critical threshold!"
      print_help
      ;;
    3)
      printf '%s\n' "The warning and critical thresholds must be whole numbers of seconds!"
      print_help
      ;;
  esac
}

# Find the process to report on and set ps_pid / running (and, in PID file
# mode, process). When nothing is running, report_missing ends the script.
get_vals() {
  tpid=0
  tseconds=0
  found=0

  if [ -z "$pidfileval" ]; then
    # Word splitting of the pgrep output is intended: one PID per word.
    for cand in $(pgrep -f -- "$process"); do
      # This script's own command line contains the pattern; never count it.
      [ "$cand" = "$$" ] && continue
      # Skip processes that exited between pgrep and the /proc read.
      cand_secs=$(proc_elapsed "$cand") || continue
      if [ "$found" -eq 0 ] || [ "$cand_secs" -gt "$tseconds" ]; then
        tseconds=$cand_secs
        tpid=$cand
      fi
      found=1
    done
  else
    if cand_secs=$(proc_elapsed "$pidfileval"); then
      found=1
      tseconds=$cand_secs
      tpid=$pidfileval
      # Report the kernel's command name for the PID (text between the
      # first "(" and the last ")" in the stat line).
      process=$(sed 's/^[0-9]* (\(.*\)) .*$/\1/' "/proc/$pidfileval/stat" 2>/dev/null)
    fi
  fi

  [ "$found" -eq 1 ] || report_missing

  ps_pid=$tpid
  running=$tseconds
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
get_proc
get_wcdiff
val_wcdiff
get_vals

output="Process: ${process}, PID: ${ps_pid}, Elapsed Time: ${running}"
# NOTE(review): 'PROCESS' carries a non-numeric value, which Nagios perfdata
# parsers may reject; kept unchanged for compatibility with existing
# consumers.
perfdata="'PROCESS'=${process} 'PID'=${ps_pid} 'ETIME'=${running}"

if [ "$running" -ge "$critical" ]; then
  printf '%s\n' "CRITICAL - ${output} | ${perfdata}"
  exit "$ST_CR"
elif [ "$running" -ge "$warning" ]; then
  printf '%s\n' "WARNING - ${output} | ${perfdata}"
  exit "$ST_WR"
else
  printf '%s\n' "OK - ${output} | ${perfdata}"
  exit "$ST_OK"
fi