Difference between revisions of "How to Set Up NAL (Nagios Alarm Handler) to monitor an EPICS network"

From EPICSWIKI
Line 185: Line 185:


<pre>
<pre>
#!/bin/sh
#!/bin/sh
#
#
Line 195: Line 196:
# Script to retrieve EPICS PV Name status using the "caget" command.
# Script to retrieve EPICS PV Name status using the "caget" command.
# Written by Mauro Giacchini ([email protected])
# Written by Mauro Giacchini ([email protected])
# Last Modified: 25-08-2007
# Last Modified: 17-11-2007
#
#
# Usage: ./check_caget.sh -pv <PV name>
# Usage: ./check_caget.sh -pv <PV name>
Line 265: Line 266:


DEBUG="0"
DEBUG="0"
#####################################################################################
# CAGET LOCATION
# This option determines where the caget executable is located.
# The default /usr/bin/caget should be made with a symbolic link
# made by root (i.e.): ln -s /opt/epics/base-3.14.9/bin/linux-x86/caget /usr/bin/caget
CAGET_LOCATION=/usr/bin/caget




Line 278: Line 289:
STATE_UNKNOWN=3  # UNKNOWNN = orange
STATE_UNKNOWN=3  # UNKNOWNN = orange


VERSION="v1.2"
VERSION="v1.3"


#####################################################################################
#####################################################################################
Line 294: Line 305:


     echo ""
     echo ""
     echo "Usage: check_caget -pv <PV name> "
     echo "Usage: check_caget_dev_gw -pv <PV name> "
     echo "Usage: check_caget -pv <PV name> -H <EPICS_CA_ADDR_LIST>"
     echo "Usage: check_caget_dev_gw -pv <PV name> -H <EPICS_CA_ADDR_LIST>"
     echo "Usage: check_caget -pv <PV name> -p <EPICS_CA_SERVER_PORT>"
     echo "Usage: check_caget_dev_gw -pv <PV name> -p <EPICS_CA_SERVER_PORT>"
     echo "Usage: check_caget -pv <PV name> -expval <EXPECTED VALUE>"
     echo "Usage: check_caget_dev_gw -pv <PV name> -expval <EXPECTED VALUE>"
     echo "Usage: check_caget [-h] [--help]"
     echo "Usage: check_caget_dev_gw [-h] [--help]"
     echo "Usage: check_caget [-V]"
     echo "Usage: check_caget_dev_gw [-V]"
     echo ""
     echo ""
}
}
Line 322: Line 333:


#####################################################################################
#####################################################################################
# Check the EPICS_BASE variable, and caget presence.
# Check the caget presence.
 


verify_caget_presence() {
verify_caget_presence() {




if ! type caget >/dev/null 2>&1; then
if ! type $CAGET_LOCATION >/dev/null 2>&1; then


echo "STATUS CRITICAL: caget not found (Did you set up the rigth one Nagios USERn?)"
echo "STATUS CRITICAL: caget not found (Did you set up the rigth one Nagios USERn? _or_ caget not found!)"
exit $STATE_CRITICAL
exit $STATE_CRITICAL
fi
fi
Line 426: Line 438:




CAGET_REPLY=`caget -a $PVNAME`
#CAGET_REPLY=`caget -a $PVNAME`
CAGET_REPLY=`$CAGET_LOCATION -a $PVNAME`


IFS=" "
IFS=" "
Line 482: Line 495:
;;
;;
esac
esac




</pre>
</pre>

Revision as of 06:00, 17 November 2007

This is a Wiki: Please add or correct things as you find them wrong, misleading or out-of date.


Here's how to install NAL using yum on Fedora Core 7 Linux box

Nagios default installation

The default nagios installation can be done by doing like so

    root> yum -y install nagios

With that I've installed nagios.i386 0:2.9-1.fc7 version; this is the last stable version (the 3.0 is under development).

Optional, but recommended if you have enough disk space: install more Nagios plugins and addons.

    root> yum -y install nagios-plugins nagios-plugins-all

That is not necessary but should be useful if you use NAL like a regular NAGIOS server more than only NAL (to monitor EPICS PVs network).


Nagios Default Folder Locations

By default Nagios yum installation, Nagios stores the following file location into your harddisk

   * /etc/nagios/ - Nagios configuration folder locations
   * /var/log/nagios/ - Nagios log and messages folder locations
   * /usr/share/nagios/ - Nagios, docs, sounds, and image folder locations
   * /usr/lib/nagios/cgi-bin/ - Nagios CGI folder location
   * /usr/bin - Nagios binaries
   * /etc/httpd/conf.d/ - Nagios Apache folder files
   * /etc/logrotate.d/nagios/ - Nagios log rotation file


Insert the EPICS Nagios Plugins

Go to http://www.nagiosexchange.org and use the "search" button to look for "epics". You will find a project named "Nagios plugins for EPICS". Scroll down that page and click on "check_caget_dev_gw.sh". Download it, save it into /usr/lib/nagios/plugins/, and make check_caget_dev_gw.sh executable:

    root> chmod  +x check_caget_dev_gw.sh

now verify that is usable with:

    > ./check_caget_dev_gw.sh --help

Verify that a PV is reachable using camonitor; in my case the PV is giacchinHost:aiExample

    > camonitor giacchinHost:aiExample

Setting the following EPICS variables avoids broadcasting on the network; in my case the values were:

    EPICS_CA_AUTO_ADDR_LIST=NO
    EPICS_CA_ADDR_LIST=127.0.0.1

I can therefore test the plugin with the following command:

    > ./check_caget_dev_gw.sh -pv giacchinHost:aiExample -H 127.0.0.1
    > STATE_OK: giacchinHost:aiExample 5 2007-11-16 15:23:18.560231  ; te: 0 sec.

If the plugin correctly reports the status of your PV, you can continue the installation.


Now install the EPICS logos images

download the epics.logo.images.tar available from the same place in http://www.nagiosexchange.org that contains : EPICSlogo.gd2 EPICSlogo.gif EPICSlogoSmall.gif

and install that:

   root> tar -xvf epics.logo.images.tar
   root> mv EPICSlogo* /usr/share/nagios/html/images/logos/

Save the original Nagios setup and replace it

Go to /etc folder and save the original setup

    root> tar cvf nagios.or.tar ./nagios/

download there the etc.nagios.tar available at same place at nagiosexchange.org

and restore the nagios folder with that:

    root> tar xvf  ./etc.nagios.tar

Note: Now look through the files in /etc/nagios and adjust them to meet your network setup requirements.


NAGIOS check configuration file

For sanity checking, make sure you verify Nagios config files. This can be done like so

    root> nagios -v /etc/nagios/nagios.cfg

The above command reports any erroneous lines in the Nagios config files.


HTTPD configuration

Check the presence of line: "include conf.d/*.conf"

in /etc/httpd/conf/httpd.conf

Check the paths into the file : /etc/httpd/conf.d/nagios.conf

Make a file named .htaccess into /usr/lib/nagios/cgi-bin/ and /usr/share/nagios/html/

which will contains:

   AuthName "Nagios Access"
   AuthType Basic
   AuthUserFile /etc/nagios/passwd
   require valid-user

Now create a nagios user with the following command:

    root> htpasswd -c /etc/nagios/passwd nagiosadmin


NAGIOS as a Linux service

Basically, at this point of basic Nagios configuration, restarting Nagios should be successful.

Reload your apache service together with your Nagios service like so

    root> service httpd restart
    root> service nagios stop
    root> service nagios start
    root> service nagios status

Open your favorite web-browser on http://localhost/nagios/

login like "nagiosadmin", give your password and enjoy!


See my nagios screen shots in action:

Nagios Service Details


NagiosServicesDetail.jpg


Nagios Alert Histogram


NagiosAlertHistogram.jpg



Conclusions

There are a lot of other interesting features that come for free when using NAGIOS; looking around, you will find many yourself. There is a cool Firefox plugin https://addons.mozilla.org/it/firefox/addon/3607 which lets you continuously monitor the PVs during regular use of the browser.

At this time Ralph Lange has realized a test to NAL at Bessy. A great acknowledgments to him, he has supported me since the idea of use Nagios born in my mind.

More information about NAL could be found at : http://www.lnl.infn.it/~epics/NAL.html You should find there a special LivEPICS version (Linux Live CD EPICS fully equipped) with NAGIOS pre-setted and ready to use.

Thank you for your attention! Please give me your feedback, and feel free to drop me an email; I'll be happy to continue working on this idea if someone is interested in using it.


Mauro Giacchini (INFN-LNL)

--MauroGiacchini 06:40, 16 Nov 2007 (CST)



The Plugin Script

/usr/lib/nagios/plugins/check_caget_dev_gw.sh script for Nagios


#!/bin/sh
#
#####################################################################################
#####################################################################################
##                           Nagios plugin to check EPICS PV Status                ##
#####################################################################################
#####################################################################################
#
# Script to retrieve EPICS PV Name status using the "caget" command.
# Written by Mauro Giacchini ([email protected])
# Last Modified: 17-11-2007
#
# Usage: ./check_caget.sh -pv <PV name>
#
# Description:
#   	This script uses caget command to retrieve the PV status. 
#
# Limitations:
# 	This script has been tested on Linux Fedora Core 6.
#
# Output:
# 	The output contains the "te" time elapsed calculated like a difference from PV's
# timestamp and the linux "date" command (suggestion: use ntp common server
# to IOCs and Nagios server box). The STATUS of the service (..of the PV)
# follow the severity rules:
#
# Severity (none) >>>> STATE_OK		# OK = green
#
# Severity MINOR  >>>> STATE_WARNING	# WARNING = yellow
#
# Severity MAJOR  >>>> STATE_CRITICAL	# CRITICAL = red
#
# PV not found    >>>> STATE_UNKNOWN	# UNKNOWNN = orange
#
# In case of Severity (none) it show the stdout of "caget -a" with appended the "te".
#
# Other notes:
#  Firefox Plugin : A FireFox extension is available to monitor Nagios server.
#  https://addons.mozilla.org/it/firefox/addon/3607
#
# Nagios configuration setup: 
# 	You need to add the command to commands.cfg
# 
# define command{
# 	command_name	check_caget
# 	command_line	$USER1$/check_caget.sh -pv $ARG1$
# 	}
#
#	And, you need to add the service to services.cfg
#
# define service{
#        use         		generic-service	;
#        host_name		IOC_Example	;
#        service_description   	aiExample	;
#        is_volatile           	0		;
#        check_period		24x7		;
#        max_check_attempts    	3		;
#        normal_check_interval 	3		;
#        retry_check_interval  	1		;
#        contact_groups        	admins		;
#        notification_interval 	120		;
#        notification_period   	24x7		;
#        notification_options  	w,u,c,r		;
#        check_command         	check_caget!rootHost:aiExample	;
#        }
#
# then place this script in the /usr/lib/nagios/plugins/ on the Nagios box server.
# Don't forget to set the right execution permission to this file.
#
# Threshold and ranges: please, have a look at:
# http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
#
# Last: This script still needs debugging and fixups (exercise for reader) :-)
#
#####################################################################################
# DEBUGGING OPTION
# Flag meant to switch debugging messages on or off.
# Values: 0 = debugging off, 1 = debugging on

DEBUG=0


#####################################################################################
# CAGET LOCATION
# Path of the caget executable used by this plugin.
# The default, /usr/bin/caget, is expected to be a symbolic link created by root,
# e.g.: ln -s /opt/epics/base-3.14.9/bin/linux-x86/caget /usr/bin/caget

CAGET_LOCATION="/usr/bin/caget"


#####################################################################################
# Script exit status

STATE_OK=0          # OK = green
STATE_WARNING=1     # WARNING = yellow
STATE_CRITICAL=2    # CRITICAL = red
STATE_UNKNOWN=3     # UNKNOWN = orange

VERSION=v1.3

#####################################################################################
# print_revision() function
# Writes the plugin identification line, ending with $VERSION, to stdout.

print_revision() {
    printf '%s %s\n' \
        "Check_caget (nagios-plugins 1.4 to nagios 2.9) (EPICS base 3.14.9)" \
        "$VERSION"
}

#####################################################################################
# print_usage() function
# Writes every supported command-line form to stdout, framed by one blank
# line before and one after.

print_usage() {
    printf '%s\n' \
        "" \
        "Usage: check_caget_dev_gw -pv <PV name> " \
        "Usage: check_caget_dev_gw -pv <PV name> -H <EPICS_CA_ADDR_LIST>" \
        "Usage: check_caget_dev_gw -pv <PV name> -p <EPICS_CA_SERVER_PORT>" \
        "Usage: check_caget_dev_gw -pv <PV name> -expval <EXPECTED VALUE>" \
        "Usage: check_caget_dev_gw [-h] [--help]" \
        "Usage: check_caget_dev_gw [-V]" \
        ""
}

#####################################################################################
# print_help() function
# Writes the full help text to stdout: the usage summary produced by
# print_usage, followed by a short description and contact details.

print_help() {
    printf '\n'
    print_usage
    printf '%s\n' \
        "" \
        "Script to retrieve the PV status for EPICS control systems." \
        "" \
        "This plugin not developped by the Nagios Plugin group." \
        "Please do not e-mail them for support on this plugin, since" \
        "they won't know what you're talking about :P" \
        "" \
        "For contact info: [email protected]" \
        "Download : http://www.lnl.infn.it/~epics/" \
        ""
}

#####################################################################################
# Check the caget presence.
#
# Globals:  CAGET_LOCATION (read) - path or command name of the caget executable
#           STATE_CRITICAL (read) - exit status used when caget is missing
# Outputs:  a one-line CRITICAL message on stdout when caget cannot be resolved
# Returns:  0 when CAGET_LOCATION resolves to a command; otherwise the whole
#           script exits with STATE_CRITICAL

verify_caget_presence() {
    # An empty CAGET_LOCATION must be rejected explicitly: with the variable
    # unquoted and empty the lookup would run with no operand at all and succeed,
    # letting the check pass without any caget configured.
    if [ -z "$CAGET_LOCATION" ] || ! command -v "$CAGET_LOCATION" >/dev/null 2>&1; then
        echo "STATUS CRITICAL: caget not found (Did you set up the rigth one Nagios USERn? _or_ caget not found!)"
        exit "$STATE_CRITICAL"
    fi
}


#####################################################################################
# Control caget plugin input parameters
#
# Recognised options:
#   -pv <PV name>              PV to query (stored in PVNAME)
#   -expval <value>            expected PV value (stored in EXPVAL)
#   -H <EPICS_CA_ADDR_LIST>    exported; also exports EPICS_CA_AUTO_ADDR_LIST=NO
#   -p <EPICS_CA_SERVER_PORT>  exported; must be an unsigned integer greater
#                              than EPICS_CA_SERVER_PORT_MIN
#   -h | --help                print the help text and exit with STATE_OK
#   -V                         print the revision and exit with STATE_OK
# Any other argument prints the usage and exits with STATE_UNKNOWN.
# Parsing stops at the first empty argument.

EXPVAL=""
EPICS_CA_ADDR_LIST=""   # Default YES
EPICS_CA_SERVER_PORT="" # Default 5064 _and_ value > 5000
EPICS_CA_SERVER_PORT_MIN="5000"

# is_unsigned_integer TEXT
# Returns 0 when TEXT is a non-empty string made only of decimal digits.
is_unsigned_integer() {
    case "$1" in
        '' | *[!0-9]*) return 1 ;;
    esac
    return 0
}

while test -n "$1"; do

    case "$1" in

    --help | -h)
        print_help
        exit "$STATE_OK"
        ;;

    -V)
        print_revision
        exit "$STATE_OK"
        ;;

    -pv)
        PVNAME=$2
        # Consume the value only if one was supplied; shifting past the end of
        # the argument list is a fatal error in strict POSIX shells.
        if [ "$#" -gt 1 ]; then
            shift
        fi
        ;;

    -expval)
        EXPVAL=$2
        if [ -z "$EXPVAL" ]; then
            echo "STATUS CRITICAL: Expected value absent"
            exit "$STATE_CRITICAL"
        fi
        shift
        ;;

    -H)
        EPICS_CA_ADDR_LIST=$2
        if [ -z "$EPICS_CA_ADDR_LIST" ]; then
            echo "STATUS CRITICAL: Expected EPICS_CA_ADDR_LIST absent"
            exit "$STATE_CRITICAL"
        fi
        export EPICS_CA_ADDR_LIST
        EPICS_CA_AUTO_ADDR_LIST="NO"
        export EPICS_CA_AUTO_ADDR_LIST
        shift
        ;;

    -p)
        EPICS_CA_SERVER_PORT=$2
        if [ -z "$EPICS_CA_SERVER_PORT" ]; then
            echo "STATUS CRITICAL: Expected EPICS_CA_SERVER_PORT absent"
            exit "$STATE_CRITICAL"
        fi
        # Reject non-numeric ports up front: a failing numeric test would
        # otherwise read as "false" and let the bad value be exported.
        if ! is_unsigned_integer "$EPICS_CA_SERVER_PORT"; then
            echo "STATUS CRITICAL: Expected EPICS_CA_SERVER_PORT is not a number"
            exit "$STATE_CRITICAL"
        fi
        if [ "$EPICS_CA_SERVER_PORT" -le "$EPICS_CA_SERVER_PORT_MIN" ]; then
            echo "STATUS CRITICAL: Expected EPICS_CA_SERVER_PORT minor than allowed (5001)"
            exit "$STATE_CRITICAL"
        fi
        export EPICS_CA_SERVER_PORT
        shift
        ;;

    *)
        echo ""
        echo "Unknow argument: $1"
        print_usage
        exit "$STATE_UNKNOWN"
        ;;

    esac
    shift
done


# Make sure caget is available before doing anything else.
verify_caget_presence

# A PV name is mandatory. The expansion is quoted so that a name containing
# spaces or glob characters is tested as a single string.
if [ -z "$PVNAME" ]; then
    echo "STATUS CRITICAL: PV Name not specified"
    exit "$STATE_CRITICAL"
fi

#####################################################################################
# FINALLY... RETRIEVING THE VALUES (caget)
#
# Runs "caget -a" on the requested PV and splits the first line of its output
# on spaces into: pvname date time value status severity
# ("severity" receives whatever remains of the line).

# Both expansions are quoted so the PV name reaches caget as one argument and
# is never glob-expanded by the shell.
CAGET_REPLY=$("$CAGET_LOCATION" -a "$PVNAME")

# IFS is overridden for this read only, leaving the shell's default word
# splitting untouched afterwards; -r keeps backslashes in the reply literal.
IFS=" " read -r pvname date time value status severity <<END
$CAGET_REPLY
END

# No first field means caget printed nothing usable on stdout.
if [ -z "$pvname" ]; then
    echo "STATE_UNKNOWN: $PVNAME not found"
    exit "$STATE_UNKNOWN"
fi

#####################################################################################
# Calculus difference between the PV timestamp and the actual time
#
# elapsed_seconds DATE TIME
#   Prints the absolute difference, in whole seconds, between the timestamp
#   "DATE TIME" (as parsed by "date --date") and the current time.
#   Prints 0 when the timestamp cannot be parsed.
#   The body runs in a subshell, so its working variables do not leak.
elapsed_seconds() (
    now=$(date +%s)
    # Fall back to "now" on a parse failure so the arithmetic below always
    # has two operands.
    stamp=$(date --date "$1 $2" +%s) || stamp=$now
    delta=$((now - stamp))
    # POSIX test instead of the bash-only (( )) command: this script runs
    # under #!/bin/sh.
    if [ "$delta" -lt 0 ]; then
        delta=$((0 - delta))
    fi
    echo "$delta"
)

te=$(elapsed_seconds "$date" "$time")
#    	echo "Time elapsed (sec.): $te"

#####################################################################################
# Output the NAGIOS status using an expected value
#
# When -expval was given, the PV value is compared with it and the script
# exits with STATE_OK on a match or STATE_CRITICAL otherwise.

# expval_is_integer TEXT
# Returns 0 when TEXT is an optionally negative decimal integer.
expval_is_integer() {
    case "$1" in
        '' | - | *[!0-9-]* | ?*-*) return 1 ;;
    esac
    return 0
}

# expval_matches VALUE EXPECTED
# Returns 0 when VALUE equals EXPECTED. Two integers are compared numerically
# (so 05 matches 5); everything else is compared as plain text. Text is never
# evaluated arithmetically, so unrelated strings such as ON and OFF cannot
# compare equal.
expval_matches() {
    if expval_is_integer "$1" && expval_is_integer "$2"; then
        # The textual fallback covers integers too large for test(1).
        [ "$1" -eq "$2" ] 2>/dev/null || [ "$1" = "$2" ]
    else
        [ "$1" = "$2" ]
    fi
}

if [ -n "$EXPVAL" ]; then
    if expval_matches "$value" "$EXPVAL"; then
        echo "STATE_OK: Expected value ($EXPVAL) to $pvname match; te: $te sec."
        exit "$STATE_OK"
    else
        echo "STATUS CRITICAL: Expected value ($EXPVAL) to $pvname didn't match"
        exit "$STATE_CRITICAL"
    fi
fi

#####################################################################################
# Output the NAGIOS status using the Severity field
#
# Severity mapping:
#   MAJOR, INVALID -> STATE_CRITICAL
#   MINOR          -> STATE_WARNING
#   anything else  -> STATE_OK (the caget fields are echoed back)
case "$severity" in

    MAJOR)
        echo "STATUS CRITICAL: $pvname in MAJOR severity status; te: $te sec."
        exit "$STATE_CRITICAL"
        ;;

    INVALID)
        # INVALID is the most severe EPICS alarm severity; without its own
        # arm it would fall through to the default case and be reported as OK.
        echo "STATUS CRITICAL: $pvname in INVALID severity status; te: $te sec."
        exit "$STATE_CRITICAL"
        ;;

    MINOR)
        echo "STATE_WARNING: $pvname in MINOR severity status; te: $te sec."
        exit "$STATE_WARNING"
        ;;

    *)
        echo "STATE_OK: $pvname $value $date $time $status ; te: $te sec."
        exit "$STATE_OK"
        ;;
esac