#!/usr/bin/perl
# -----------------------------------------------------------------------------
# check_bandwidth 1.0.0
# ~ Nagios(r) SNMP Network Traffic Monitor Plugin
# ~ Copyright 2007, Jonathan Wright
# ~ based on the check_traffic plugin by Adrian Weiczorek
#   and check_snmp_cisco_ifstatus by Altinity Limited
# -----------------------------------------------------------------------------
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307
# -----------------------------------------------------------------------------
# Note: Nagios is a registered trademark of Ethan Galstad.
# setup Perl
use strict;
use diagnostics;
use warnings;

# import modules
use Net::SNMP;

# -----------------------------------------------------------------------------
# Overview: poll one interface of an SNMP agent twice (or once, when a cached
# sample from a previous run is usable), derive the transmit/receive rate from
# the octet counters and report OK/WARNING/CRITICAL/UNKNOWN to Nagios(r).
# -----------------------------------------------------------------------------

# define the variables we're going to need and then fill them
our ($_storage, %_oid, %_status, %_options, $_cycle);
our ($session, $error, $message);

# location to store traffic information from last SNMP poll
$_storage = '/tmp/.traffic';

# SNMP object identifiers queried by this plugin
%_oid = (
    'sysDesc'       => ".1.3.6.1.2.1.1.1.0",      # generic system details
    'ifNumber'      => ".1.3.6.1.2.1.2.1.0",      # number of interfaces
    # for the following, append interface number to obtain value
    'ifDescription' => '.1.3.6.1.2.1.2.2.1.2.',   # interface description
    'ifType'        => '.1.3.6.1.2.1.2.2.1.3.',   # interface type
    'ifSpeed'       => '.1.3.6.1.2.1.2.2.1.5.',   # interface bandwidth (bits/s)
    # NOTE(review): in IF-MIB column 7 is ifAdminStatus (configured state) and
    # column 8 is ifOperStatus (actual state); the key names below are kept
    # as-is because the rest of the script refers to them.
    'ifConnected'   => '.1.3.6.1.2.1.2.2.1.7.',   # ifAdminStatus: 1 = up
    'ifEnabled'     => '.1.3.6.1.2.1.2.2.1.8.',   # ifOperStatus:  1 = up
    'ifReceived'    => '.1.3.6.1.2.1.2.2.1.10.',  # interface octets in
    'ifTransmitted' => '.1.3.6.1.2.1.2.2.1.16.'   # interface octets out
);

# status codes to return upon various conditions (as understood by Nagios(r));
# UNKNOWN must be 3 - a negative value would be reported to the shell as 255,
# which Nagios treats as an out-of-bounds return code
%_status = (
    'UNKNOWN'  => '3',
    'OK'       => '0',
    'WARNING'  => '1',
    'CRITICAL' => '2'
);

# command-line controlled options (including defaults)
%_options = (
    'community'  => 'public',
    'version'    => '2c',
    'hostname'   => '',
    'interface'  => '',
    'override'   => 0,
    'warning'    => 75,
    'critical'   => 90,
    'timeout'    => 15,
    'on-the-fly' => 0,
    'check-up'   => 0,
    'pause'      => 10,
    'use-bytes'  => 0,
    'use-mega'   => 0
);

# calculate the point at which the counter will cycle (based on 32-bit counter):
# TODO: move this code to the bandwidth calculation section and work out how to
# test whether the counter is 32-bit or 64-bit (and process accordingly).
$_cycle = ((1 << 31) * 2);

# -----------------------------------------------------------------------------
# start the program by processing the command-line options; 'defined' is used
# so that an argument of "0" or "" does not end the loop prematurely
while (defined(my $arg = shift @ARGV)) {
    # test against known arguments, and assign variables based on them
    if    ($arg =~ /^-(H|-hostname)$/)   { $_options{'hostname'}   = _next_value(); }
    elsif ($arg =~ /^-(C|-community)$/)  { $_options{'community'}  = _next_value(); }
    elsif ($arg =~ /^-(i|-interface)$/)  { $_options{'interface'}  = _next_value(); }
    elsif ($arg =~ /^-(o|-override)$/)   { $_options{'override'}   = _next_value(); }
    elsif ($arg =~ /^-(w|-warning)$/)    { $_options{'warning'}    = uc(_next_value()); }
    elsif ($arg =~ /^-(c|-critical)$/)   { $_options{'critical'}   = uc(_next_value()); }
    elsif ($arg =~ /^-(t|-timeout)$/)    { $_options{'timeout'}    = _next_value(); }
    elsif ($arg =~ /^-(p|-pause)$/)      { $_options{'pause'}      = _next_value(); }
    elsif ($arg =~ /^-(f|-on-the-fly)$/) { $_options{'on-the-fly'} = 1; }
    elsif ($arg =~ /^-(u|-check-up)$/)   { $_options{'check-up'}   = 1; }
    elsif ($arg =~ /^-(b|-use-bytes)$/)  { $_options{'use-bytes'}  = 1; }
    elsif ($arg =~ /^-(m|-use-mega)$/)   { $_options{'use-mega'}   = 1; }
    elsif ($arg =~ /^-(h|-help)$/)       { usage(1); }
    else {
        # if argument is unknown, then output error and exit
        print "Unknown option: $arg\n";
        usage(0);
    }
}

# before we use the variables/options from the command line, we need to make
# sure they are in the correct format and are usable first (checked in a fixed
# order so that the first error reported is deterministic)
foreach my $key ('hostname', 'interface') {
    issue('No value given for --'.$key) if ($_options{$key} eq '');
}
# single-label names (e.g. "localhost") are accepted as well as dotted ones
issue('Invalid hostname given ('.$_options{'hostname'}.')')
    unless (lc($_options{'hostname'}) =~ /^[a-z0-9_-]+(\.[a-z0-9_-]+)*\z/);
# the interface is appended to OIDs, compared numerically and used as a file
# name inside the cache directory, so it has to be a plain number
issue('Invalid value given for --interface')
    unless ($_options{'interface'} =~ /^[0-9]+\z/);
foreach my $key ('warning', 'critical') {
    issue('Invalid value given for --'.$key)
        unless ($_options{$key} =~ /^[0-9]+[KMGTEP]?(,[0-9]+[KMGTEP]?)?\z/);
}
foreach my $key ('override', 'timeout', 'pause') {
    issue('Invalid value given for --'.$key)
        unless ($_options{$key} =~ /^[0-9]+\z/);
}

# begin by trying to establish a connection with the SNMP interface with the
# device under the community given.
($session, $error) = Net::SNMP->session(
    -timeout   => $_options{'timeout'},
    -version   => $_options{'version'},
    -hostname  => $_options{'hostname'},
    -community => $_options{'community'}
);

# session() returns an undefined object on failure; bail out before any
# method is called on it
critical('SNMP agent not responding ('.$error.'): Check settings & try again')
    unless (defined($session));

# if we cannot get the sysDesc OID, assume that the attempt has failed
critical('SNMP agent not responding ('.$session->error().'): Check settings & try again')
    unless (defined($session->get_request(-varbindlist => [ $_oid{'sysDesc'} ])));

# find the interface on the system - append the interface number on the end
# of the ifType, etc. OID's. first, find the number of interfaces and make
# sure we're within the available limit
# NOTE(review): this assumes interface numbers never exceed ifNumber - confirm
# for agents that use sparse interface indexes.
my $return = $session->get_request(-varbindlist => [ $_oid{'ifNumber'} ]);
critical('Cannot get number of interfaces from SNMP agent')
    unless (defined($return));
critical(
    'Interface value invalid: '.$return->{$_oid{'ifNumber'}}.' available; '.
    'trying to use interface '.$_options{'interface'}
) if ($_options{'interface'} > $return->{$_oid{'ifNumber'}});

# run a check against the interface if requested
check_up() if ($_options{'check-up'});

# unless on-the-fly requested, try to find and open the cache file for the
# interface, retrieving the previous values
my $cache = $_storage.'/'.$_options{'hostname'}.'/'.$_options{'interface'};
my (%count, %recount);
my $need_sample = 1;

if (!$_options{'on-the-fly'} && -e $cache) {
    # open the file and get the contents
    open(my $in, '<', $cache)
        or critical('Cannot open interface cache file for reading');
    while (my $line = <$in>) {
        # split each line at the first colon only (the description may contain
        # colons itself) and add it into the %count hash
        chomp($line);
        my ($key, $value) = split(/:/, $line, 2);
        next unless (defined($key) && defined($value));
        $count{$key} = $value;
    }
    close($in);

    # only trust the cached sample when it is complete and recent enough
    $need_sample = 0 if (_cache_usable(%count));
}

if ($need_sample) {
    # fetch the initial set of results from the device before sleeping for the
    # allotted time to calculate the bandwidth value
    %count = get_count();
    sleep($_options{'pause'});
}

# fetch the next set of values from the device which will be calculated
# against the original set to work out bandwidth
%recount = get_count();

# close the SNMP session now, as it's no longer needed (and forget it, so the
# status handlers below do not try to close it a second time)
$session->close();
undef $session;

# save the data from %recount above back into the cache file, but first check
# that the required directories exist so that we can write the file
unless ($_options{'on-the-fly'}) {
    # check (and make) the main root directory for the storage
    mkdir($_storage) or critical('Cannot create storage directory')
        unless (-e $_storage);
    # check (and make) the hostname directory for the interface files
    mkdir($_storage.'/'.$_options{'hostname'})
        or critical('Cannot create storage directory for hostname')
        unless (-e $_storage.'/'.$_options{'hostname'});
    # save the data, one file per interface
    open(my $out, '>', $cache)
        or critical('Cannot open interface cache file for writing');
    foreach my $key (keys %recount) {
        print {$out} $key.":".$recount{$key}."\n";
    }
    close($out);
}

# calculate the amount of data transferred between the two samples; the raw
# counters in %recount are left untouched so they can be reported as-is
my $rx_octets = _counter_delta($count{'ifReceived'},    $recount{'ifReceived'});
my $tx_octets = _counter_delta($count{'ifTransmitted'}, $recount{'ifTransmitted'});

# a rate cannot be derived from two samples taken within the same second
my $elapsed = $recount{'ifTimeStamp'} - $count{'ifTimeStamp'};
unknown('Samples are less than one second apart: cannot calculate bandwidth')
    if ($elapsed <= 0);

# calculate bandwidth used (the counters are in bytes, convert to bits as all
# internal data and calculations should be done in bits)
my $rx = (8 * $rx_octets) / $elapsed;
my $tx = (8 * $tx_octets) / $elapsed;

# create the text report which we're going to send back to Nagios(r) (and can be
# read by the admin via the site or via a notice); adjust() expects bits, so
# every octet count is multiplied by 8 before being handed over
my $report = sprintf(
    # Nagios(r) Plugin Report: 'Status Information|Performance Data'
    'Out: %sps; In: %sps (Sent %s, Received %s in %s seconds)|Port %s (Current Sent: %s; Current Received: %s)',
    adjust($tx),
    adjust($rx),
    adjust(8 * $tx_octets),
    adjust(8 * $rx_octets),
    $elapsed,
    $recount{'ifDescription'},
    adjust(8 * $recount{'ifTransmitted'}),
    adjust(8 * $recount{'ifReceived'})
);

# test the values calculated against the warning and critical limits
# given and therefore work out which status should be reported to Nagios.
# To minimise duplication of code, create a hash table with the keys as the
# levels, pointing to the sub-routines which will report the status back
my %levels = (
    'critical' => \&critical,
    'warning'  => \&warning
);

# process each level in a fixed order - CRITICAL first, as it's higher than
# WARNING and must win when both limits have been passed - and break down the
# limits into their tx/rx values so we can test them against the rates
foreach my $level ('critical', 'warning') {
    my ($tx_limit, $rx_limit);
    # check if we have two separate values for tx/rx limits at this level
    if ($_options{$level} =~ /^[0-9]+[KMGTEP]?,[0-9]+[KMGTEP]?$/) {
        # yep, so split
        ($tx_limit, $rx_limit) = split(',', $_options{$level});
    } else {
        # nope, so duplicate
        ($tx_limit, $rx_limit) = ($_options{$level}, $_options{$level});
    }
    # if the values have no suffix (i.e. K, M or G), then they're going to be
    # %age values - re-set the limit values to %age of available bandwidth
    $tx_limit = _percent_limit($tx_limit) if ($tx_limit =~ /^[0-9]+$/);
    $rx_limit = _percent_limit($rx_limit) if ($rx_limit =~ /^[0-9]+$/);
    # then run tests against each of the tx and rx values, triggering the
    # sub-routine for this level if either of them trigger
    $levels{$level}->($report)
        if (test($tx_limit, $tx) or test($rx_limit, $rx));
}

# if we've reached this stage, no errors have been triggered and so
# it's safe to report that everything is OK. Return information and exit
# with OK status value.
print 'OK '.$report."\n";
exit($_status{'OK'});

# -----------------------------------------------------------------------------
# fetch the value belonging to the option currently being parsed; returns an
# empty string when the command line ends early so validation can report it
sub _next_value {
    my $value = shift @ARGV;
    return (defined($value) ? $value : '');
}

# decide whether the sample read from the cache file (passed as a key/value
# list) can be used as the first of the two samples; returns 1 or 0
sub _cache_usable {
    my (%cached) = @_;
    # all counters and the timestamp must be present and numeric
    foreach my $key ('ifTimeStamp', 'ifReceived', 'ifTransmitted') {
        return 0 unless (defined($cached{$key}) && $cached{$key} =~ /^[0-9]+$/);
    }
    # the speed decides how quickly the counter can cycle
    my $speed = ($_options{'override'} ? $_options{'override'} : $cached{'ifSpeed'});
    return 0 unless (defined($speed) && $speed =~ /^[0-9]+$/ && $speed > 0);
    # make sure that there is no chance that the counter has cycled around
    # (i.e. the counter hasn't cycled based on the speed of the interface)
    # TODO: this may be too short - look into coding comparison between
    # this and a minimum period the data should be valid for (10m)
    my $age = time - $cached{'ifTimeStamp'};
    return 0 unless ($age > 0 && $age < ($_cycle / ($speed / 8)));
    # make sure we have values for tx and rx
    return ((($cached{'ifReceived'} > 0) && ($cached{'ifTransmitted'} > 0)) ? 1 : 0);
}

# difference between two readings of the same counter; if the newer reading
# is less than the older one the counter has cycled, so the maximum possible
# value (at which the counter cycled) is added to the newer reading first
sub _counter_delta {
    my ($older, $newer) = @_;
    $newer += $_cycle if ($newer < $older);
    return ($newer - $older);
}

# convert a percentage limit into an absolute rate based on the interface
# speed; test() compares in bytes when --use-bytes is set, so the limit is
# converted to bytes in that case as well
sub _percent_limit {
    my ($percent) = @_;
    my $limit = $recount{'ifSpeed'} * ($percent / 100);
    $limit = $limit / 8 if ($_options{'use-bytes'});
    return $limit;
}

# print a status line, close any open SNMP session and exit with the status
# code registered for the given label (a key of %_status)
sub _report_and_exit {
    my ($label, $text) = @_;
    # set a default if no message was given
    $message = ((defined($text) && $text ne '') ? $text : 'No error message given');
    print "$label $message\n";
    # the session may not exist (failed to connect) or may already be closed
    $session->close() if (defined($session));
    exit($_status{$label});
}

# handle standard error message (due to incorrect configuration): prints the
# message, if any, followed by the short usage text, then exits via usage()
sub issue {
    $message = shift;
    print "ERR - $message\n" unless (!defined($message) || $message eq '');
    usage(0);
}

# handle warning error messages: print and exit with WARNING status
sub warning {
    _report_and_exit('WARNING', shift);
}

# handle critical error messages: print and exit with CRITICAL status
sub critical {
    _report_and_exit('CRITICAL', shift);
}

# handle conditions under which no result can be calculated: print and exit
# with UNKNOWN status
sub unknown {
    _report_and_exit('UNKNOWN', shift);
}

# run the SNMP query to get the tx/rx data from the interface on the device;
# returns a hash (as a list) with the keys ifTimeStamp, ifSpeed, ifReceived,
# ifTransmitted and ifDescription
sub get_count {
    # prepare the list of OIDs that we're going to send to the SNMP agent
    my (@list) = (
        $_oid{'ifSpeed'}.$_options{'interface'},        # interface speed
        $_oid{'ifReceived'}.$_options{'interface'},     # received octets
        $_oid{'ifTransmitted'}.$_options{'interface'},  # transmitted octets
        $_oid{'ifDescription'}.$_options{'interface'}   # interface name/description
    );
    # run the SNMP query, and throw a CRITICAL error message if we can't get
    # any data back from the device
    my $result = $session->get_request(-varbindlist => [@list]);
    critical('Values unavailable for interface '.$_options{'interface'})
        unless (defined($result));
    # create and return a hash table with all the values returned
    my %values = (
        # the time of the data needs to be recorded along with the data itself
        'ifTimeStamp'   => time,
        # if the speed has been overridden at the command line, ignore the value
        # from the SNMP data returned by the device
        'ifSpeed'       => ($_options{'override'} ? $_options{'override'} : $result->{$list[0]}),
        # use the remaining data as normal
        'ifReceived'    => $result->{$list[1]},
        'ifTransmitted' => $result->{$list[2]},
        'ifDescription' => $result->{$list[3]}
    );
    return %values;
}

# run an SNMP query to check that the network interface is connected and
# enabled; exits with CRITICAL status unless both report up
sub check_up {
    # prepare the list of OIDs that we're going to send to the SNMP agent
    my (@list) = (
        $_oid{'ifConnected'}.$_options{'interface'},  # interface connected
        $_oid{'ifEnabled'}.$_options{'interface'},    # interface enabled
    );
    # run the SNMP query, and throw a CRITICAL error message if we can't get
    # any data back from the device
    my $result = $session->get_request(-varbindlist => [@list]);
    critical('Values unavailable for interface '.$_options{'interface'})
        unless (defined($result));
    # both objects are enumerations where 1 means up and every other value
    # (2 = down, 3 = testing, ...) means not up, so the values cannot simply
    # be tested for truth - "down" is 2, which is a true value in Perl
    my @up = map {
        (defined($result->{$_}) && $result->{$_} =~ /^[0-9]+$/ && $result->{$_} == 1) ? 1 : 0
    } @list;
    # throw a CRITICAL error message unless both values are up
    # (i.e. the interface is not connected and/or not enabled)
    critical(
        'Interface '.$_options{'interface'}.' not enabled ('.
        ($up[0] ? 'Up' : 'Down').', '.
        ($up[1] ? 'Up' : 'Down').')'
    ) unless ($up[0] && $up[1]);
    return;
}

# take a bits value and convert it into human-readable format for output
# to Nagios(r) (doesn't change the original data used for calculations);
# returns a string such as "12.50Mb" (or "...B" with --use-bytes)
sub adjust {
    my ($bits) = @_;
    # get the value we're going to report (and convert it into bytes if
    # requested by the --use-bytes command-line argument)
    my $value = ($_options{'use-bytes'} ? $bits / 8 : $bits);
    # the suffix starts out empty on every call, so small values are printed
    # without a multiplier and nothing is carried over from an earlier call
    my $ext = '';
    if ($_options{'use-mega'}) {
        # if the --use-mega command-line option has been supplied, force all
        # conversion to multiples of megabyte or megabit.
        $value = ($value / (1024 * 1024));
        $ext   = 'M';
    } else {
        # otherwise keep dividing by 1024 while we still have suffixes
        # available, until we have a human-readable number (below 1024)
        my (@exts) = qw(K M G T P);
        while ($value >= 1024 && scalar(@exts) > 0) {
            $value = ($value / 1024);
            $ext   = shift(@exts);
        }
    }
    # return the value, formatting to 2 decimal places and correct terminology
    # for bits and bytes
    return sprintf('%0.2f%s', $value, $ext.($_options{'use-bytes'} ? 'B' : 'b'));
}

# take the limit and the value (in bits per second) and compare to see if
# we've passed it; the limit is either a plain number or a number followed by
# a K/M/G/T/P multiplier suffix. Returns true only if the value is above it.
sub test {
    my ($limit, $value) = @_;
    # multipliers for the suffixes; 'E' keeps the multiplier it has always had
    # in this plugin (the same as 'T') so existing configurations still work
    my %multiplier = (
        'K' => 1024,
        'M' => 1024**2,
        'G' => 1024**3,
        'T' => 1024**4,
        'E' => 1024**4,
        'P' => 1024**5
    );
    # convert the value into bytes if requested by the --use-bytes
    # command-line argument
    $value = $value / 8 if ($_options{'use-bytes'});
    # if the limit has a suffix, it is not yet an absolute value which can be
    # compared - convert it based on the suffix
    if ($limit =~ /^([0-9]+)([KMGTEP])$/) {
        $limit = $1 * $multiplier{$2};
    }
    # test the value against the limit - only return true if the value has
    # passed the limit
    return ($value > $limit);
}

# -----------------------------------------------------------------------------
# program usage guidelines: prints the short usage text, or the full help when
# called with 1, and always exits with UNKNOWN status
sub usage {
    # get and test to see if we need to display the full help output
    my ($flag) = @_;
    my $full = ((defined($flag) && $flag == 1) ? 1 : 0);
    # output about and usage (depending on $full above)
    print "check_bandwidth v1.0.0\n Copyright 2007 Jonathan Wright \n\n".
          "Poll (via SNMP) a network port and calculate bandwidth usage\n\n" if $full;
    print "Usage: check_bandwidth --hostname hostname --interface number [--community name]\n".
          " [--timeout seconds] [--pause seconds] [--warning value]\n".
          " [--critical value] [--check-up] [--on-the-fly] [--use-bytes] [--use-mega]\n";
    print " see --help for further information\n" unless $full;
    print "\nOptions:\n".
          " -h, --help\n".
          " Display this help message\n".
          " -H, --hostname STRING\n".
          " IP address or hostname of (remote) device to query\n".
          " -C, --community STRING (default ".$_options{'community'}.")\n".
          " SNMP community name to use for polling\n".
          " -t, --timeout INTEGER (default ".$_options{'timeout'}."s)\n".
          " Set the timeout value for communications with host via SNMP\n".
          " -i, --interface INTEGER\n".
          " Number of the interface to be queried\n".
          " -o, --override INTEGER (default ".($_options{'override'}?$_options{'override'}:'Off').")\n".
          " Override the maximum throughput available on selected port\n".
          " -f, --on-the-fly (default ".($_options{'on-the-fly'}?'On':'Off').")\n".
          " Perform all calculation of bandwidth on-the-fly and don't use store\n".
          " -p, --pause INTEGER (default ".$_options{'pause'}."s)\n".
          " Set the length of the pause when calculating results on-the-fly\n".
          " (or between first checks when no cache value exists)\n".
          " -u, --check-up (default ".($_options{'check-up'}?'On':'Off').")\n".
          " Check that the interface is connected and enabled before polling\n".
          " -w, --warning INTEGER[,INTEGER] (default ".$_options{'warning'}.")\n".
          " Value at which to trigger WARNING. Default is %age of available bandwidth\n".
          " Single value is for both TX & RX - to specify different limits, use TX,RX\n".
          " (if K/M/G appended, treated as absolute value in bits (or bytes with --use-bytes))\n".
          " -c, --critical INTEGER[,INTEGER] (default ".$_options{'critical'}.")\n".
          " Value at which to trigger CRITICAL. Same semantics as --warning above\n".
          " -b, --use-bytes (default ".($_options{'use-bytes'}?'On':'Off').")\n".
          " Use bytes instead of bits in all calculations (i.e. Megabytes not Megabits)\n".
          " -m, --use-mega (default ".($_options{'use-mega'}?'On':'Off').")\n".
          " Force use of Megabit/Megabyte in all output (don't use Kilo or Giga)\n".
          "\n" if $full;
    exit($_status{'UNKNOWN'});
}