#!/usr/bin/env python
#
# Copyright (c) 2014 Catalyst.net Ltd
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

"""
Check various aspects of a running BIRD daemon (interface states, several routing protocols) by interrogating the
daemon via its control socket. Intended to be run from nagios.

Michael Fincham.
"""

import argparse
import re
import socket
import sys

BIRD_CONTROL_SOCKET = "/run/bird/bird.ctl"

NAGIOS_OK = 0
NAGIOS_WARNING = 1
NAGIOS_CRITICAL = 2
NAGIOS_UNKNOWN = 3

# Names accepted for the positional 'check' argument, in help-text order.
CHECKS = ('interfaces', 'bfd', 'ospf', 'bgp', 'single_protocol')

# A response is considered complete once a line starting with one of these
# reply codes has been received. Kept as bytes because that is what recv()
# returns on Python 3 (on Python 2 bytes and str are the same type).
_REPLY_TERMINATORS = (b"\n0000", b"\n8003", b"\n0013", b"\n9001", b"\n8001")


class BirdChecker(object):
    """
    Runs 'show ...' commands against a BIRD control socket and turns the
    replies into nagios (return_code, description) tuples.

    control_socket -- filesystem path of the BIRD control socket
    ignore         -- compiled regular expression; protocol names it matches
                      are dropped from parsed results. None disables filtering.
    """

    def __init__(self, control_socket, ignore):
        self.control_socket = control_socket
        self.ignore = ignore

    # mostly from pybird.py
    def _send_query(self, query):
        """
        Open a socket to the BIRD control socket, send the query and get
        the raw response.

        Returns the response as text. Raises socket.error if the socket cannot
        be used and ValueError if BIRD closes the connection before a
        terminating reply code has been seen.
        """

        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            sock.connect(self.control_socket)
            # sendall() retries until the whole command has been written.
            sock.sendall(("%s\n" % query).encode("utf-8"))

            data = b""
            while not any(marker in data for marker in _REPLY_TERMINATORS):
                chunk = sock.recv(1024)
                if not chunk:  # peer closed the connection mid-response
                    raise ValueError("Could not read additional data from BIRD")
                data += chunk
        finally:
            # Always release the descriptor, including on connect/recv errors.
            sock.close()

        if not isinstance(data, str):  # Python 3: bytes -> text
            data = data.decode("utf-8", "replace")
        return data

    @staticmethod
    def _split_record(line, columns):
        """
        Split a whitespace separated record into a dict keyed by 'columns'.

        Missing trailing columns become '' and tokens beyond the last column
        are discarded. Returns None for a line with no fields at all.
        """

        fields = line.strip().split(None, len(columns))
        if not fields:
            return None
        fields.extend([''] * (len(columns) - len(fields)))
        return dict(zip(columns, fields))

    def _filter_ignored(self, items):
        """
        Return 'items' without the keys matched by the ignore expression.
        """

        if self.ignore is None:
            return items
        return {k: v for k, v in items.items() if not self.ignore.search(k)}

    def _interface_flags(self):
        """
        Parse the output of 'show interfaces' and return a dict containing all
        interfaces with their set flags.

        The raw data looks like:

            1001-br-nat up (index=19)
            1004- MultiAccess Broadcast Multicast AdminUp LinkUp MTU=1500
            1003- 192.0.2.X/32 (Primary, scope univ)
              192.0.2.X/32 (Unselected, scope univ)
              192.0.2.X/32 (Unselected, scope univ)
              192.0.2.X/32 (Unselected, scope univ)

        Will be turned in to:

            {'br-nat': ['MultiAccess', 'Broadcast', 'Multicast', 'AdminUp', 'LinkUp', 'MTU=1500']}
        """

        interfaces = {}
        current_interface = None

        for line in self._send_query('show interfaces').splitlines():
            if line.startswith('1001-'):  # line describes the interface
                name = line[5:].split()
                current_interface = name[0] if name else None
            elif line.startswith('1004-') and current_interface is not None:
                # line describes the interface flags; drop the leading code
                interfaces[current_interface] = line.strip().split()[1:]

        return interfaces

    def _generic_protocol_properties(self, command, protocol_id, columns, index_column):
        """
        Parse the output of an arbitrary command that returns output similar to 'show bfd sessions'
        and return a dict containing all protocols' sessions with their properties.

        The raw data looks like (values are illustrative):

            1020-bfd1:
             IP address  Interface  State  Since       Interval  Timeout
             192.0.2.1   eth0       Up     2014-10-06  0.100     0.500
             192.0.2.2   eth1       Up     2014-09-24  0.100     0.500

        Will be turned in to:

            {'bfd1': {'192.0.2.1': {'interface': 'eth0',
                                    'interval': '0.100',
                                    'since': '2014-10-06',
                                    'state': 'Up',
                                    'timeout': '0.500'},
                      '192.0.2.2': {'interface': 'eth1',
                                    'interval': '0.100',
                                    'since': '2014-09-24',
                                    'state': 'Up',
                                    'timeout': '0.500'}}}

        When called as:

            _generic_protocol_properties('show bfd sessions', 1020, ('ip_address', 'interface', 'state', 'since', 'interval', 'timeout'), 'ip_address')

        Records with fewer fields than 'columns' get '' for the missing ones.
        Protocols whose name matches the ignore expression are left out.
        """

        header_prefix = '%i-' % protocol_id
        protocols = {}
        current_protocol = None
        skip_line = False

        for line in self._send_query(command).splitlines():
            if skip_line:  # the column headers following a protocol header
                skip_line = False
                continue

            if line.startswith(header_prefix):  # line marking the start of a protocol
                header = line[len(header_prefix):].split()
                if not header:
                    continue
                current_protocol = header[0][:-1]  # strip the trailing ':'
                protocols[current_protocol] = {}
                skip_line = True
            elif line.startswith(' ') and current_protocol is not None:
                # hopefully a protocol record; records seen before any
                # protocol header cannot be attributed and are skipped
                record = self._split_record(line, columns)
                if record is None:
                    continue
                key = record.pop(index_column)
                protocols[current_protocol][key] = record

        return self._filter_ignored(protocols)

    def _protocol_properties(self):
        """
        Parse the output of 'show protocols' and return a dict containing all protocols' sessions with their properties.

        The raw data looks like:

            2002-name     proto    table    state  since       info
            1002-bogons   Static   master   up     2014-10-29
             kernel1  Kernel   master   up     2014-10-29
             device1  Device   master   up     2014-10-29
             edge     Direct   master   up     2014-10-29
             bfd1     BFD      master   up     2014-10-29
             aggregates Static   master   up     2014-10-29
             blackholes Static   master   up     2014-10-29
             core     OSPF     master   up     2014-10-29  Running
            0000

        Will be turned in to:

            {'aggregates': {'info': '',
                            'proto': 'Static',
                            'since': '2014-10-29',
                            'state': 'up',
                            'table': 'master'},
             (and so on...)

        Only the first whitespace separated token of the info column is kept.
        Protocols whose name matches the ignore expression are left out.
        """

        columns = ('name', 'proto', 'table', 'state', 'since', 'info')
        instances = {}

        for line in self._send_query('show protocols').splitlines():
            if line.startswith('1002-'):  # first record carries the reply code
                line = line[5:]
            elif not line.startswith(' '):  # column names, banner, terminator
                continue

            record = self._split_record(line, columns)
            if record is None:
                continue
            instances[record.pop('name')] = record

        return self._filter_ignored(instances)

    def check_interfaces(self):
        """
        Check that all interfaces are either up or disabled, returns a tuple of (nagios_code, reason).
        """

        disabled = []
        down = []
        up = []
        unknown = []

        for interface, flags in self._interface_flags().items():
            if 'AdminDown' in flags:
                disabled.append(interface)
            elif 'LinkDown' in flags:
                down.append(interface)
            elif 'LinkUp' in flags:
                up.append(interface)
            else:
                unknown.append(interface)

        if down:
            return (NAGIOS_CRITICAL, '%i down (%s)' % (len(down), ', '.join(down)))

        if unknown:
            return (NAGIOS_UNKNOWN, '%i unknown (%s)' % (len(unknown), ', '.join(unknown)))

        if up and disabled:
            return (NAGIOS_OK, '%i up, %i disabled' % (len(up), len(disabled)))
        if up:
            return (NAGIOS_OK, '%i up' % len(up))
        return (NAGIOS_OK, 'no interfaces')

    def check_bfd(self):
        """
        Check that all configured BFD sessions are 'up', returns a tuple of (nagios_code, reason).
        """

        down = []
        up = []

        bfd_sessions = self._generic_protocol_properties(
            'show bfd sessions', 1020,
            ('ip_address', 'interface', 'state', 'since', 'interval', 'timeout'),
            'ip_address')

        for sessions in bfd_sessions.values():
            for neighbor, properties in sessions.items():
                if properties['state'] != 'Up':
                    down.append(neighbor)
                else:
                    up.append(neighbor)

        if down:
            return (NAGIOS_CRITICAL, '%i down (%s)' % (len(down), ', '.join(down)))
        if up:
            return (NAGIOS_OK, '%i up' % len(up))
        return (NAGIOS_OK, 'BFD not running')

    def check_ospf(self):
        """
        Check that all configured OSPF neighbors are 'Full', returns a tuple of (nagios_code, reason).
        """

        down = []
        up = []

        ospf_neighbors = self._generic_protocol_properties(
            'show ospf neighbors', 1013,
            ('router_id', 'pri', 'state', 'dtime', 'interface', 'router_ip'),
            'router_id')

        for sessions in ospf_neighbors.values():
            for neighbor, properties in sessions.items():
                if not properties['state'].lower().startswith('full/'):
                    down.append(neighbor)
                else:
                    up.append(neighbor)

        if down:
            return (NAGIOS_CRITICAL, '%i down (%s)' % (len(down), ', '.join(down)))
        if up:
            return (NAGIOS_OK, '%i up' % len(up))
        return (NAGIOS_OK, 'OSPF not running')

    def check_bgp(self):
        """
        Check that all configured BGP neighbors are 'Established', returns a tuple of (nagios_code, reason).

        BGP instances whose state is 'down' are counted as disabled rather
        than as failures.
        """

        down = []
        up = []
        disabled = []

        for instance, properties in self._protocol_properties().items():
            if properties['proto'] != 'BGP':
                continue

            if properties['state'] == 'down':
                disabled.append(instance)
            elif properties['info'] != 'Established':
                down.append(instance)
            else:
                up.append(instance)

        if down:
            return (NAGIOS_CRITICAL, '%i down (%s)' % (len(down), ', '.join(down)))

        if up and disabled:
            return (NAGIOS_OK, '%i up, %i disabled' % (len(up), len(disabled)))
        if up:
            return (NAGIOS_OK, '%i up' % len(up))
        return (NAGIOS_OK, 'BGP not running')

    def check_proto(self, protocol):
        """
        Check that the queried protocol is 'up', returns a tuple of (nagios_code, reason).

        A protocol in state 'down' is reported as disabled (OK); any other
        state than 'up' is CRITICAL, as is a protocol that is not listed.
        """

        protocol_instances = self._protocol_properties()

        if protocol not in protocol_instances:
            return (NAGIOS_CRITICAL, "%s not found" % protocol)

        properties = protocol_instances[protocol]

        if properties['state'] == 'down':
            return (NAGIOS_OK, "%s is disabled" % (protocol))

        if properties['state'] != 'up':
            return (NAGIOS_CRITICAL, "%s is not up (%s)\nInfo: %s" % (protocol, properties["state"], properties["info"]))

        return (NAGIOS_OK, "%s is up\nInfo: %s" % (protocol, properties["info"]))


def main(argv=None):
    """
    Parse the command line, run the requested check, print its description
    and return the nagios status code.

    argv -- argument list without the program name; None means sys.argv[1:].

    Usage problems detected here and failures to talk to BIRD are reported
    as NAGIOS_UNKNOWN rather than as a traceback.
    """

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('check', choices=CHECKS, help='which check to run')
    parser.add_argument('--control-socket', default=BIRD_CONTROL_SOCKET, help='location of BIRD control socket, defaults to %s' % BIRD_CONTROL_SOCKET)
    parser.add_argument('--ignore', default=None, help='if supplied, a regular expression of protocol names which should be ignored')
    parser.add_argument("--protocol", default=None, help="the protocol to check if used with 'single_protocol'")
    args = parser.parse_args(argv)

    ignore = None
    if args.ignore is not None:
        try:
            ignore = re.compile(args.ignore)
        except re.error as error:
            # Silently dropping a broken filter would alert on protocols the
            # operator asked to have ignored.
            sys.stdout.write("error: invalid --ignore expression: %s\n" % error)
            return NAGIOS_UNKNOWN

    if args.check == 'single_protocol' and args.protocol is None:
        sys.stdout.write("error: --protocol is required with 'single_protocol'\n")
        return NAGIOS_UNKNOWN

    checker = BirdChecker(args.control_socket, ignore)
    checks = {
        'interfaces': checker.check_interfaces,
        'bfd': checker.check_bfd,
        'ospf': checker.check_ospf,
        'bgp': checker.check_bgp,
        'single_protocol': lambda: checker.check_proto(args.protocol),
    }

    if args.check not in checks:
        # this will only happen if not all possible choices in argparse are implemented
        raise NotImplementedError("no check matching '%s'" % args.check)

    try:
        return_code, description = checks[args.check]()
    except socket.error:
        sys.stdout.write("error: could not connect to bird\n")
        return NAGIOS_UNKNOWN
    except ValueError as error:  # BIRD closed the connection mid-response
        sys.stdout.write("error: %s\n" % error)
        return NAGIOS_UNKNOWN

    sys.stdout.write("%s\n" % description)
    return return_code


if __name__ == "__main__":
    sys.exit(main())