0
0
Fork 0
mirror of https://github.com/alerta/alerta.git synced 2025-01-24 17:29:39 +00:00
alerta_alerta/sbin/alert-checker.py
2012-09-18 12:45:55 +01:00

219 lines
7.7 KiB
Python
Executable file

#!/usr/bin/env python
########################################
#
# alert-checker.py - Alert Nagios Check
#
########################################
import os
import sys
import optparse
try:
import json
except ImportError:
import simplejson as json
import stomp
import subprocess
import shlex
import time
import datetime
import logging
import uuid
import re
# Release identifier; shown by the --version option.
__version__ = '1.1.5'

# Message broker endpoints as (host, port) pairs - list of brokers for failover.
BROKER_LIST = [
    ('monitoring.guprod.gnl', 61613),
    ('localhost', 61613),
]

# Destination that alert messages are sent to.
ALERT_QUEUE = '/queue/alerts'

# Durations, in seconds.
DEFAULT_TIMEOUT = 24 * 60 * 60  # 86400 = one day; default for --timeout
EXPIRATION_TIME = 10 * 60       # 600 = 10 minutes; used for the message 'expires' header

# Directory joined in front of the command given with --nagios.
NAGIOS_PLUGINS = '/usr/lib64/nagios/plugins'

# File that log records are written to.
LOGFILE = '/var/log/alerta/alert-checker.log'
# Command-line options
def _build_parser():
    """Create the optparse parser that describes this script's command line.

    Options are registered in the order listed in ``specs``, which is also
    the order in which they appear in the generated help text.
    """
    built = optparse.OptionParser(
        version="%prog " + __version__,
        description=(
            "Alert Nagios Check - runs a Nagios plug-in and sends the result "
            "to the alerting system. Alerts have a resource (including service "
            "and environment), event name, value and text. A severity of "
            "'normal' is used if none given. Tags and group are optional."
        ),
        epilog=(
            'alert-checker.py --nagios "check_procs -w 10 -c 20 --metric=CPU" '
            '--resource Sever1 --event ProcStatus --group OS --env PROD '
            '--svc Discussion'
        ),
    )

    # (option strings, keyword arguments for add_option)
    specs = [
        (("-n", "--nagios"),
         dict(dest="nagios",
              help="Nagios check command line")),
        (("-r", "--resource"),
         dict(dest="resource",
              help="Resource under alarm eg. hostname, network device, application.")),
        (("-e", "--event"),
         dict(dest="event",
              help="Event name eg. HostAvail, PingResponse, AppStatus")),
        (("-g", "--group"),
         dict(dest="group",
              help="Event group eg. Application, Backup, Database, HA, Hardware, Job, Network, OS, Performance, Security")),
        # The next three may be repeated; each occurrence is appended to a list.
        (("-E", "--environment"),
         dict(action="append",
              dest="environment",
              help="Environment eg. PROD, REL, QA, TEST, CODE, STAGE, DEV, LWP, INFRA")),
        (("-S", "--svc", "--service"),
         dict(action="append",
              dest="service",
              help="Service eg. R1, R2, Discussion, Soulmates, ContentAPI, MicroApp, FlexibleContent, Mutualisation, SharedSvcs")),
        (("-T", "--tag"),
         dict(action="append",
              dest="tags",
              help="Tag the event with anything and everything.")),
        (("-o", "--timeout"),
         dict(type=int,
              dest="timeout",
              default=DEFAULT_TIMEOUT,
              help="Timeout in seconds that OPEN alert will persist in console.")),
        # No explicit dest: optparse derives 'quiet' and 'dry_run' from the
        # long option names.
        (("-q", "--quiet"),
         dict(action="store_true",
              default=False,
              help="Do not display alert id.")),
        (("-d", "--dry-run"),
         dict(action="store_true",
              default=False,
              help="Do not send alert.")),
    ]
    for flags, settings in specs:
        built.add_option(*flags, **settings)
    return built


parser = _build_parser()
# Severity names, listed in the same order as their numeric codes below.
VALID_SEVERITY = [
    'CRITICAL',
    'MAJOR',
    'MINOR',
    'WARNING',
    'NORMAL',
    'INFORM',
    'DEBUG',
]

# Environment names that -E/--environment is validated against.
VALID_ENVIRONMENT = [
    'PROD',
    'REL',
    'QA',
    'TEST',
    'CODE',
    'STAGE',
    'DEV',
    'LWP',
    'INFRA',
]

# Numeric code for each severity name.
# Mapping: ITU RFC5674 -> Syslog RFC5424 (the Syslog level is noted per entry).
SEVERITY_CODE = dict([
    ('CRITICAL', 1),  # Alert
    ('MAJOR', 2),     # Critical
    ('MINOR', 3),     # Error
    ('WARNING', 4),   # Warning
    ('NORMAL', 5),    # Notice
    ('INFORM', 6),    # Informational
    ('DEBUG', 7),     # Debug
])
# Parse and validate the command line.  Each failure prints the full help
# text and then calls parser.error(), which writes the message to stderr and
# terminates the process.
options, args = parser.parse_args()

if not options.resource:
    parser.print_help()
    parser.error("Must supply event resource using -r or --resource")

if not options.event:
    parser.print_help()
    parser.error("Must supply event name using -e or --event")

# The group is optional; fall back to a catch-all name.
if not options.group:
    options.group = 'Misc'

# -E uses action="append", so options.environment is None (not an empty
# list) when the option was never given; iterating it directly would raise
# TypeError instead of reporting a usage error.  Normalise to upper case
# BEFORE validating so that e.g. "-E prod" is accepted.
options.environment = [x.upper() for x in (options.environment or [])]
if not options.environment or not all(x in VALID_ENVIRONMENT for x in options.environment):
    parser.print_help()
    parser.error("Must supply one or more environments from %s" % ','.join(VALID_ENVIRONMENT))

if not options.service:
    parser.print_help()
    parser.error("Must supply one or more service using -S or --service")

if not options.nagios:
    parser.print_help()
    parser.error("Must supply full command line for Nagios check using -n or --nagios")
def main():
    """Run the requested Nagios plug-in and forward its result as an alert.

    Uses the module-level ``options`` produced by the command-line parser.
    The plug-in's exit status is mapped to a severity, the text before a
    '-', '|' or ':' separator in its output becomes the alert value, and the
    resulting alert is sent as JSON to ALERT_QUEUE on one of the brokers in
    BROKER_LIST.

    With --dry-run the headers and alert are printed instead of sent and the
    function returns normally.  Otherwise the process exits with status 0
    after the alert is sent, or status 1 if the plug-in cannot be started,
    the broker cannot be reached, or the send fails.
    """
    try:
        logging.basicConfig(level=logging.INFO, format="%(asctime)s alert-checker[%(process)d] %(levelname)s - %(message)s", filename=LOGFILE)
    except IOError:
        # Best effort: an unwritable log file must not stop the check.
        pass

    # Run Nagios plugin check
    command = shlex.split(os.path.join(NAGIOS_PLUGINS, options.nagios))
    logging.info('Running %s', command)
    try:
        check = subprocess.Popen(command, stdout=subprocess.PIPE)
    except OSError:
        # A missing or non-executable plug-in: report it the same way as the
        # broker failures below rather than dying with a traceback.
        # sys.exc_info() is used instead of "except ... as e" so this parses
        # on old Python 2 interpreters as well as on Python 3.
        e = sys.exc_info()[1]
        sys.stderr.write("ERROR: Could not run Nagios plugin - %s\n" % e)
        logging.error('Could not run Nagios plugin %s', e)
        sys.exit(1)
    stdout = check.communicate()[0]
    rc = check.returncode
    if not isinstance(stdout, str):
        # Python 3 pipes yield bytes; the regex and JSON encoding need text.
        stdout = stdout.decode('utf-8', 'replace')

    # Parse Nagios plugin check output
    severity_by_rc = {
        0: 'NORMAL',
        1: 'WARNING',
        2: 'CRITICAL',
        3: 'INFORM',  # XXX - aka UNKNOWN
    }
    # Any other exit status (e.g. 127, or a negative value when the plug-in
    # was killed by a signal) used to leave 'severity' unbound and crash
    # further down; treat it like status 3.
    severity = severity_by_rc.get(rc, 'INFORM')

    # NOTE(review): '.*' is greedy, so when the first line holds several
    # separators the value extends to the last one, including any whitespace
    # in front of it.  Left unchanged to keep emitted values stable.
    m = re.match(r'(?P<value>.*)\s*[-|:]', stdout)
    if m:
        value = m.group('value')
    else:
        value = 'unmatched'
    text = stdout.strip()

    alertid = str(uuid.uuid4())  # random UUID
    createTime = datetime.datetime.utcnow()

    headers = dict()
    headers['type'] = "exceptionAlert"
    headers['correlation-id'] = alertid
    headers['persistent'] = 'true'
    # Milliseconds since the epoch, EXPIRATION_TIME seconds from now.
    headers['expires'] = int(time.time() * 1000) + EXPIRATION_TIME * 1000

    alert = dict()
    alert['id'] = alertid
    alert['resource'] = options.resource
    alert['event'] = options.event
    alert['group'] = options.group
    alert['value'] = value
    alert['severity'] = severity
    alert['severityCode'] = SEVERITY_CODE[alert['severity']]
    alert['environment'] = options.environment
    alert['service'] = options.service
    alert['text'] = text
    alert['type'] = 'exceptionAlert'
    alert['tags'] = options.tags
    alert['summary'] = '%s - %s %s is %s on %s %s' % (','.join(options.environment), severity, options.event, value, ','.join(options.service), options.resource)
    # UTC timestamp with millisecond precision and a trailing 'Z'.
    alert['createTime'] = createTime.replace(microsecond=0).isoformat() + ".%03dZ" % (createTime.microsecond//1000)
    alert['origin'] = 'alert-checker/%s' % os.uname()[1]
    alert['thresholdInfo'] = options.nagios
    alert['timeout'] = options.timeout

    logging.info('%s : Nagios plugin %s => %s (rc=%d)', alertid, options.nagios, text, rc)
    logging.info('%s : %s', alertid, json.dumps(alert))

    if options.dry_run:
        # Show what would have been sent, then return without contacting a broker.
        print("%s %s" % (json.dumps(headers, indent=4), json.dumps(alert, indent=4)))
        return

    try:
        conn = stomp.Connection(BROKER_LIST)
        conn.start()
        conn.connect(wait=True)
    except Exception:
        e = sys.exc_info()[1]
        sys.stderr.write("ERROR: Could not connect to broker - %s\n" % e)
        logging.error('Could not connect to broker %s', e)
        sys.exit(1)

    try:
        conn.send(json.dumps(alert), headers, destination=ALERT_QUEUE)
    except Exception:
        e = sys.exc_info()[1]
        sys.stderr.write("ERROR: Failed to send alert to broker - %s \n" % e)
        logging.error('Failed to send alert to broker %s', e)
        sys.exit(1)

    broker = conn.get_host_and_port()
    logging.info('%s : Alert sent to %s:%s', alertid, broker[0], str(broker[1]))
    conn.disconnect()

    if not options.quiet:
        print(alertid)
    sys.exit(0)


if __name__ == '__main__':
    main()