0
0
Fork 0
mirror of https://github.com/alerta/alerta.git synced 2025-01-24 17:29:39 +00:00
alerta_alerta/bin/alert-aws.py
2012-12-20 23:05:35 +00:00

330 lines
13 KiB
Python
Executable file

#!/usr/bin/env python
########################################
#
# alert-aws.py - Amazon Web Service Alerter
#
########################################
import os
import sys
import time
import urllib
import urllib2
try:
import json
except ImportError:
import simplejson as json
import yaml
import stomp
import datetime
import logging
import uuid
import boto.ec2
__program__ = 'alert-aws'
__version__ = '1.0.8'
BROKER_LIST = [('localhost', 61613)] # list of brokers for failover
ALERT_QUEUE = '/queue/alerts'
BASE_URL = 'http://localhost/alerta/api/v1'
DEFAULT_TIMEOUT = 86400
WAIT_SECONDS = 60
GLOBAL_CONF = '/opt/alerta/conf/alerta-global.yaml'
LOGFILE = '/var/log/alerta/alert-aws.log'
PIDFILE = '/var/run/alerta/alert-aws.pid'
DISABLE = '/opt/alerta/conf/alert-aws.disable'
AWSCONF = '/opt/alerta/conf/alert-aws.yaml'
SEVERITY_CODE = {
# ITU RFC5674 -> Syslog RFC5424
'CRITICAL': 1, # Alert
'MAJOR': 2, # Crtical
'MINOR': 3, # Error
'WARNING': 4, # Warning
'NORMAL': 5, # Notice
'INFORM': 6, # Informational
'DEBUG': 7, # Debug
}
# Globals
info = dict()
last = dict()
lookup = dict()
def ec2_status():
global conn, globalconf, awsconf, BASE_URL, info, last
if 'endpoint' in globalconf:
BASE_URL = '%s/alerta/api/v1' % globalconf['endpoint']
url = '%s/alerts?%s' % (BASE_URL, awsconf.get('filter','tags=cloud:AWS/EC2'))
if 'proxy' in globalconf:
os.environ['http_proxy'] = globalconf['proxy']['http']
os.environ['https_proxy'] = globalconf['proxy']['https']
last = info.copy()
info = dict()
for account,keys in awsconf['accounts'].iteritems():
access_key = keys.get('aws_access_key_id','')
secret_key = keys.get('aws_secret_access_key','')
logging.debug('AWS Account=%s, AwsAccessKey=%s, AwsSecretKey=************************************%s', account, access_key, secret_key[-4:])
for region in awsconf['regions']:
try:
ec2 = boto.ec2.connect_to_region(region, aws_access_key_id=access_key, aws_secret_access_key=secret_key)
except boto.exception.EC2ResponseError, e:
logging.warning('EC2 API call connect_to_region(region=%s) failed: %s', region, e)
continue
logging.info('Get all instances for account %s in %s', account, region)
try:
reservations = ec2.get_all_instances()
except boto.exception.EC2ResponseError, e:
logging.warning('EC2 API call get_all_instances() failed: %s', e)
continue
instances = [i for r in reservations for i in r.instances if i.tags]
for i in instances:
info[i.id] = dict()
info[i.id]['state'] = i.state
info[i.id]['stage'] = i.tags.get('Stage','unknown')
info[i.id]['role'] = i.tags.get('Role','unknown')
info[i.id]['tags'] = [ 'os:Linux', 'role:%s' % info[i.id]['role'], 'datacentre:%s' % region, 'virtual:xen', 'cloud:AWS/EC2', 'account:%s' % account ]
info[i.id]['tags'].append('cluster:%s_%s' % (info[i.id]['role'], region)) # FIXME - replace match on cluster with match on role
# FIXME - this is a hack until all EC2 instances are keyed off instance id
logging.debug('%s -> %s', i.private_dns_name, i.id)
lookup[i.private_dns_name.split('.')[0]] = i.id
logging.info('Get system and instance status for account %s in %s', account, region)
try:
status = ec2.get_all_instance_status()
except boto.exception.EC2ResponseError, e:
logging.warning('EC2 API call get_all_instance_status() failed: %s', e)
continue
results = dict((i.id, s.system_status.status+':'+s.instance_status.status) for i in instances for s in status if s.id == i.id)
for i in instances:
if i.id in results:
info[i.id]['status'] = results[i.id]
else:
info[i.id]['status'] = u'not-available:not-available'
# Get list of all alerts from EC2
logging.info('Get list of EC2 alerts from %s', url)
try:
response = json.loads(urllib2.urlopen(url, None, 15).read())['response']
except urllib2.URLError, e:
logging.error('Could not get list of alerts from resources located in EC2: %s', e)
response = None
if response and 'alerts' in response and 'alertDetails' in response['alerts']:
logging.info('Retreived %s EC2 alerts', response['total'])
alertDetails = response['alerts']['alertDetails']
for alert in alertDetails:
alertid = alert['id']
resource = alert['resource']
# resource might be 'i-01234567:/tmp'
if ':' in resource:
resource = resource.split(':')[0]
if resource.startswith('ip-'): # FIXME - transform ip-10-x-x-x to i-01234567
logging.debug('%s : Transforming resource %s -> %s', alertid, resource, lookup.get(resource, resource))
resource = lookup.get(resource, resource)
# Delete alerts for instances that are no longer listed by EC2 API
if resource not in info:
logging.info('%s : EC2 instance %s is no longer listed, DELETE associated alert', alertid, resource)
data = '{ "_method": "delete" }'
# data = '{ "status": "DELETED" }' # XXX - debug only
elif info[resource]['state'] == 'terminated' and alert['status'] != 'ACK' and alert['event'] not in ['Ec2InstanceState', 'Ec2StatusChecks']:
logging.info('%s : EC2 instance %s is terminated, ACK associated alert', alertid, resource)
data = '{ "status": "ACK" }'
else:
continue
# Delete alert or update alert status
url = '%s/alerts/alert/%s' % (BASE_URL, alertid)
logging.debug('%s : %s %s', alertid, url, data)
req = urllib2.Request(url, data)
try:
response = json.loads(urllib2.urlopen(req).read())['response']
except urllib2.URLError, e:
logging.error('%s : API endpoint error: %s', alertid, e)
continue
if response['status'] == 'ok':
logging.info('%s : Successfully updated alert', alertid)
else:
logging.warning('%s : Failed to update alert: %s', alertid, response['message'])
for instance in info:
for check, event in [('state', 'Ec2InstanceState'),
('status','Ec2StatusChecks')]:
if instance not in last or check not in last[instance]:
last[instance] = dict()
last[instance][check] = 'unknown'
if last[instance][check] != info[instance][check]:
# Defaults
resource = instance
group = 'AWS/EC2'
value = info[instance][check]
text = 'Instance was %s now it is %s' % (last[instance][check], info[instance][check])
environment = [ info[instance]['stage'] ]
service = [ 'EC2' ] # NOTE: Will be transformed to correct service using Ec2ServiceLookup
tags = info[instance]['tags']
correlate = ''
# instance-state = pending | running | shutting-down | terminated | stopping | stopped
if check == 'state':
if info[instance][check] == 'running':
severity = 'NORMAL'
else:
severity = 'WARNING'
# system-status = ok | impaired | initializing | insufficient-data | not-applicable
# instance status = ok | impaired | initializing | insufficient-data | not-applicable
elif check == 'status':
if info[instance][check] == 'ok:ok':
severity = 'NORMAL'
text = "System and instance status checks are ok"
elif info[instance][check].startswith('ok'):
severity = 'WARNING'
text = 'Instance status check is %s' % info[instance][check].split(':')[1]
elif info[instance][check].endswith('ok'):
severity = 'WARNING'
text = 'System status check is %s' % info[instance][check].split(':')[0]
else:
severity = 'WARNING'
text = 'System status check is %s and instance status check is %s' % tuple(info[instance][check].split(':'))
alertid = str(uuid.uuid4()) # random UUID
createTime = datetime.datetime.utcnow()
headers = dict()
headers['type'] = "statusAlert"
headers['correlation-id'] = alertid
alert = dict()
alert['id'] = alertid
alert['resource'] = resource
alert['event'] = event
alert['group'] = group
alert['value'] = value
alert['severity'] = severity.upper()
alert['severityCode'] = SEVERITY_CODE[alert['severity']]
alert['environment'] = environment
alert['service'] = service
alert['text'] = text
alert['type'] = 'statusAlert'
alert['tags'] = tags
alert['summary'] = '%s - %s %s is %s on %s %s' % (','.join(environment), severity.upper(), event, value, ','.join(service), resource)
alert['createTime'] = createTime.replace(microsecond=0).isoformat() + ".%03dZ" % (createTime.microsecond//1000)
alert['origin'] = "%s/%s" % (__program__, os.uname()[1])
alert['thresholdInfo'] = 'n/a'
alert['timeout'] = DEFAULT_TIMEOUT
alert['correlatedEvents'] = correlate
logging.info('%s : %s', alertid, json.dumps(alert))
while not conn.is_connected():
logging.warning('Waiting for message broker to become available')
time.sleep(1.0)
try:
conn.send(json.dumps(alert), headers, destination=ALERT_QUEUE)
broker = conn.get_host_and_port()
logging.info('%s : Alert sent to %s:%s', alertid, broker[0], str(broker[1]))
except Exception, e:
logging.error('Failed to send alert to broker %s', e)
def send_heartbeat():
global conn
heartbeatid = str(uuid.uuid4()) # random UUID
createTime = datetime.datetime.utcnow()
headers = dict()
headers['type'] = "heartbeat"
headers['correlation-id'] = heartbeatid
heartbeat = dict()
heartbeat['id'] = heartbeatid
heartbeat['type'] = "heartbeat"
heartbeat['createTime'] = createTime.replace(microsecond=0).isoformat() + ".%03dZ" % (createTime.microsecond//1000)
heartbeat['origin'] = "%s/%s" % (__program__,os.uname()[1])
heartbeat['version'] = __version__
try:
conn.send(json.dumps(heartbeat), headers, destination=ALERT_QUEUE)
broker = conn.get_host_and_port()
logging.info('%s : Heartbeat sent to %s:%s', heartbeatid, broker[0], str(broker[1]))
except Exception, e:
logging.error('Failed to send heartbeat to broker %s', e)
def main():
global conn, globalconf, awsconf
logging.basicConfig(level=logging.INFO, format="%(asctime)s alert-aws[%(process)d] %(levelname)s - %(message)s", filename=LOGFILE)
logging.info('Starting up Alert Amazon Web Services EC2 version %s', __version__)
# Write pid file if not already running
if os.path.isfile(PIDFILE):
pid = open(PIDFILE).read()
try:
os.kill(int(pid), 0)
logging.error('Process with pid %s already exists, exiting', pid)
sys.exit(1)
except OSError:
pass
file(PIDFILE, 'w').write(str(os.getpid()))
while os.path.isfile(DISABLE):
logging.warning('Disable flag exists (%s). Sleeping...', DISABLE)
time.sleep(120)
# Read in global configuration file
try:
globalconf = yaml.load(open(GLOBAL_CONF))
logging.info('Loaded %d global configurations OK', len(globalconf))
except Exception, e:
logging.warning('Failed to load global configuration: %s. Exit.', e)
sys.exit(1)
# Read in configuration file
try:
awsconf = yaml.load(open(AWSCONF))
logging.info('Loaded %d AWS account configurations OK', len(awsconf))
except Exception, e:
logging.warning('Failed to load AWS account configuration: %s. Exit.', e)
sys.exit(1)
# Connect to message broker
try:
conn = stomp.Connection(BROKER_LIST)
conn.start()
conn.connect(wait=True)
except Exception, e:
logging.error('Stomp connection error: %s', e)
while True:
try:
ec2_status()
send_heartbeat()
time.sleep(WAIT_SECONDS)
except (KeyboardInterrupt, SystemExit):
conn.disconnect()
os.unlink(PIDFILE)
sys.exit(0)
if __name__ == '__main__':
main()