466 lines
17 KiB
Python
466 lines
17 KiB
Python
import datetime
|
|
import json
|
|
import logging
|
|
import platform
|
|
import queue
|
|
import re
|
|
import socket
|
|
import ssl
|
|
import sys
|
|
import threading
|
|
import time
|
|
from http.server import BaseHTTPRequestHandler as BHRH
|
|
from urllib.error import URLError # pylint: disable=no-name-in-module
|
|
from urllib.parse import urlparse # pylint: disable=no-name-in-module
|
|
from urllib.request import ( # pylint: disable=no-name-in-module
|
|
HTTPBasicAuthHandler, ProxyHandler, Request, build_opener, install_opener,
|
|
urlopen)
|
|
|
|
import settings
|
|
from alertaclient.api import Client
|
|
|
|
# Short reason phrase for every status code the standard library knows about,
# keyed by status code.
HTTP_RESPONSES = {code: info[0] for code, info in BHRH.responses.items()}

# Add missing responses
HTTP_RESPONSES.update({
    102: 'Processing',
    207: 'Multi-Status',
    422: 'Unprocessable Entity',
    423: 'Locked',
    424: 'Failed Dependency',
    506: 'Variant Also Negotiates',
    507: 'Insufficient Storage',
    510: 'Not Extended',
})

# Event names passed as the ``correlate`` list on every alert that is sent.
_HTTP_ALERTS = [
    'HttpConnectionError',
    'HttpServerError',
    'HttpClientError',
    'HttpRedirection',
    'HttpContentError',
    'HttpResponseSlow',
    'HttpResponseOK',
    'HttpResponseRegexError',
    'HttpResponseRegexOK',
]

__version__ = '3.3.0'

LOOP_EVERY = 60  # seconds
# TARGET_FILE = 'urlmon.targets'  # FIXME -- or settings.py ???
SERVER_THREADS = 20
SLOW_WARNING_THRESHOLD = 5000  # ms
SLOW_CRITICAL_THRESHOLD = 10000  # ms
MAX_TIMEOUT = 15000  # ms
SSL_DAYS = 30
SSL_DAYS_PANIC = 7

LOG = logging.getLogger('alerta.urlmon')
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s: %(levelname)s - %(message)s',
)
|
|
|
|
|
|
class WorkerThread(threading.Thread):
    """Worker that pulls URL checks off a shared queue and reports the results.

    Each queue item is a ``(check, queue_time)`` tuple. ``check`` is a dict
    describing one URL to poll; ``queue_time`` is the ``time.time()`` value at
    which it was queued. Any item that cannot be unpacked that way (the daemon
    uses ``None``) tells the worker to shut down.
    """

    # (lowest status code of the class, event name, severity)
    _STATUS_CLASSES = (
        (100, 'HttpInformational', 'normal'),
        (200, 'HttpResponseOK', 'normal'),
        (300, 'HttpRedirection', 'minor'),
        (400, 'HttpClientError', 'minor'),
        (500, 'HttpServerError', 'major'),
    )

    def __init__(self, queue, api):
        """Store the work queue and the default alert client.

        Args:
            queue: queue of ``(check, queue_time)`` items shared with the daemon.
            api: client used to send alerts when a check does not define its
                own ``api_endpoint``/``api_key`` pair.
        """
        super().__init__()
        LOG.debug('Initialising %s...', self.name)

        self.queue = queue  # internal queue
        self.api = api  # send alerts api

    def run(self):
        """Process queue items until a shutdown sentinel is received."""
        while True:
            LOG.debug('Waiting on input queue...')
            try:
                check, queue_time = self.queue.get()
            except TypeError:
                # The sentinel (None) cannot be unpacked: time to stop.
                LOG.info('%s is shutting down.', self.name)
                break

            if time.time() - queue_time > LOOP_EVERY:
                LOG.warning('URL request for %s to %s expired after %d seconds.', check['resource'], check['url'],
                            int(time.time() - queue_time))
                self.queue.task_done()
                continue

            try:
                self._process_check(check)
            except Exception as e:
                # One broken check must not kill the worker or leave the queue
                # item unacknowledged.
                LOG.exception('Unhandled error while processing check %s: %s', check, e)
            finally:
                self.queue.task_done()
            LOG.info('%s check complete.', self.name)

        # Acknowledge the shutdown sentinel itself.
        self.queue.task_done()

    def _process_check(self, check):
        """Poll one URL, classify the outcome and send the resulting alert(s)."""
        resource = check['resource']
        LOG.info('%s polling %s...', self.name, resource)
        status, reason, body, rtt = self.urlmon(check)

        status_regex = check.get('status_regex', None)
        search_string = check.get('search', None)
        rule = check.get('rule', None)
        warn_thold = check.get('warning', SLOW_WARNING_THRESHOLD)
        crit_thold = check.get('critical', SLOW_CRITICAL_THRESHOLD)
        checker_api = check.get('api_endpoint', None)
        checker_apikey = check.get('api_key', None)
        check_ssl = check.get('check_ssl')

        # A check may route its alerts to its own endpoint.
        if checker_api and checker_apikey:
            local_api = Client(endpoint=checker_api, key=checker_apikey)
        else:
            local_api = self.api

        description = HTTP_RESPONSES.get(status, 'undefined')
        event, severity, value, text = self._classify_status(
            status, reason, description, rtt, status_regex)

        if event in ['HttpResponseOK', 'HttpResponseRegexOK']:
            if rtt > crit_thold:
                event = 'HttpResponseSlow'
                severity = 'critical'
                value = '%dms' % rtt
                text = 'Website available but exceeding critical RT thresholds of %dms' % crit_thold
            elif rtt > warn_thold:
                event = 'HttpResponseSlow'
                severity = 'warning'
                value = '%dms' % rtt
                text = 'Website available but exceeding warning RT thresholds of %dms' % warn_thold

            if search_string and body:
                LOG.debug('Searching for %s', search_string)
                found = False
                for line in body.split('\n'):
                    if re.search(search_string, line):
                        found = True
                        LOG.debug('Regex: Found %s in %s',
                                  search_string, line)
                        break
                if not found:
                    event = 'HttpContentError'
                    severity = 'minor'
                    value = 'Search failed'
                    text = 'Website available but pattern "%s" not found' % search_string
            elif rule and body:
                LOG.debug('Evaluating rule %s', rule)
                headers = check.get('headers', {})
                if 'Content-type' in headers and headers['Content-type'] == 'application/json':
                    try:
                        body = json.loads(body)
                    except ValueError as e:
                        LOG.error(
                            'Could not evaluate rule %s: %s', rule, e)
                try:
                    # NOTE: assumes request body in variable called 'body'
                    # SECURITY: eval() executes arbitrary Python taken from the
                    # check definition. Rules must only ever come from trusted
                    # configuration, never from external input.
                    rule_passed = eval(rule)
                except Exception as e:
                    LOG.error('Could not evaluate rule %s: %s', rule, e)
                else:
                    if not rule_passed:
                        event = 'HttpContentError'
                        severity = 'minor'
                        value = 'Rule failed'
                        text = 'Website available but rule evaluation failed (%s)' % rule

        LOG.debug('URL: %s, Status: %s (%s), Round-Trip Time: %dms -> %s',
                  check['url'], description, status, rtt, event)

        threshold_info = '%s : RT > %d RT > %d x %s' % (
            check['url'], warn_thold, crit_thold, check.get('count', 1))

        # Fields shared by the HTTP alert and the optional SSL alert.
        common = {
            'resource': resource,
            'correlate': _HTTP_ALERTS,
            'group': 'Web',
            'environment': check['environment'],
            'service': check['service'],
            'event_type': 'serviceAlert',
            'tags': check.get('tags', list()),
            'attributes': {
                'thresholdInfo': threshold_info
            },
        }

        try:
            local_api.send_alert(
                event=event,
                value=value,
                severity=severity,
                text=text,
                **common
            )
        except Exception as e:
            LOG.warning('Failed to send alert: %s', e)

        if check_ssl:
            self._check_certificate(check, local_api, common)

    @staticmethod
    def _classify_status(status, reason, description, rtt, status_regex=None):
        """Map an HTTP outcome to ``(event, severity, value, text)``.

        Args:
            status: HTTP status code, or a falsy value when no response arrived.
            reason: failure description used as the alert value when there is
                no status.
            description: reason phrase for ``status``.
            rtt: round-trip time in milliseconds.
            status_regex: optional pattern the status code must match; when
                given it overrides the per-class classification.
        """
        if not status:
            return (
                'HttpConnectionError',
                'major',
                reason,
                'Error during connection or data transfer (timeout=%d).' % MAX_TIMEOUT,
            )

        value = '%s (%d)' % (description, status)

        if status_regex:
            if re.search(status_regex, str(status)):
                return (
                    'HttpResponseRegexOK',
                    'normal',
                    value,
                    'HTTP server responded with status code %d that matched "%s" in %dms' % (
                        status, status_regex, rtt),
                )
            return (
                'HttpResponseRegexError',
                'major',
                value,
                'HTTP server responded with status code %d that failed to match "%s"' % (
                    status, status_regex),
            )

        for lowest, event, severity in WorkerThread._STATUS_CLASSES:
            if lowest <= status <= lowest + 99:
                text = 'HTTP server responded with status code %d in %dms' % (
                    status, rtt)
                return event, severity, value, text

        return (
            'HttpUnknownError',
            'warning',
            'UNKNOWN',
            'HTTP request resulted in an unhandled error.',
        )

    @staticmethod
    def _certificate_time_left(url):
        """Return the ``timedelta`` until the certificate served for ``url`` expires.

        Raises ``ssl.SSLError``/``OSError`` when the TLS connection fails and
        ``KeyError``/``ValueError`` when the expiry date cannot be read.
        """
        parsed = urlparse(url)
        # hostname, not netloc: netloc still carries ":port" and credentials.
        host = parsed.hostname
        port = parsed.port or 443
        context = ssl.create_default_context()
        # Both sockets are closed on exit, even when the handshake fails.
        with socket.create_connection((host, port), timeout=3.0) as sock:
            with context.wrap_socket(sock, server_hostname=host) as conn:
                ssl_info = conn.getpeercert()
        not_after = datetime.datetime.strptime(
            ssl_info['notAfter'], r'%b %d %H:%M:%S %Y %Z')
        return not_after - datetime.datetime.utcnow()

    @staticmethod
    def _grade_certificate(days_left, resource, warn_days, panic_days):
        """Return ``(severity, text)`` for a certificate with ``days_left`` remaining.

        ``days_left`` is a ``timedelta``; ``warn_days`` and ``panic_days`` are
        the major and critical thresholds in days.
        """
        if days_left < datetime.timedelta(days=0):
            return 'critical', 'HTTPS cert for %s expired' % resource
        if days_left <= datetime.timedelta(days=panic_days):
            return 'critical', 'HTTPS cert for {} will expire at {}'.format(
                resource, days_left)
        if days_left < datetime.timedelta(days=warn_days):
            return 'major', 'HTTPS cert for {} will expire at {}'.format(
                resource, days_left)
        # Give the healthy case its own text instead of reusing the HTTP one.
        return 'normal', 'HTTPS cert for %s is valid for %d more day(s)' % (
            resource, days_left.days)

    def _check_certificate(self, check, local_api, common):
        """Check certificate expiry for ``check['url']`` and send an SSL alert.

        ``common`` holds the alert fields shared with the HTTP alert.
        """
        resource = check['resource']
        try:
            days_left = self._certificate_time_left(check['url'])
        except ssl.CertificateError as e:
            # An expired or otherwise invalid certificate fails the handshake,
            # so it has to be reported from here.
            severity = 'critical'
            value = 'verification failed'
            text = 'HTTPS cert for %s failed verification: %s' % (resource, e)
        except (OSError, ValueError, KeyError) as e:
            LOG.warning('SSL check for %s failed: %s', check['url'], e)
            return
        else:
            severity, text = self._grade_certificate(
                days_left, resource, SSL_DAYS, SSL_DAYS_PANIC)
            value = 'left %s day(s)' % days_left.days

        try:
            local_api.send_alert(
                event='HttpSSLChecker',
                value=value,
                severity=severity,
                text=text,
                **common
            )
        except Exception as e:
            LOG.warning('Failed to send ssl alert: %s', e)

    @staticmethod
    def _decode_body(raw, charset=None):
        """Decode a response body to text, falling back to UTF-8.

        Undecodable bytes are replaced rather than raising, so a badly encoded
        page can still be searched.
        """
        try:
            return raw.decode(charset or 'utf-8', errors='replace')
        except LookupError:
            # The server announced a charset Python does not know.
            return raw.decode('utf-8', errors='replace')

    @staticmethod
    def urlmon(check):
        """Request ``check['url']`` and return ``(status, reason, body, rtt)``.

        The request is attempted up to ``check['count']`` times (default 1),
        sleeping 10 seconds between attempts, and stops at the first attempt
        that yields an HTTP status.

        Returns:
            status: HTTP status code, or a falsy value (0 or None) when no
                response was received.
            reason: error description when there is no status, else None.
            body: response body decoded to text, or None.
            rtt: duration of the last attempt in milliseconds.
        """
        url = check['url']
        post = check.get('post', None)
        count = check.get('count', 1)
        # Copy so the shared check definition is never mutated.
        headers = dict(check.get('headers') or {})
        username = check.get('username', None)
        password = check.get('password', None)
        realm = check.get('realm', None)
        uri = check.get('uri', None)
        proxy = check.get('proxy', False)

        status = 0
        reason = None
        body = None
        rtt = 0

        # Use a private opener per call: install_opener() would replace the
        # process-wide opener and race with the other worker threads.
        handlers = []
        if username and password:
            auth_handler = HTTPBasicAuthHandler()
            auth_handler.add_password(realm=realm,
                                      uri=uri,
                                      user=username,
                                      passwd=password)
            handlers.append(auth_handler)
        if proxy:
            handlers.append(ProxyHandler(proxy))
        opener = build_opener(*handlers)

        if 'User-agent' not in headers:
            headers['User-agent'] = 'alert-urlmon/%s' % (__version__)

        # MAX_TIMEOUT is in milliseconds; urllib expects seconds.
        timeout = MAX_TIMEOUT / 1000.0

        while True:

            count -= 1
            start = time.time()

            try:
                if post:
                    # Request data must be bytes on Python 3.
                    req = Request(url, json.dumps(post).encode('utf-8'), headers=headers)
                else:
                    req = Request(url, headers=headers)
                # Read inside the try so a timeout while reading is handled too.
                with opener.open(req, None, timeout) as response:
                    code = response.getcode()
                    raw = response.read()
                    charset = response.headers.get_content_charset()
                status = code
                body = WorkerThread._decode_body(raw, charset)
            except ValueError as e:
                LOG.error('Request failed: %s', e)
                reason = str(e)
            except URLError as e:
                # HTTPError carries both 'code' and 'reason', so look for the
                # status code first or 4xx/5xx would look like connection errors.
                http_code = getattr(e, 'code', None)
                if http_code is not None:
                    reason = None
                    status = http_code
                else:
                    reason = str(e.reason)
                    status = None
            except Exception as e:
                LOG.warning('Unexpected error: %s', e)
                reason = str(e)

            rtt = int((time.time() - start) * 1000)  # round-trip time

            if status:  # return result if any HTTP/S response is received
                break

            if count <= 0:  # also stops when 'count' was configured below 1
                break
            time.sleep(10)

        return status, reason, body, rtt
|
|
|
|
|
|
class UrlmonDaemon:
    """Scheduler that queues every configured URL check once per loop.

    It starts ``SERVER_THREADS`` workers, then every ``LOOP_EVERY`` seconds
    queues all entries of ``settings.checks``, sends a heartbeat and reports
    the queue length as an alert.
    """

    def __init__(self):
        """Initialise state; the queue and API client are created by run()."""
        self.shuttingdown = False
        self.running = False
        self.queue = None
        self.api = None

    def run(self):
        """Run the scheduling loop until KeyboardInterrupt or SystemExit."""
        self.running = True

        self.queue = queue.Queue()
        self.api = Client(endpoint=settings.ENDPOINT, key=settings.API_KEY)

        # Start worker threads
        LOG.debug('Starting %s worker threads...', SERVER_THREADS)
        workers = []  # only threads that actually started
        for i in range(SERVER_THREADS):
            w = WorkerThread(self.queue, self.api)
            try:
                w.start()
            except Exception as e:
                LOG.error('Worker thread #%s did not start: %s', i, e)
                continue
            workers.append(w)
            LOG.info('Started worker thread: %s', w.name)

        # Computed once, outside the heartbeat try, so the queue-length alert
        # below can never see it undefined.
        origin = '{}/{}'.format('urlmon', platform.uname()[1])

        while not self.shuttingdown:
            try:
                for check in settings.checks:
                    self.queue.put((check, time.time()))

                LOG.debug('Send heartbeat...')
                try:
                    self.api.heartbeat(
                        origin, tags=[__version__], timeout=3600)
                except Exception as e:
                    LOG.warning('Failed to send heartbeat: %s', e)

                time.sleep(LOOP_EVERY)

                # Sample once so the log line, value and text all agree.
                queue_length = self.queue.qsize()
                LOG.info('URL check queue length is %d', queue_length)

                if queue_length > 100:
                    severity = 'warning'
                else:
                    severity = 'ok'
                try:
                    self.api.send_alert(
                        resource=origin,
                        event='big queue for http checks',
                        value=queue_length,
                        severity=severity,
                        text='URL check queue length is %d' % queue_length,
                        event_type='serviceAlert',
                    )
                except Exception as e:
                    LOG.warning('Failed to send alert: %s', e)

            except (KeyboardInterrupt, SystemExit):
                self.shuttingdown = True

        LOG.info('Shutdown request received...')
        self.running = False

        # One shutdown sentinel per running worker, then wait for all of them.
        for _ in workers:
            self.queue.put(None)
        for w in workers:
            w.join()
|
|
|
|
|
|
def main():
    """Run the URL monitor daemon and exit with status 1 once it stops abnormally.

    Both an unexpected exception (logged with its traceback) and a keyboard
    interrupt (logged as a warning) end the process with exit code 1.
    """
    log = logging.getLogger('alerta.urlmon')

    daemon = UrlmonDaemon()
    try:
        daemon.run()
    except KeyboardInterrupt:
        log.warning('Exiting alerta urlmon.')
        sys.exit(1)
    except Exception as e:
        log.error(e, exc_info=1)
        sys.exit(1)
|
|
|
|
|
|
# Start the monitor only when this file is executed as a script.
if __name__ == '__main__':
    main()
|