From 1f7790950279012457bd9f5646b9daa20d546436 Mon Sep 17 00:00:00 2001 From: Andrea Dell'Amico Date: Mon, 23 Nov 2015 20:16:04 +0100 Subject: [PATCH] d4science-ghn-cluster/group_vars/mongo_cluster_prod/ganglia.yml: ganglia cluster for the mongodb prod. library/roles/mongodb-org: Install the specific ganglia plugin if ganglia monitoring is enabled. library/roles/couchdb: Install the specific ganglia plugin if ganglia monitoring is enabled. --- couchdb/defaults/main.yml | 3 + couchdb/files/couchdb.py | 322 ++++++++++++ ...{couchdb-old-package.yml => couchdb-1.yml} | 0 couchdb/tasks/ganglia-plugin.yml | 13 + couchdb/tasks/main.yml | 4 +- couchdb/templates/couchdb.pyconf.j2 | 207 ++++++++ ganglia/defaults/main.yml | 10 +- mongodb-org/files/mongodb.py | 496 ++++++++++++++++++ mongodb-org/tasks/ganglia-plugin.yml | 14 + mongodb-org/tasks/main.yml | 58 +- mongodb-org/tasks/mongodb.yml | 55 ++ mongodb-org/templates/mongodb.pyconf.j2 | 109 ++++ 12 files changed, 1231 insertions(+), 60 deletions(-) create mode 100644 couchdb/files/couchdb.py rename couchdb/tasks/{couchdb-old-package.yml => couchdb-1.yml} (100%) create mode 100644 couchdb/tasks/ganglia-plugin.yml create mode 100644 couchdb/templates/couchdb.pyconf.j2 create mode 100755 mongodb-org/files/mongodb.py create mode 100644 mongodb-org/tasks/ganglia-plugin.yml create mode 100644 mongodb-org/tasks/mongodb.yml create mode 100644 mongodb-org/templates/mongodb.pyconf.j2 diff --git a/couchdb/defaults/main.yml b/couchdb/defaults/main.yml index 2fedb852..d984f65c 100644 --- a/couchdb/defaults/main.yml +++ b/couchdb/defaults/main.yml @@ -67,3 +67,6 @@ couchdb_replicator_options: # - { section: 'httpd', option: 'bind_address', value: '{{ couchdb_bind_address }}', state: 'present' } # - { section: 'httpd', option: 'config_whitelist', value: '[{httpd,config_whitelist}, {log,level}]', state: 'present' } +ganglia_enabled: False +couchdb_ganglia_url: http://localhost:5984/_stats +couchdb_ganglia_refresh_rate: 60 diff --git 
a/couchdb/files/couchdb.py b/couchdb/files/couchdb.py new file mode 100644 index 00000000..21589f8b --- /dev/null +++ b/couchdb/files/couchdb.py @@ -0,0 +1,322 @@ +### This script reports couchdb metrics to ganglia. + +### License to use, modify, and distribute under the GPL +### http://www.gnu.org/licenses/gpl.txt +import logging +import os +import subprocess +import sys +import threading +import time +import traceback +import urllib2 +import json + +logging.basicConfig(level=logging.ERROR) + +_Worker_Thread = None + +class UpdateCouchdbThread(threading.Thread): + + def __init__(self, params): + threading.Thread.__init__(self) + self.running = False + self.shuttingdown = False + self.refresh_rate = int(params['refresh_rate']) + self.metrics = {} + self.settings = {} + self.stats_url = params['stats_url'] + self._metrics_lock = threading.Lock() + self._settings_lock = threading.Lock() + + def shutdown(self): + self.shuttingdown = True + if not self.running: + return + self.join() + + def run(self): + global _Lock + + self.running = True + + while not self.shuttingdown: + time.sleep(self.refresh_rate) + self.refresh_metrics() + + self.running = False + + @staticmethod + def _get_couchdb_stats(url, refresh_rate): + if refresh_rate == 60 or refresh_rate == 300 or refresh_rate == 900: + url += '?range=' + str(refresh_rate) + else: + logging.warning('The specified refresh_rate of %d is invalid and has been substituted with 60!' 
% refresh_rate) + url += '?range=60' + + # Set time out for urlopen to 2 seconds otherwise we run into the possibility of hosing gmond + c = urllib2.urlopen(url, None, 2) + json_data = c.read() + c.close() + + data = json.loads(json_data) + couchdb = data['couchdb'] + httpd = data['httpd'] + request_methods = data['httpd_request_methods'] + status_codes = data['httpd_status_codes'] + + result = {} + for first_level_key in data: + for second_level_key in data[first_level_key]: + value = data[first_level_key][second_level_key]['current'] + if value is None: + value = 0 + else: + if second_level_key in ['open_databases', 'open_os_files', 'clients_requesting_changes']: + print second_level_key + ': ' + str(value) + value = int(value) + else: + # We need to devide by the range as couchdb provides no per second values + value = float(value) / refresh_rate + result['couchdb_' + first_level_key + '_' + second_level_key ] = value + + return result + + def refresh_metrics(self): + logging.debug('refresh metrics') + + try: + logging.debug(' opening URL: ' + str(self.stats_url)) + data = UpdateCouchdbThread._get_couchdb_stats(self.stats_url, self.refresh_rate) + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + + try: + self._metrics_lock.acquire() + self.metrics = {} + for k, v in data.items(): + self.metrics[k] = v + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + return False + + finally: + self._metrics_lock.release() + + if not self.metrics: + logging.warning('error refreshing metrics') + return False + + logging.debug('success refreshing metrics') + logging.debug('metrics: ' + str(self.metrics)) + + return True + + def metric_of(self, name): + logging.debug('getting metric: ' + name) + + try: + if name in self.metrics: + try: + self._metrics_lock.acquire() + logging.debug('metric: %s = %s' % (name, self.metrics[name])) + return self.metrics[name] 
+ finally: + self._metrics_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 0 + + def setting_of(self, name): + logging.debug('getting setting: ' + name) + + try: + if name in self.settings: + try: + self._settings_lock.acquire() + logging.debug('setting: %s = %s' % (name, self.settings[name])) + return self.settings[name] + finally: + self._settings_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 0 + +def metric_init(params): + logging.debug('init: ' + str(params)) + global _Worker_Thread + + METRIC_DEFAULTS = { + 'units': 'requests/s', + 'groups': 'couchdb', + 'slope': 'both', + 'value_type': 'float', + 'format': '%.3f', + 'description': '', + 'call_back': metric_of + } + + descriptions = dict( + couchdb_couchdb_auth_cache_hits={ + 'units': 'hits/s', + 'description': 'Number of authentication cache hits'}, + couchdb_couchdb_auth_cache_misses={ + 'units': 'misses/s', + 'description': 'Number of authentication cache misses'}, + couchdb_couchdb_database_reads={ + 'units': 'reads/s', + 'description': 'Number of times a document was read from a database'}, + couchdb_couchdb_database_writes={ + 'units': 'writes/s', + 'description': 'Number of times a document was changed'}, + couchdb_couchdb_open_databases={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'databases', + 'description': 'Number of open databases'}, + couchdb_couchdb_open_os_files={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'files', + 'description': 'Number of file descriptors CouchDB has open'}, + couchdb_couchdb_request_time={ + 'units': 'ms', + 'description': 'Request time'}, + couchdb_httpd_bulk_requests={ + 'description': 'Number of bulk requests'}, + couchdb_httpd_clients_requesting_changes={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'clients', + 'description': 'Number of clients for continuous _changes'}, + couchdb_httpd_requests={ + 'description': 'Number of HTTP requests'}, + 
couchdb_httpd_temporary_view_reads={ + 'units': 'reads', + 'description': 'Number of temporary view reads'}, + couchdb_httpd_view_reads={ + 'description': 'Number of view reads'}, + couchdb_httpd_request_methods_COPY={ + 'description': 'Number of HTTP COPY requests'}, + couchdb_httpd_request_methods_DELETE={ + 'description': 'Number of HTTP DELETE requests'}, + couchdb_httpd_request_methods_GET={ + 'description': 'Number of HTTP GET requests'}, + couchdb_httpd_request_methods_HEAD={ + 'description': 'Number of HTTP HEAD requests'}, + couchdb_httpd_request_methods_POST={ + 'description': 'Number of HTTP POST requests'}, + couchdb_httpd_request_methods_PUT={ + 'description': 'Number of HTTP PUT requests'}, + couchdb_httpd_status_codes_200={ + 'units': 'responses/s', + 'description': 'Number of HTTP 200 OK responses'}, + couchdb_httpd_status_codes_201={ + 'units': 'responses/s', + 'description': 'Number of HTTP 201 Created responses'}, + couchdb_httpd_status_codes_202={ + 'units': 'responses/s', + 'description': 'Number of HTTP 202 Accepted responses'}, + couchdb_httpd_status_codes_301={ + 'units': 'responses/s', + 'description': 'Number of HTTP 301 Moved Permanently responses'}, + couchdb_httpd_status_codes_304={ + 'units': 'responses/s', + 'description': 'Number of HTTP 304 Not Modified responses'}, + couchdb_httpd_status_codes_400={ + 'units': 'responses/s', + 'description': 'Number of HTTP 400 Bad Request responses'}, + couchdb_httpd_status_codes_401={ + 'units': 'responses/s', + 'description': 'Number of HTTP 401 Unauthorized responses'}, + couchdb_httpd_status_codes_403={ + 'units': 'responses/s', + 'description': 'Number of HTTP 403 Forbidden responses'}, + couchdb_httpd_status_codes_404={ + 'units': 'responses/s', + 'description': 'Number of HTTP 404 Not Found responses'}, + couchdb_httpd_status_codes_405={ + 'units': 'responses/s', + 'description': 'Number of HTTP 405 Method Not Allowed responses'}, + couchdb_httpd_status_codes_409={ + 'units': 'responses/s', 
+ 'description': 'Number of HTTP 409 Conflict responses'}, + couchdb_httpd_status_codes_412={ + 'units': 'responses/s', + 'description': 'Number of HTTP 412 Precondition Failed responses'}, + couchdb_httpd_status_codes_500={ + 'units': 'responses/s', + 'description': 'Number of HTTP 500 Internal Server Error responses'}) + + if _Worker_Thread is not None: + raise Exception('Worker thread already exists') + + _Worker_Thread = UpdateCouchdbThread(params) + _Worker_Thread.refresh_metrics() + _Worker_Thread.start() + + descriptors = [] + + for name, desc in descriptions.iteritems(): + d = desc.copy() + d['name'] = str(name) + [ d.setdefault(key, METRIC_DEFAULTS[key]) for key in METRIC_DEFAULTS.iterkeys() ] + descriptors.append(d) + + return descriptors + +def metric_of(name): + global _Worker_Thread + return _Worker_Thread.metric_of(name) + +def setting_of(name): + global _Worker_Thread + return _Worker_Thread.setting_of(name) + +def metric_cleanup(): + global _Worker_Thread + if _Worker_Thread is not None: + _Worker_Thread.shutdown() + logging.shutdown() + pass + +if __name__ == '__main__': + from optparse import OptionParser + + try: + logging.debug('running from the cmd line') + parser = OptionParser() + parser.add_option('-u', '--URL', dest='stats_url', default='http://127.0.0.1:5984/_stats', help='URL for couchdb stats page') + parser.add_option('-q', '--quiet', dest='quiet', action='store_true', default=False) + parser.add_option('-r', '--refresh-rate', dest='refresh_rate', default=60) + parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False) + + (options, args) = parser.parse_args() + + descriptors = metric_init({ + 'stats_url': options.stats_url, + 'refresh_rate': options.refresh_rate + }) + + if options.debug: + from pprint import pprint + pprint(descriptors) + + for d in descriptors: + v = d['call_back'](d['name']) + + if not options.quiet: + print ' {0}: {1} {2} [{3}]' . 
format(d['name'], v, d['units'], d['description']) + + os._exit(1) + + except KeyboardInterrupt: + time.sleep(0.2) + os._exit(1) + except StandardError: + traceback.print_exc() + os._exit(1) + finally: + metric_cleanup() diff --git a/couchdb/tasks/couchdb-old-package.yml b/couchdb/tasks/couchdb-1.yml similarity index 100% rename from couchdb/tasks/couchdb-old-package.yml rename to couchdb/tasks/couchdb-1.yml diff --git a/couchdb/tasks/ganglia-plugin.yml b/couchdb/tasks/ganglia-plugin.yml new file mode 100644 index 00000000..3d1f4460 --- /dev/null +++ b/couchdb/tasks/ganglia-plugin.yml @@ -0,0 +1,13 @@ +--- +# +# The ganglia plugin comes from https://github.com/ganglia/gmond_python_modules +# +- name: Install the ganglia plugin for Couchdb + copy: src=couchdb.py dest=/usr/lib/ganglia/python_modules/couchdb.py owner=root group=root mode=0644 + notify: Restart ganglia monitor + tags: ganglia + +- name: Distribute the ganglia (gmond) configuration for the Couchdb plugin + template: src=couchdb.pyconf.j2 dest=/etc/ganglia/conf.d/couchdb.pyconf owner=root group=root mode=444 + notify: Restart ganglia monitor + tags: ganglia diff --git a/couchdb/tasks/main.yml b/couchdb/tasks/main.yml index a741615a..a561252d 100644 --- a/couchdb/tasks/main.yml +++ b/couchdb/tasks/main.yml @@ -1,5 +1,7 @@ --- -- include: couchdb-old-package.yml +- include: couchdb-1.yml when: couchdb_use_old_package - include: couchdb-2.yml when: not couchdb_use_old_package +- include: ganglia-plugin.yml + when: ganglia_enabled diff --git a/couchdb/templates/couchdb.pyconf.j2 b/couchdb/templates/couchdb.pyconf.j2 new file mode 100644 index 00000000..83e8905a --- /dev/null +++ b/couchdb/templates/couchdb.pyconf.j2 @@ -0,0 +1,207 @@ +# + +modules { + module { + name = 'couchdb' + language = 'python' + + param stats_url { + value = '{{ couchdb_ganglia_url }}' + } + + param refresh_rate { + value = '{{ couchdb_ganglia_refresh_rate }}' + } + } +} + +collection_group { + collect_every = 10 + time_threshold = 20 
+ + metric { + name = 'couchdb_couchdb_auth_cache_hits' + title = 'Number of authentication cache hits' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_auth_cache_misses' + title = 'Number of authentication cache misses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_database_reads' + title = 'Number of times a document was read from a database' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_database_writes' + title = 'Number of times a document was changed' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_open_databases' + title = 'Number of open databases' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_open_os_files' + title = 'Number of file descriptors CouchDB has open' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_request_time' + title = 'Request Time' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_bulk_requests' + title = 'Number of bulk requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_clients_requesting_changes' + title = 'Number of clients for continuous _changes' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_requests' + title = 'Number of HTTP requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_temporary_view_reads' + title = 'Number of temporary view reads' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_view_reads' + title = 'Number of view reads' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_COPY' + title = 'Number of HTTP COPY requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_DELETE' + title = 'Number of HTTP DELETE requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_GET' + title = 'Number of HTTP GET requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_HEAD' + title = 'Number of HTTP 
HEAD requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_POST' + title = 'Number of HTTP POST requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_PUT' + title = 'Number of HTTP PUT requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_200' + title = 'Number of HTTP 200 OK responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_201' + title = 'Number of HTTP 201 Created responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_202' + title = 'Number of HTTP 202 Accepted responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_301' + title = 'Number of HTTP 301 Moved Permanently responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_304' + title = 'Number of HTTP 304 Not Modified responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_400' + title = 'Number of HTTP 400 Bad Request responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_401' + title = 'Number of HTTP 401 Unauthorized responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_403' + title = 'Number of HTTP 403 Forbidden responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_404' + title = 'Number of HTTP 404 Not Found responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_405' + title = 'Number of HTTP 405 Method Not Allowed responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_409' + title = 'Number of HTTP 409 Conflict responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_412' + title = 'Number of HTTP 412 Precondition Failed responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_500' + title = 
'Number of HTTP 500 Internal Server Error responses' + value_threshold = 1.0 + } +} diff --git a/ganglia/defaults/main.yml b/ganglia/defaults/main.yml index 374af76e..906bca9c 100644 --- a/ganglia/defaults/main.yml +++ b/ganglia/defaults/main.yml @@ -1,10 +1,12 @@ # These are for reference only. # Define your own set of variables # -ganglia_gmond_cluster: "CNR-ISTI NeMIS Cluster" -ganglia_gmond_cluster_port: 8649 -ganglia_gmond_mcast_addr: 239.2.11.71 -ganglia_gmetad_host: monitoring.research-infrastructures.eu +#ganglia_gmond_cluster: "Ganglia Cluster" +#ganglia_gmond_cluster_port: 8649 +#ganglia_gmond_mcast_addr: 239.2.11.71 +#ganglia_gmetad_host: ganglia-gmetad ganglia_gmond_send_metadata_interval: 60 # Needed to build the correct firewall rules when jmxtrans is in use ganglia_gmond_use_jmxtrans: False +# Used by other roles to install specific ganglia iptables rules or some specific ganglia plugins. Or not. +ganglia_enabled: False diff --git a/mongodb-org/files/mongodb.py b/mongodb-org/files/mongodb.py new file mode 100755 index 00000000..7ef34bed --- /dev/null +++ b/mongodb-org/files/mongodb.py @@ -0,0 +1,496 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# MongoDB gmond module for Ganglia +# +# Copyright (C) 2011 by Michael T. Conigliaro . +# All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +import json +import os +import re +import socket +import string +import time +import copy + +NAME_PREFIX = 'mongodb_' +PARAMS = { + 'server_status' : '~/mongodb-osx-x86_64-1.8.1/bin/mongo --host mongodb04.example.com --port 27018 --quiet --eval "printjson(db.serverStatus())"', + 'rs_status' : '~/mongodb-osx-x86_64-1.8.1/bin/mongo --host mongodb04.example.com --port 27018 --quiet --eval "printjson(rs.status())"' +} +METRICS = { + 'time' : 0, + 'data' : {} +} +LAST_METRICS = copy.deepcopy(METRICS) +METRICS_CACHE_TTL = 3 + + +def flatten(d, pre = '', sep = '_'): + """Flatten a dict (i.e. 
dict['a']['b']['c'] => dict['a_b_c'])""" + + new_d = {} + for k,v in d.items(): + if type(v) == dict: + new_d.update(flatten(d[k], '%s%s%s' % (pre, k, sep))) + else: + new_d['%s%s' % (pre, k)] = v + return new_d + + +def get_metrics(): + """Return all metrics""" + + global METRICS, LAST_METRICS + + if (time.time() - METRICS['time']) > METRICS_CACHE_TTL: + + metrics = {} + for status_type in PARAMS.keys(): + + # get raw metric data + io = os.popen(PARAMS[status_type]) + + # clean up + metrics_str = ''.join(io.readlines()).strip() # convert to string + metrics_str = re.sub('\w+\((.*)\)', r"\1", metrics_str) # remove functions + + # convert to flattened dict + try: + if status_type == 'server_status': + metrics.update(flatten(json.loads(metrics_str))) + else: + metrics.update(flatten(json.loads(metrics_str), pre='%s_' % status_type)) + except ValueError: + metrics = {} + + # update cache + LAST_METRICS = copy.deepcopy(METRICS) + METRICS = { + 'time': time.time(), + 'data': metrics + } + + return [METRICS, LAST_METRICS] + + +def get_value(name): + """Return a value for the requested metric""" + + # get metrics + metrics = get_metrics()[0] + + # get value + name = name[len(NAME_PREFIX):] # remove prefix from name + try: + result = metrics['data'][name] + except StandardError: + result = 0 + + return result + + +def get_rate(name): + """Return change over time for the requested metric""" + + # get metrics + [curr_metrics, last_metrics] = get_metrics() + + # get rate + name = name[len(NAME_PREFIX):] # remove prefix from name + + try: + rate = float(curr_metrics['data'][name] - last_metrics['data'][name]) / \ + float(curr_metrics['time'] - last_metrics['time']) + if rate < 0: + rate = float(0) + except StandardError: + rate = float(0) + + return rate + + +def get_opcounter_rate(name): + """Return change over time for an opcounter metric""" + + master_rate = get_rate(name) + repl_rate = get_rate(name.replace('opcounters_', 'opcountersRepl_')) + + return master_rate + 
repl_rate + + +def get_globalLock_ratio(name): + """Return the global lock ratio""" + + try: + result = get_rate(NAME_PREFIX + 'globalLock_lockTime') / \ + get_rate(NAME_PREFIX + 'globalLock_totalTime') * 100 + except ZeroDivisionError: + result = 0 + + return result + + +def get_indexCounters_btree_miss_ratio(name): + """Return the btree miss ratio""" + + try: + result = get_rate(NAME_PREFIX + 'indexCounters_btree_misses') / \ + get_rate(NAME_PREFIX + 'indexCounters_btree_accesses') * 100 + except ZeroDivisionError: + result = 0 + + return result + + +def get_connections_current_ratio(name): + """Return the percentage of connections used""" + + try: + result = float(get_value(NAME_PREFIX + 'connections_current')) / \ + float(get_value(NAME_PREFIX + 'connections_available')) * 100 + except ZeroDivisionError: + result = 0 + + return result + + +def get_slave_delay(name): + """Return the replica set slave delay""" + + # get metrics + metrics = get_metrics()[0] + + # no point checking my optime if i'm not replicating + if 'rs_status_myState' not in metrics['data'] or metrics['data']['rs_status_myState'] != 2: + result = 0 + + # compare my optime with the master's + else: + master = {} + slave = {} + try: + for member in metrics['data']['rs_status_members']: + if member['state'] == 1: + master = member + if member['name'].split(':')[0] == socket.getfqdn(): + slave = member + result = max(0, master['optime']['t'] - slave['optime']['t']) / 1000 + except KeyError: + result = 0 + + return result + + +def get_asserts_total_rate(name): + """Return the total number of asserts per second""" + + return float(reduce(lambda memo,obj: memo + get_rate('%sasserts_%s' % (NAME_PREFIX, obj)), + ['regular', 'warning', 'msg', 'user', 'rollovers'], 0)) + + +def metric_init(lparams): + """Initialize metric descriptors""" + + global PARAMS + + # set parameters + for key in lparams: + PARAMS[key] = lparams[key] + + # define descriptors + time_max = 60 + groups = 'mongodb' + descriptors = [ + 
{ + 'name': NAME_PREFIX + 'opcounters_insert', + 'call_back': get_opcounter_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Inserts/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Inserts', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'opcounters_query', + 'call_back': get_opcounter_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Queries/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Queries', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'opcounters_update', + 'call_back': get_opcounter_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Updates/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Updates', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'opcounters_delete', + 'call_back': get_opcounter_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Deletes/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Deletes', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'opcounters_getmore', + 'call_back': get_opcounter_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Getmores/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Getmores', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'opcounters_command', + 'call_back': get_opcounter_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Commands/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Commands', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'backgroundFlushing_flushes', + 'call_back': get_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Flushes/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Flushes', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'mem_mapped', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'MB', + 'slope': 'both', + 'format': '%u', + 'description': 'Memory-mapped Data', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 
'mem_virtual', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'MB', + 'slope': 'both', + 'format': '%u', + 'description': 'Process Virtual Size', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'mem_resident', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'MB', + 'slope': 'both', + 'format': '%u', + 'description': 'Process Resident Size', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'extra_info_page_faults', + 'call_back': get_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Faults/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Page Faults', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'globalLock_ratio', + 'call_back': get_globalLock_ratio, + 'time_max': time_max, + 'value_type': 'float', + 'units': '%', + 'slope': 'both', + 'format': '%f', + 'description': 'Global Write Lock Ratio', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'indexCounters_btree_miss_ratio', + 'call_back': get_indexCounters_btree_miss_ratio, + 'time_max': time_max, + 'value_type': 'float', + 'units': '%', + 'slope': 'both', + 'format': '%f', + 'description': 'BTree Page Miss Ratio', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'globalLock_currentQueue_total', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Operations', + 'slope': 'both', + 'format': '%u', + 'description': 'Total Operations Waiting for Lock', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'globalLock_currentQueue_readers', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Operations', + 'slope': 'both', + 'format': '%u', + 'description': 'Readers Waiting for Lock', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'globalLock_currentQueue_writers', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Operations', + 'slope': 'both', + 'format': '%u', + 'description': 'Writers 
Waiting for Lock', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'globalLock_activeClients_total', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Clients', + 'slope': 'both', + 'format': '%u', + 'description': 'Total Active Clients', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'globalLock_activeClients_readers', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Clients', + 'slope': 'both', + 'format': '%u', + 'description': 'Active Readers', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'globalLock_activeClients_writers', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Clients', + 'slope': 'both', + 'format': '%u', + 'description': 'Active Writers', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'connections_current', + 'call_back': get_value, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Connections', + 'slope': 'both', + 'format': '%u', + 'description': 'Open Connections', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'connections_current_ratio', + 'call_back': get_connections_current_ratio, + 'time_max': time_max, + 'value_type': 'float', + 'units': '%', + 'slope': 'both', + 'format': '%f', + 'description': 'Percentage of Connections Used', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'slave_delay', + 'call_back': get_slave_delay, + 'time_max': time_max, + 'value_type': 'uint', + 'units': 'Seconds', + 'slope': 'both', + 'format': '%u', + 'description': 'Replica Set Slave Delay', + 'groups': groups + }, + { + 'name': NAME_PREFIX + 'asserts_total', + 'call_back': get_asserts_total_rate, + 'time_max': time_max, + 'value_type': 'float', + 'units': 'Asserts/Sec', + 'slope': 'both', + 'format': '%f', + 'description': 'Asserts', + 'groups': groups + } + ] + + return descriptors + + +def metric_cleanup(): + """Cleanup""" + + pass + + +# the following code is for debugging and testing +if __name__ == 
'__main__': + descriptors = metric_init(PARAMS) + while True: + for d in descriptors: + print (('%s = %s') % (d['name'], d['format'])) % (d['call_back'](d['name'])) + print '' + time.sleep(METRICS_CACHE_TTL) diff --git a/mongodb-org/tasks/ganglia-plugin.yml b/mongodb-org/tasks/ganglia-plugin.yml new file mode 100644 index 00000000..38f79508 --- /dev/null +++ b/mongodb-org/tasks/ganglia-plugin.yml @@ -0,0 +1,14 @@ +--- +# +# The ganglia plugin comes from https://github.com/ganglia/gmond_python_modules +# +- name: Install the ganglia plugin for MongoDB + copy: src=mongodb.py dest=/usr/lib/ganglia/python_modules/mongodb.py owner=root group=root mode=0444 + notify: Restart ganglia monitor + tags: [ 'ganglia', 'mongodb' ] + +- name: Distribute the ganglia (gmond) configuration for the MongoDB plugin + template: src=mongodb.pyconf.j2 dest=/etc/ganglia/conf.d/mongodb.pyconf owner=root group=root mode=0444 + notify: Restart ganglia monitor + tags: [ 'ganglia', 'mongodb' ] + diff --git a/mongodb-org/tasks/main.yml b/mongodb-org/tasks/main.yml index 3dd55847..c3ad3de4 100644 --- a/mongodb-org/tasks/main.yml +++ b/mongodb-org/tasks/main.yml @@ -1,56 +1,4 @@ --- -- name: Install the mongodb apt key - #apt_key: id=7F0CEB10 state=present - raw: apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10 - when: mongodb_install_from_external_repo - tags: mongodb - -- name: Install the mongodb repository - copy: content="deb http://downloads-distro.mongodb.org/repo/ubuntu-upstart dist 10gen" dest=/etc/apt/sources.list.d/mongodb.list owner=root group=root mode=044 - when: mongodb_install_from_external_repo - register: external_repo - tags: mongodb - -- name: Install the latest version of mongodb server - apt: pkg={{ item }} state={{ mongodb_pkg_state }} update_cache=yes - with_items: - - mongodb-org - when: - - mongodb_install_from_external_repo - - mongodb_install_packages - tags: mongodb - -- name: Install the mongodb defaults file - copy: content="ENABLE_MONGODB={{ 
mongodb_start_server }}" dest=/etc/default/mongodb owner=root group=root mode=0444 - when: mongodb_install_conf - tags: mongodb - -- name: Create the mongodb db directory - file: dest={{ mongodb_dbpath }} state=directory owner={{ mongodb_user }} group={{ mongodb_group }} mode=0755 - when: mongodb_install_conf - tags: mongodb - -- name: Create the mongodb log directory - file: dest={{ mongodb_logdir }} state=directory owner={{ mongodb_user }} group={{ mongodb_group }} mode=0755 - when: mongodb_install_conf - tags: mongodb - -- name: Install the mongodb 2.6 configuration - template: src=mongod-2.6.conf.j2 dest=/etc/mongod.conf owner=root group=root mode=0444 - when: mongodb_install_conf - tags: mongodb - -- name: Install the cron job that manages log files rotation - template: src=mongo_log_rotate.sh.j2 dest=/etc/cron.daily/mongo_log_rotate owner=root group=root mode=0555 - tags: [ 'mongodb', 'mongo_logrotate' ] - -- name: Ensure mongodb is started - service: name=mongod state=started enabled=yes - when: ( mongodb_start_server is defined ) and ( mongodb_start_server == 'yes' ) and ( mongodb_install_conf ) - tags: mongodb - -- name: Ensure mongod is stopped and disabled - service: name=mongod state=stopped enabled=no - when: ( mongodb_start_server is defined ) and ( mongodb_start_server == 'no' ) and ( mongodb_install_conf ) - tags: mongodb - +- include: mongodb.yml +- include: ganglia-plugin.yml + when: ganglia_enabled diff --git a/mongodb-org/tasks/mongodb.yml b/mongodb-org/tasks/mongodb.yml new file mode 100644 index 00000000..c5c88a42 --- /dev/null +++ b/mongodb-org/tasks/mongodb.yml @@ -0,0 +1,55 @@ +--- +- name: Install the mongodb apt key + #apt_key: id=7F0CEB10 state=present + raw: apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10 + when: mongodb_install_from_external_repo + tags: mongodb + +- name: Install the mongodb repository + copy: content="deb http://downloads-distro.mongodb.org/repo/ubuntu-upstart dist 10gen" 
dest=/etc/apt/sources.list.d/mongodb.list owner=root group=root mode=0444 + when: mongodb_install_from_external_repo + register: external_repo + tags: mongodb + +- name: Install the latest version of mongodb server + apt: pkg={{ item }} state={{ mongodb_pkg_state }} update_cache=yes + with_items: + - mongodb-org + when: + - mongodb_install_from_external_repo + - mongodb_install_packages + tags: mongodb + +- name: Install the mongodb defaults file + copy: content="ENABLE_MONGODB={{ mongodb_start_server }}" dest=/etc/default/mongodb owner=root group=root mode=0444 + when: mongodb_install_conf + tags: mongodb + +- name: Create the mongodb db directory + file: dest={{ mongodb_dbpath }} state=directory owner={{ mongodb_user }} group={{ mongodb_group }} mode=0755 + when: mongodb_install_conf + tags: mongodb + +- name: Create the mongodb log directory + file: dest={{ mongodb_logdir }} state=directory owner={{ mongodb_user }} group={{ mongodb_group }} mode=0755 + when: mongodb_install_conf + tags: mongodb + +- name: Install the mongodb 2.6 configuration + template: src=mongod-2.6.conf.j2 dest=/etc/mongod.conf owner=root group=root mode=0444 + when: mongodb_install_conf + tags: mongodb + +- name: Install the cron job that manages log files rotation + template: src=mongo_log_rotate.sh.j2 dest=/etc/cron.daily/mongo_log_rotate owner=root group=root mode=0555 + tags: [ 'mongodb', 'mongo_logrotate' ] + +- name: Ensure mongodb is started + service: name=mongod state=started enabled=yes + when: ( mongodb_start_server is defined ) and ( mongodb_start_server == 'yes' ) and ( mongodb_install_conf ) + tags: mongodb + +- name: Ensure mongod is stopped and disabled + service: name=mongod state=stopped enabled=no + when: ( mongodb_start_server is defined ) and ( mongodb_start_server == 'no' ) and ( mongodb_install_conf ) + tags: mongodb diff --git a/mongodb-org/templates/mongodb.pyconf.j2 b/mongodb-org/templates/mongodb.pyconf.j2 new file mode 100644 index 00000000..a23f9b1c --- /dev/null 
+++ b/mongodb-org/templates/mongodb.pyconf.j2 @@ -0,0 +1,109 @@ +modules { + module { + name = "mongodb" + language = "python" + param server_status { + value = "mongo --quiet --eval 'printjson(db.serverStatus())'" + } + param rs_status { + value = "mongo --quiet --eval 'printjson(rs.status())'" + } + } +} + +collection_group { + collect_every = 30 + time_threshold = 90 + metric { + name = "mongodb_opcounters_insert" + title = "Inserts" + } + metric { + name = "mongodb_opcounters_query" + title = "Queries" + } + metric { + name = "mongodb_opcounters_update" + title = "Updates" + } + metric { + name = "mongodb_opcounters_delete" + title = "Deletes" + } + metric { + name = "mongodb_opcounters_getmore" + title = "Getmores" + } + metric { + name = "mongodb_opcounters_command" + title = "Commands" + } + metric { + name = "mongodb_backgroundFlushing_flushes" + title = "Flushes" + } + metric { + name = "mongodb_mem_mapped" + title = "Memory-mapped Data" + } + metric { + name = "mongodb_mem_virtual" + title = "Process Virtual Size" + } + metric { + name = "mongodb_mem_resident" + title = "Process Resident Size" + } + metric { + name = "mongodb_extra_info_page_faults" + title = "Page Faults" + } + metric { + name = "mongodb_globalLock_ratio" + title = "Global Write Lock Ratio" + } + metric { + name = "mongodb_indexCounters_btree_miss_ratio" + title = "BTree Page Miss Ratio" + } + metric { + name = "mongodb_globalLock_currentQueue_total" + title = "Total Operations Waiting for Lock" + } + metric { + name = "mongodb_globalLock_currentQueue_readers" + title = "Readers Waiting for Lock" + } + metric { + name = "mongodb_globalLock_currentQueue_writers" + title = "Writers Waiting for Lock" + } + metric { + name = "mongodb_globalLock_activeClients_total" + title = "Total Active Clients" + } + metric { + name = "mongodb_globalLock_activeClients_readers" + title = "Active Readers" + } + metric { + name = "mongodb_globalLock_activeClients_writers" + title = "Active Writers" + } + 
metric { + name = "mongodb_connections_current" + title = "Open Connections" + } + metric { + name = "mongodb_connections_current_ratio" + title = "Percentage of Connections Used" + } + metric { + name = "mongodb_slave_delay" + title = "Replica Set Slave Delay" + } + metric { + name = "mongodb_asserts_total" + title = "Asserts per Second" + } +}