# coding=utf-8
#
# ====== GlusterFS Diamond Collector ======
#
# The initial test version of this collector. It is a quick hack to verify
# that I can get the metrics into influxdb/grafana/graphite. The next version
# will be using the XML output of the gluster utility to gather the metrics.
# Let me know how I can improve it.
#
# UPDATE: version 0.2 beta now uses the XML output of the gluster command to
# get the information needed.
#
# UPDATE: version 0.3 beta now has the target_brick and target_volume
# configuration options available (defaults to all bricks and volumes) and
# provides all latency/hit related metrics. May consider adding further
# metrics if needed.
#
# TODO: add more metrics and enable targeted gathering per node/brick and
# volume.
#
# Thanks

"""
The GlusterFSCollector collects per-brick, per-file-operation latency and
hit metrics from the GlusterFS storage system.

version 0.3 beta

Documentation for GlusterFS profiling:
http://gluster.readthedocs.org/en/latest/Administrator%20Guide/Monitoring%20Workload/

#### Dependencies

 * glusterfs [https://www.gluster.org/]
 * Profiling enabled: gluster volume profile VOLNAME start

"""

import shlex
import subprocess
import sys

import diamond.collector

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

# Not read by the collector (collect() takes everything from self.config);
# retained so the module keeps exporting the same names.
metric_base = "glusterfs."
target_volume = ''
target_brick = ''


class GlusterFSCollector(diamond.collector.Collector):
    """Publish GlusterFS profiling statistics for each brick and operation.

    For every selected volume the collector runs
    ``gluster volume profile VOLUME info cumulative --xml`` and publishes,
    for every file operation listed under a brick's cumulative statistics,
    the metrics ``pctLatency``, ``hits``, ``avgLatency``, ``totalLatency``,
    ``minLatency`` and ``maxLatency`` under
    ``<volume>.<brick host>.<operation>.``.
    """

    def get_default_config_help(self):
        """Return the help text for this collector's configuration keys."""
        config_help = super(GlusterFSCollector,
                            self).get_default_config_help()
        config_help.update({
            'gluster_path': 'complete path to gluster binary.'
                            ' Defaults to /usr/sbin/gluster',
            'target_volume': 'which volume to send info on.'
                             ' Defaults to all',
            'target_brick': 'which node/server to send metrics for.'
                            ' Defaults to all',
        })
        return config_help

    def get_default_config(self):
        """Return the default configuration for this collector."""
        config = super(GlusterFSCollector, self).get_default_config()
        config.update({
            'path': 'glusterfs',
            'gluster_path': '/usr/sbin/gluster',
            'target_volume': '',
            'target_brick': '',
        })
        return config

    def collect(self):
        """Gather and publish profiling metrics for the selected volumes.

        An empty ``target_volume`` selects every volume reported by
        ``gluster volume list``. Failures of the gluster command are logged
        and the affected volume is skipped; nothing is raised for them.
        """
        # shlex.split keeps a configured value such as "sudo /usr/sbin/gluster"
        # working now that the command no longer goes through a shell.
        base_cmd = shlex.split(self.config['gluster_path'])

        listing = self._run_gluster(base_cmd + ['volume', 'list'], text=True)
        if listing is None:
            return

        wanted_volume = self.config['target_volume']
        for line in listing.splitlines():
            volume = line.strip()
            if not volume:
                continue
            if wanted_volume and volume != wanted_volume:
                continue
            self._collect_volume(base_cmd, volume)

    def _run_gluster(self, command, text=False):
        """Run *command* (an argv list) and return its stdout, or None.

        With ``text=True`` the output is returned as a native string;
        otherwise the raw output is returned so the XML parser can apply the
        document's own encoding. Returns None, after logging, when the
        binary cannot be started or exits with a non-zero status.
        """
        try:
            proc = subprocess.Popen(command,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    universal_newlines=text)
        except OSError as exc:
            self.log.error("GlusterFSCollector: unable to run %s: %s",
                           command[0], exc)
            return None

        out, err = proc.communicate()
        if proc.returncode != 0:
            self.log.error("GlusterFSCollector: '%s' exited with %s: %s",
                           ' '.join(command), proc.returncode, err)
            return None
        return out

    def _collect_volume(self, base_cmd, volume):
        """Fetch the cumulative profile of *volume* and publish its bricks."""
        raw_metrics = self._run_gluster(
            base_cmd + ['volume', 'profile', volume,
                        'info', 'cumulative', '--xml'])
        if raw_metrics is None:
            return

        try:
            root = ET.XML(raw_metrics)
        except ET.ParseError as exc:
            self.log.error("GlusterFSCollector: bad XML for volume %s: %s",
                           volume, exc)
            return

        profile = root.find('volProfile')
        if profile is None:
            # NOTE(review): presumably profiling is not started for this
            # volume when the element is absent -- confirm against gluster.
            self.log.error("GlusterFSCollector: no volProfile element for "
                           "volume %s", volume)
            return

        wanted_brick = self.config['target_brick']
        for brick in profile.findall('brick'):
            brick_id = brick.findtext('brickName')
            if not brick_id:
                continue
            # Only the part before the first ':' is used for filtering and
            # naming (presumably the host in "host:/path" -- TODO confirm).
            # Dots in that part become extra levels in the metric path.
            brick_name = brick_id.split(':')[0]
            if wanted_brick and brick_name != wanted_brick:
                continue

            fop_stats = brick.find('cumulativeStats/fopStats')
            if fop_stats is None:
                continue
            self._publish_brick(volume + "." + brick_name, fop_stats)

    def _publish_brick(self, prefix, fop_stats):
        """Publish the metrics of every operation under *fop_stats*.

        *prefix* is the ``<volume>.<brick>`` metric prefix and *fop_stats*
        the XML element whose children each describe one file operation.
        """
        stats = {}
        grand_total = 0.0
        for fop in fop_stats:
            name = fop.findtext('name')
            try:
                hits = int(fop.findtext('hits'))
                avg_latency = float(fop.findtext('avgLatency'))
                min_latency = float(fop.findtext('minLatency'))
                max_latency = float(fop.findtext('maxLatency'))
            except (TypeError, ValueError):
                # A missing or non-numeric field: skip just this operation.
                self.log.warning("GlusterFSCollector: skipping malformed "
                                 "entry %r under %s", name, prefix)
                continue
            if not name:
                continue

            total_latency = avg_latency * hits
            grand_total += total_latency
            stats[name] = (hits, avg_latency, total_latency,
                           min_latency, max_latency)

        for name, values in stats.items():
            hits, avg_latency, total_latency, min_latency, max_latency = values
            # Avoid dividing by zero when no latency was recorded at all.
            if grand_total:
                pct_latency = (total_latency / grand_total) * 100
            else:
                pct_latency = 0.0

            fop_prefix = prefix + "." + name
            self.publish(fop_prefix + ".pctLatency", pct_latency)
            self.publish(fop_prefix + ".hits", hits)
            self.publish(fop_prefix + ".avgLatency", avg_latency)
            self.publish(fop_prefix + ".totalLatency", total_latency)
            self.publish(fop_prefix + ".minLatency", min_latency)
            self.publish(fop_prefix + ".maxLatency", max_latency)