diff --git a/collectors/0/couchbase.py b/collectors/0/couchbase.py index 46dc2e17..572a0b92 100755 --- a/collectors/0/couchbase.py +++ b/collectors/0/couchbase.py @@ -21,154 +21,162 @@ COLLECTION_INTERVAL = CONFIG['collection_interval'] COUCHBASE_INITFILE = CONFIG['couchbase_initfile'] -KEYS = frozenset( [ - 'bucket_active_conns', - 'cas_hits', - 'cas_misses', - 'cmd_get', - 'cmd_set', - 'curr_connections', - 'curr_conns_on_port_11209', - 'curr_conns_on_port_11210', - 'ep_queue_size', - 'ep_num_value_ejects', - 'ep_num_eject_failures', - 'ep_oom_errors', - 'ep_tmp_oom_errors', - 'get_hits', - 'get_misses', - 'mem_used', - 'total_connections', - 'total_heap_bytes', - 'total_free_bytes', - 'total_allocated_bytes', - 'total_fragmentation_bytes', - 'tcmalloc_current_thread_cache_bytes', - 'tcmalloc_max_thread_cache_bytes', - 'tcmalloc_unmapped_bytes', - ] ) +KEYS = frozenset([ + 'bucket_active_conns', + 'cas_hits', + 'cas_misses', + 'cmd_get', + 'cmd_set', + 'curr_connections', + 'curr_conns_on_port_11209', + 'curr_conns_on_port_11210', + 'ep_queue_size', + 'ep_num_value_ejects', + 'ep_num_eject_failures', + 'ep_oom_errors', + 'ep_tmp_oom_errors', + 'get_hits', + 'get_misses', + 'mem_used', + 'total_connections', + 'total_heap_bytes', + 'total_free_bytes', + 'total_allocated_bytes', + 'total_fragmentation_bytes', + 'tcmalloc_current_thread_cache_bytes', + 'tcmalloc_max_thread_cache_bytes', + 'tcmalloc_unmapped_bytes', +]) + def find_couchbase_pid(): - """Find out the pid of couchbase""" - if not os.path.isfile(COUCHBASE_INITFILE): - return - - try: - fd = open(COUCHBASE_INITFILE) - for line in fd: - if line.startswith("exec"): - init_script = line.split()[1] - fd.close() - except IOError: - utils.err("Check permission of file (%s)" % COUCHBASE_INITFILE) - return - - try: - fd = open(init_script) - for line in fd: - if line.startswith("PIDFILE"): - pid_file = line.split("=")[1].rsplit()[0] - fd.close() - except IOError: - utils.err("Check permission of file (%s)" % init_script) - return - - try: - fd = open(pid_file) - pid = fd.read() - fd.close() - except IOError: - utils.err("Couchbase-server is not running, since no pid file exists") - sys.exit(13) - - return pid.split()[0] + """Find out the pid of couchbase""" + if not os.path.isfile(COUCHBASE_INITFILE): + return + + try: + fd = open(COUCHBASE_INITFILE) + for line in fd: + if line.startswith("exec"): + init_script = line.split()[1] + fd.close() + except IOError: + utils.err("Check permission of file (%s)" % COUCHBASE_INITFILE) + return + + try: + fd = open(init_script) + for line in fd: + if line.startswith("PIDFILE"): + pid_file = line.split("=")[1].rsplit()[0] + fd.close() + except IOError: + utils.err("Check permission of file (%s)" % init_script) + return + + try: + fd = open(pid_file) + pid = fd.read() + fd.close() + except IOError: + utils.err("Couchbase-server is not running, since no pid file exists") + sys.exit(13) + + return pid.split()[0] + def find_conf_file(pid): - """Returns config file for couchbase-server.""" - try: - fd = open('/proc/%s/cmdline' % pid) - except IOError as e: - utils.err("Couchbase (pid %s) went away ? %s" % (pid, e)) - return - try: - config = fd.read().split("config_path")[1].split("\"")[1] - return config - finally: - fd.close() + """Returns config file for couchbase-server.""" + try: + fd = open('/proc/%s/cmdline' % pid) + except IOError as e: + utils.err("Couchbase (pid %s) went away ? 
%s" % (pid, e)) + return + try: + config = fd.read().split("config_path")[1].split("\"")[1] + return config + finally: + fd.close() + def find_bindir_path(config_file): - """Returns the bin directory path""" - try: - fd = open(config_file) - except IOError as e: - utils.err("Error for Config file (%s): %s" % (config_file, e)) - return None - try: - for line in fd: - if line.startswith("{path_config_bindir"): - return line.split(",")[1].split("\"")[1] - finally: - fd.close() + """Returns the bin directory path""" + try: + fd = open(config_file) + except IOError as e: + utils.err("Error for Config file (%s): %s" % (config_file, e)) + return None + try: + for line in fd: + if line.startswith("{path_config_bindir"): + return line.split(",")[1].split("\"")[1] + finally: + fd.close() + def list_bucket(bin_dir): - """Returns the list of memcached or membase buckets""" - buckets = [] - if not os.path.isfile("%s/couchbase-cli" % bin_dir): + """Returns the list of memcached or membase buckets""" + buckets = [] + if not os.path.isfile("%s/couchbase-cli" % bin_dir): + return buckets + cli = ("%s/couchbase-cli" % bin_dir) + try: + buck = subprocess.check_output([cli, "bucket-list", "--cluster", + "localhost:8091"]) + except subprocess.CalledProcessError: + return buckets + regex = re.compile("[\s\w]+:[\s\w]+$") + for i in buck.splitlines(): + if not regex.match(i): + buckets.append(i) return buckets - cli = ("%s/couchbase-cli" % bin_dir) - try: - buck = subprocess.check_output([cli, "bucket-list", "--cluster", - "localhost:8091"]) - except subprocess.CalledProcessError: - return buckets - regex = re.compile("[\s\w]+:[\s\w]+$") - for i in buck.splitlines(): - if not regex.match(i): - buckets.append(i) - return buckets + def collect_stats(bin_dir, bucket): - """Returns statistics related to a particular bucket""" - if not os.path.isfile("%s/cbstats" % bin_dir): - return - cli = ("%s/cbstats" % bin_dir) - try: - ts = time.time() - stats = subprocess.check_output([cli, "localhost:11211", "-b", bucket, - "all"]) - except subprocess.CalledProcessError: - return - for stat in stats.splitlines(): - metric = stat.split(":")[0].lstrip(" ") - value = stat.split(":")[1].lstrip(" \t") - if metric in KEYS: - print("couchbase.%s %i %s bucket=%s" % (metric, ts, value, bucket)) + """Returns statistics related to a particular bucket""" + if not os.path.isfile("%s/cbstats" % bin_dir): + return + cli = ("%s/cbstats" % bin_dir) + try: + ts = time.time() + stats = subprocess.check_output([cli, "localhost:11211", "-b", bucket, + "all"]) + except subprocess.CalledProcessError: + return + for stat in stats.splitlines(): + metric = stat.split(":")[0].lstrip(" ") + value = stat.split(":")[1].lstrip(" \t") + if metric in KEYS: + print("couchbase.%s %i %s bucket=%s" % (metric, ts, value, bucket)) + def main(): - utils.drop_privileges() - pid = find_couchbase_pid() - if not pid: - utils.err("Error: Either couchbase-server is not running or file (%s)" - " doesn't exist" % COUCHBASE_INITFILE) - return 13 - - conf_file = find_conf_file(pid) - if not conf_file: - utils.err("Error: Can't find config file (%s)" % conf_file) - return 13 - - bin_dir = find_bindir_path(conf_file) - if not bin_dir: - utils.err("Error: Can't find bindir path in config file") - return 13 - - while True: - # Listing bucket everytime so as to start collecting datapoints - # of any new bucket. 
- buckets = list_bucket(bin_dir) - for b in buckets: - collect_stats(bin_dir, b) - time.sleep(COLLECTION_INTERVAL) + utils.drop_privileges() + pid = find_couchbase_pid() + if not pid: + utils.err("Error: Either couchbase-server is not running or file (%s)" + " doesn't exist" % COUCHBASE_INITFILE) + return 13 # ask tcollector to not respawn us + + conf_file = find_conf_file(pid) + if not conf_file: + utils.err("Error: Can't find config file (%s)" % conf_file) + return 13 + + bin_dir = find_bindir_path(conf_file) + if not bin_dir: + utils.err("Error: Can't find bindir path in config file") + return 13 + + while True: + # Listing bucket everytime so as to start collecting datapoints + # of any new bucket. + buckets = list_bucket(bin_dir) + for b in buckets: + collect_stats(bin_dir, b) + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/dfstat.py b/collectors/0/dfstat.py index fe4d7f2f..57b0b565 100755 --- a/collectors/0/dfstat.py +++ b/collectors/0/dfstat.py @@ -39,115 +39,115 @@ # File system types to ignore FSTYPE_IGNORE = frozenset([ - "cgroup", - "debugfs", - "devtmpfs", - "nfs", - "rpc_pipefs", - "rootfs", + "cgroup", + "debugfs", + "devtmpfs", + "nfs", + "rpc_pipefs", + "rootfs", ]) + def main(): - """dfstats main loop""" - try: - f_mounts = open("/proc/mounts", "r") - except IOError as e: - utils.err("error: can't open /proc/mounts: %s" % e) - return 13 # Ask tcollector to not respawn us - - utils.drop_privileges() - - while True: - devices = [] - f_mounts.seek(0) - ts = int(time.time()) - - for line in f_mounts: - # Docs come from the fstab(5) - # fs_spec # Mounted block special device or remote filesystem - # fs_file # Mount point - # fs_vfstype # File system type - # fs_mntops # Mount options - # fs_freq # Dump(8) utility flags - # fs_passno # Order in which filesystem checks are done at reboot time - try: - fs_spec, fs_file, fs_vfstype, fs_mntops, fs_freq, fs_passno = line.split(None) - except ValueError as e: - utils.err("error: can't parse line at /proc/mounts: %s" % e) - continue - - if fs_spec == "none": - continue - elif fs_vfstype in FSTYPE_IGNORE or fs_vfstype.startswith("fuse."): - continue - # startswith(tuple) avoided to preserve support of Python 2.4 - elif fs_file.startswith("/dev") or fs_file.startswith("/sys") or \ - fs_file.startswith("/proc") or fs_file.startswith("/lib") or \ - fs_file.startswith("net:") or fs_file.startswith("/var/lib/kubelet"): - continue - - # keep /dev/xxx device with shorter fs_file (remove mount binds) - device_found = False - if fs_spec.startswith("/dev"): + """dfstats main loop""" + try: + f_mounts = open("/proc/mounts", "r") + except IOError as e: + utils.err("error: can't open /proc/mounts: %s" % e) + return 13 # Ask tcollector to not respawn us + + utils.drop_privileges() + + while True: + devices = [] + f_mounts.seek(0) + ts = int(time.time()) + + for line in f_mounts: + # Docs come from the fstab(5) + # fs_spec # Mounted block special device or remote filesystem + # fs_file # Mount point + # fs_vfstype # File system type + # fs_mntops # Mount options + # fs_freq # Dump(8) utility flags + # fs_passno # Order in which filesystem checks are done at reboot time + try: + fs_spec, fs_file, fs_vfstype, fs_mntops, fs_freq, fs_passno = line.split(None) + except ValueError as e: + utils.err("error: can't parse line at /proc/mounts: %s" % e) + continue + + if fs_spec == "none": + continue + elif fs_vfstype in FSTYPE_IGNORE or 
fs_vfstype.startswith("fuse."): + continue + # startswith(tuple) avoided to preserve support of Python 2.4 + elif fs_file.startswith("/dev") or fs_file.startswith("/sys") or \ + fs_file.startswith("/proc") or fs_file.startswith("/lib") or \ + fs_file.startswith("net:") or fs_file.startswith("/var/lib/kubelet"): + continue + + # keep /dev/xxx device with shorter fs_file (remove mount binds) + device_found = False + if fs_spec.startswith("/dev"): + for device in devices: + if fs_spec == device[0]: + device_found = True + if len(fs_file) < len(device[1]): + device[1] = fs_file + break + if not device_found: + devices.append([fs_spec, fs_file, fs_vfstype]) + else: + devices.append([fs_spec, fs_file, fs_vfstype]) + for device in devices: - if fs_spec == device[0]: - device_found = True - if len(fs_file) < len(device[1]): - device[1] = fs_file - break - if not device_found: - devices.append([fs_spec, fs_file, fs_vfstype]) - else: - devices.append([fs_spec, fs_file, fs_vfstype]) - - - for device in devices: - fs_spec, fs_file, fs_vfstype = device - try: - r = os.statvfs(fs_file) - except OSError as e: - utils.err("can't get info for mount point: %s: %s" % (fs_file, e)) - continue - - used = r.f_blocks - r.f_bfree - - # conditional expression avoided to preserve support of Python 2.4 - # percent_used = 100 if r.f_blocks == 0 else used * 100.0 / (used + r.f_bavail) - if r.f_blocks == 0: - percent_used = 100 - else: - percent_used = used * 100.0 / (used + r.f_bavail) - - print("df.bytes.total %d %s mount=%s fstype=%s" - % (ts, r.f_frsize * r.f_blocks, fs_file, fs_vfstype)) - print("df.bytes.used %d %s mount=%s fstype=%s" - % (ts, r.f_frsize * used, fs_file, fs_vfstype)) - print("df.bytes.percentused %d %s mount=%s fstype=%s" - % (ts, percent_used, fs_file, fs_vfstype)) - print("df.bytes.free %d %s mount=%s fstype=%s" - % (ts, r.f_frsize * r.f_bavail, fs_file, fs_vfstype)) - - used = r.f_files - r.f_ffree - - # percent_used = 100 if r.f_files == 0 else used * 100.0 / r.f_files - if r.f_files == 0: - percent_used = 100 - else: - percent_used = used * 100.0 / r.f_files - - print("df.inodes.total %d %s mount=%s fstype=%s" - % (ts, r.f_files, fs_file, fs_vfstype)) - print("df.inodes.used %d %s mount=%s fstype=%s" - % (ts, used, fs_file, fs_vfstype)) - print("df.inodes.percentused %d %s mount=%s fstype=%s" - % (ts, percent_used, fs_file, fs_vfstype)) - print("df.inodes.free %d %s mount=%s fstype=%s" - % (ts, r.f_ffree, fs_file, fs_vfstype)) - - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) + fs_spec, fs_file, fs_vfstype = device + try: + r = os.statvfs(fs_file) + except OSError as e: + utils.err("can't get info for mount point: %s: %s" % (fs_file, e)) + continue + + used = r.f_blocks - r.f_bfree + + # conditional expression avoided to preserve support of Python 2.4 + # percent_used = 100 if r.f_blocks == 0 else used * 100.0 / (used + r.f_bavail) + if r.f_blocks == 0: + percent_used = 100 + else: + percent_used = used * 100.0 / (used + r.f_bavail) + + print("df.bytes.total %d %s mount=%s fstype=%s" + % (ts, r.f_frsize * r.f_blocks, fs_file, fs_vfstype)) + print("df.bytes.used %d %s mount=%s fstype=%s" + % (ts, r.f_frsize * used, fs_file, fs_vfstype)) + print("df.bytes.percentused %d %s mount=%s fstype=%s" + % (ts, percent_used, fs_file, fs_vfstype)) + print("df.bytes.free %d %s mount=%s fstype=%s" + % (ts, r.f_frsize * r.f_bavail, fs_file, fs_vfstype)) + + used = r.f_files - r.f_ffree + + # percent_used = 100 if r.f_files == 0 else used * 100.0 / r.f_files + if r.f_files == 0: + percent_used = 100 + 
else: + percent_used = used * 100.0 / r.f_files + + print("df.inodes.total %d %s mount=%s fstype=%s" + % (ts, r.f_files, fs_file, fs_vfstype)) + print("df.inodes.used %d %s mount=%s fstype=%s" + % (ts, used, fs_file, fs_vfstype)) + print("df.inodes.percentused %d %s mount=%s fstype=%s" + % (ts, percent_used, fs_file, fs_vfstype)) + print("df.inodes.free %d %s mount=%s fstype=%s" + % (ts, r.f_ffree, fs_file, fs_vfstype)) + + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main()) + sys.stdin.close() + sys.exit(main()) diff --git a/collectors/0/docker.py b/collectors/0/docker.py index 422b0386..e137c113 100755 --- a/collectors/0/docker.py +++ b/collectors/0/docker.py @@ -17,13 +17,13 @@ CONFIG = docker_conf.get_config() COLLECTION_INTERVAL = CONFIG['interval'] -CGROUP_PATH =CONFIG['cgroup_path'] +CGROUP_PATH = CONFIG['cgroup_path'] ENABLED = docker_conf.enabled() DOCKER_SOCK = CONFIG['socket_path'] if not ENABLED: - sys.stderr.write("Docker collector is not enabled") - sys.exit(13) + utils.err("Docker collector is not enabled") + sys.exit(13) # proc_names example: # $ cat cpuacct.stat @@ -54,6 +54,7 @@ ), } + def getnameandimage(containerid): # Retrieve container json configuration file @@ -61,7 +62,7 @@ def getnameandimage(containerid): sock.settimeout(5) try: r = sock.connect_ex(DOCKER_SOCK) - if (r != 0): + if r != 0: print("Can not connect to %s" % (DOCKER_SOCK), file=sys.stderr) else: message = 'GET /containers/' + containerid + '/json HTTP/1.1\r\nHost: http\n\n' @@ -92,6 +93,7 @@ def getnameandimage(containerid): except socket.timeout as e: print("Socket: %s" % (e,), file=sys.stderr) + def senddata(datatosend, containerid): if datatosend: datatosend += " containerid="+containerid @@ -102,6 +104,7 @@ def senddata(datatosend, containerid): print("docker.%s" % datatosend) sys.stdout.flush() + def readdockerstats(path, containerid): # update containername and containerimage if needed @@ -164,20 +167,21 @@ def readdockerstats(path, containerid): senddata("%s %d %s" % (datatosend, ts, count), containerid) f_stat.close() + def main(): """docker_cpu main loop""" global containernames global containerimages utils.drop_privileges() - cache=0 + cache = 0 while True: # Connect to Docker socket to get informations about containers every 4 times - if (cache == 0): + if cache == 0: containernames={} containerimages={} cache += 1 - if (cache == 4): + if cache == 4: cache = 0 if os.path.isdir(CGROUP_PATH): @@ -207,5 +211,6 @@ def main(): readdockerstats(CGROUP_PATH + "/lxc/"+level1, level1) time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/docker_engine.py b/collectors/0/docker_engine.py index 9ea4b812..d82d6993 100755 --- a/collectors/0/docker_engine.py +++ b/collectors/0/docker_engine.py @@ -16,6 +16,7 @@ from __future__ import print_function import sys +from collectors.lib import utils from collectors.etc import docker_engine_conf from collectors.lib.docker_engine.docker_metrics import DockerMetrics @@ -26,8 +27,8 @@ def main(): if not ENABLED: - sys.stderr.write("Docker-engine collector is not enabled") - sys.exit(13) + utils.err("Docker-engine collector is not enabled") + return 13 # ask tcollector to not respawn us """docker_cpu main loop""" cli = DockerMetrics(METRICS_PATH) diff --git a/collectors/0/elasticsearch.py b/collectors/0/elasticsearch.py index 8329be75..afd6b056 100755 --- a/collectors/0/elasticsearch.py +++ b/collectors/0/elasticsearch.py @@ -16,10 +16,7 @@ # Tested 
with ES 0.16.5, 0.17.x, 0.90.1 . import errno -try: - import json -except ImportError: - json = None # Handled gracefully in main. Not available by default in <2.6 +import json import socket import sys import threading @@ -28,212 +25,213 @@ from collectors.lib import utils from collectors.etc import elasticsearch_conf - -try: - from http.client import HTTPConnection, OK -except ImportError: - from httplib import HTTPConnection, OK - +from http.client import HTTPConnection, OK COLLECTION_INTERVAL = 15 # seconds -DEFAULT_TIMEOUT = 10.0 # seconds +DEFAULT_TIMEOUT = 10.0 # seconds # regexes to separate differences in version numbers PRE_VER1 = re.compile(r'^0\.') VER1 = re.compile(r'^1\.') STATUS_MAP = { - "green": 0, - "yellow": 1, - "red": 2, + "green": 0, + "yellow": 1, + "red": 2, } class ESError(RuntimeError): - """Exception raised if we don't get a 200 OK from ElasticSearch.""" + """Exception raised if we don't get a 200 OK from ElasticSearch.""" - def __init__(self, resp): - RuntimeError.__init__(self, str(resp)) - self.resp = resp + def __init__(self, resp): + RuntimeError.__init__(self, str(resp)) + self.resp = resp -def request(server, uri, json_in = True): - """Does a GET request of the given uri on the given HTTPConnection.""" - server.request("GET", uri) - resp = server.getresponse() - if resp.status != OK: - raise ESError(resp) - if json_in: - return json.loads(resp.read()) - else: - return resp.read() +def request(server, uri, json_in=True): + """Does a GET request of the given uri on the given HTTPConnection.""" + server.request("GET", uri) + resp = server.getresponse() + if resp.status != OK: + raise ESError(resp) + if json_in: + return json.loads(resp.read()) + else: + return resp.read() def cluster_health(server): - return request(server, "/_cluster/health") + return request(server, "/_cluster/health") def cluster_stats(server): - return request(server, "/_cluster/stats") + return request(server, "/_cluster/stats") def cluster_master_node(server): - return request(server, "/_cat/master", json_in = False).split()[0] + return request(server, "/_cat/master", json_in=False).split()[0] def index_stats(server): - return request(server, "/_cat/indices?v&bytes=b", json_in = False) + return request(server, "/_cat/indices?v&bytes=b", json_in=False) def node_status(server): - return request(server, "/") + return request(server, "/") def node_stats(server, version): - # API changed in v1.0 - if PRE_VER1.match(version): - url = "/_cluster/nodes/_local/stats" - # elif VER1.match(version): - # url = "/_nodes/_local/stats" - else: - url = "/_nodes/_local/stats" - return request(server, url) + # API changed in v1.0 + if PRE_VER1.match(version): + url = "/_cluster/nodes/_local/stats" + # elif VER1.match(version): + # url = "/_nodes/_local/stats" + else: + url = "/_nodes/_local/stats" + return request(server, url) + def printmetric(metric, ts, value, tags): - # Warning, this should be called inside a lock - if tags: - tags = " " + " ".join("%s=%s" % (name.replace(" ",""), value.replace(" ","")) - for name, value in tags.items()) - else: - tags = "" - # Convert any bool values to int, as opentsdb only accepts int or float. - if isinstance(value, bool): - value = int(value) - print("%s %d %s %s" - % (metric, ts, value, tags)) + # Warning, this should be called inside a lock + if tags: + tags = " " + " ".join("%s=%s" % (name.replace(" ", ""), value.replace(" ", "")) + for name, value in tags.items()) + else: + tags = "" + # Convert any bool values to int, as opentsdb only accepts int or float. 
+ if isinstance(value, bool): + value = int(value) + print("%s %d %s %s" + % (metric, ts, value, tags)) + def _traverse(metric, stats, ts, tags): - """ - Recursively traverse the json tree and print out leaf numeric values - Please make sure you call this inside a lock and don't add locking - inside this function - """ - #print metric,stats,ts,tags - if isinstance(stats,dict): - if "timestamp" in stats: - ts = stats["timestamp"] / 1000 # ms -> s - for key in stats.keys(): - if key != "timestamp": - _traverse(metric + "." + key, stats[key], ts, tags) - if isinstance(stats, (list, set, tuple)): - count = 0 - for value in stats: - _traverse(metric + "." + str(count), value, ts, tags) - count += 1 - if utils.is_numeric(stats) and not isinstance(stats, bool): - if isinstance(stats, int): - stats = int(stats) - printmetric(metric, ts, stats, tags) - return + """ + Recursively traverse the json tree and print out leaf numeric values + Please make sure you call this inside a lock and don't add locking + inside this function + """ + # print metric,stats,ts,tags + if isinstance(stats, dict): + if "timestamp" in stats: + ts = stats["timestamp"] / 1000 # ms -> s + for key in stats.keys(): + if key != "timestamp": + _traverse(metric + "." + key, stats[key], ts, tags) + if isinstance(stats, (list, set, tuple)): + count = 0 + for value in stats: + _traverse(metric + "." + str(count), value, ts, tags) + count += 1 + if utils.is_numeric(stats) and not isinstance(stats, bool): + if isinstance(stats, int): + stats = int(stats) + printmetric(metric, ts, stats, tags) + return + def _collect_indices(server, metric, tags, lock): - ts = int(time.time()) - rawtable = index_stats(server).split("\n") - header = rawtable.pop(0).strip() - headerlist = [x.strip() for x in header.split()] - for line in rawtable: - # Copy the cluster tag - newtags = {"cluster": tags["cluster"]} - # Now parse each input - values = line.split() - count = 0 - for value in values: - try: - value = float(value) - if int(value) == value: - value = int(value) - # now print value - with lock: - printmetric(metric + ".cluster.byindex." + headerlist[count], ts, value, newtags) - except ValueError: - # add this as a tag - newtags[headerlist[count]] = value - count += 1 + ts = int(time.time()) + rawtable = index_stats(server).split("\n") + header = rawtable.pop(0).strip() + headerlist = [x.strip() for x in header.split()] + for line in rawtable: + # Copy the cluster tag + newtags = {"cluster": tags["cluster"]} + # Now parse each input + values = line.split() + count = 0 + for value in values: + try: + value = float(value) + if int(value) == value: + value = int(value) + # now print value + with lock: + printmetric(metric + ".cluster.byindex." + headerlist[count], ts, value, newtags) + except ValueError: + # add this as a tag + newtags[headerlist[count]] = value + count += 1 + def _collect_master(server, nodeid, metric, tags, lock): - ts = int(time.time()) - chealth = cluster_health(server) - if "status" in chealth: + ts = int(time.time()) + chealth = cluster_health(server) + if "status" in chealth: + with lock: + printmetric(metric + ".cluster.status", ts, + STATUS_MAP.get(chealth["status"], -1), tags) with lock: - printmetric(metric + ".cluster.status", ts, - STATUS_MAP.get(chealth["status"], -1), tags) - with lock: - _traverse(metric + ".cluster", chealth, ts, tags) + _traverse(metric + ".cluster", chealth, ts, tags) + + ts = int(time.time()) # In case last call took a while. 
+ cstats = cluster_stats(server) + with lock: + _traverse(metric + ".cluster", cstats, ts, tags) - ts = int(time.time()) # In case last call took a while. - cstats = cluster_stats(server) - with lock: - _traverse(metric + ".cluster", cstats, ts, tags) def _collect_server(server, version, lock): - ts = int(time.time()) - rootmetric = "elasticsearch" - nstats = node_stats(server, version) - cluster_name = nstats["cluster_name"] - nodeid, nstats = nstats["nodes"].popitem() - node_name = nstats["name"] - tags = {"cluster": cluster_name, "node": node_name} - #tags.update(nstats["attributes"]) - - if nodeid == cluster_master_node(server): - is_master = 1 - else: - is_master = 0 - with lock: - printmetric(rootmetric + ".is_master", ts, is_master, tags) - if is_master: - _collect_master(server, nodeid, rootmetric, tags, lock) - - _collect_indices(server, rootmetric, tags, lock) - - with lock: - _traverse(rootmetric, nstats, ts, tags) + ts = int(time.time()) + rootmetric = "elasticsearch" + nstats = node_stats(server, version) + cluster_name = nstats["cluster_name"] + nodeid, nstats = nstats["nodes"].popitem() + node_name = nstats["name"] + tags = {"cluster": cluster_name, "node": node_name} + # tags.update(nstats["attributes"]) + + if nodeid == cluster_master_node(server): + is_master = 1 + else: + is_master = 0 + with lock: + printmetric(rootmetric + ".is_master", ts, is_master, tags) + if is_master: + _collect_master(server, nodeid, rootmetric, tags, lock) + + _collect_indices(server, rootmetric, tags, lock) + + with lock: + _traverse(rootmetric, nstats, ts, tags) def main(argv): - utils.drop_privileges() - socket.setdefaulttimeout(DEFAULT_TIMEOUT) - servers = [] - - if json is None: - utils.err("This collector requires the `json' Python module.") - return 1 - - for conf in elasticsearch_conf.get_servers(): - server = HTTPConnection( *conf ) - try: - server.connect() - except socket.error as exc: - if exc.errno == errno.ECONNREFUSED: - continue - raise - servers.append( server ) - - if len( servers ) == 0: - return 13 # No ES running, ask tcollector to not respawn us. - - lock = threading.Lock() - while True: - threads = [] - for server in servers: - status = node_status(server) - version = status["version"]["number"] - t = threading.Thread(target = _collect_server, args = (server, version, lock)) - t.start() - threads.append(t) - for thread in threads: - thread.join() - time.sleep(COLLECTION_INTERVAL) + utils.drop_privileges() + socket.setdefaulttimeout(DEFAULT_TIMEOUT) + servers = [] + + if json is None: + utils.err("This collector requires the `json' Python module.") + return 13 + + for conf in elasticsearch_conf.get_servers(): + server = HTTPConnection(*conf) + try: + server.connect() + except socket.error as exc: + if exc.errno == errno.ECONNREFUSED: + continue + raise + servers.append(server) + + if len(servers) == 0: + return 13 # No ES running, ask tcollector to not respawn us. 
+ + lock = threading.Lock() + while True: + threads = [] + for server in servers: + status = node_status(server) + version = status["version"]["number"] + t = threading.Thread(target=_collect_server, args=(server, version, lock)) + t.start() + threads.append(t) + for thread in threads: + thread.join() + time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) diff --git a/collectors/0/flume.py b/collectors/0/flume.py index d7202cca..521d1ff8 100755 --- a/collectors/0/flume.py +++ b/collectors/0/flume.py @@ -31,108 +31,102 @@ from __future__ import print_function import errno -try: - import json -except ImportError: - json = None # Handled gracefully in main. Not available by default in <2.6 +import json import socket import sys import time +from http.client import HTTPConnection, OK from collectors.lib import utils try: - from collectors.etc import flume_conf -except ImportError: - flume_conf = None - -try: - from http.client import HTTPConnection, OK + from collectors.etc import flume_conf except ImportError: - from httplib import HTTPConnection, OK + flume_conf = None COLLECTION_INTERVAL = 15 # seconds -DEFAULT_TIMEOUT = 10.0 # seconds +DEFAULT_TIMEOUT = 10.0 # seconds FLUME_HOST = "localhost" FLUME_PORT = 34545 # Exclude values that are not really metrics and totally pointless to keep track of -EXCLUDE = [ 'StartTime', 'StopTime', 'Type' ] +EXCLUDE = ['StartTime', 'StopTime', 'Type'] -def err(msg): - print(msg, file=sys.stderr) class FlumeError(RuntimeError): - """Exception raised if we don't get a 200 OK from Flume webserver.""" - def __init__(self, resp): - RuntimeError.__init__(self, str(resp)) - self.resp = resp + """Exception raised if we don't get a 200 OK from Flume webserver.""" + + def __init__(self, resp): + RuntimeError.__init__(self, str(resp)) + self.resp = resp + def request(server, uri): - """Does a GET request of the given uri on the given HTTPConnection.""" - server.request("GET", uri) - resp = server.getresponse() - if resp.status != OK: - raise FlumeError(resp) - return json.loads(resp.read()) + """Does a GET request of the given uri on the given HTTPConnection.""" + server.request("GET", uri) + resp = server.getresponse() + if resp.status != OK: + raise FlumeError(resp) + return json.loads(resp.read()) def flume_metrics(server): - return request(server, "/metrics") + return request(server, "/metrics") + def main(argv): - if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()): - sys.exit(13) - - settings = flume_conf.get_settings() - - if (settings['default_timeout']): - DEFAULT_TIMEOUT = settings['default_timeout'] - - if (settings['default_timeout']): - COLLECTION_INTERVAL = settings['collection_interval'] - - if (settings['flume_host']): - FLUME_HOST = settings['flume_host'] - - if (settings['flume_port']): - FLUME_PORT = settings['flume_port'] - - utils.drop_privileges() - socket.setdefaulttimeout(DEFAULT_TIMEOUT) - server = HTTPConnection(FLUME_HOST, FLUME_PORT) - try: - server.connect() - except socket.error as exc: - if exc.errno == errno.ECONNREFUSED: - return 13 # No Flume server available, ask tcollector to not respawn us. 
- raise - if json is None: - err("This collector requires the `json' Python module.") - return 1 - - def printmetric(metric, value, **tags): - if tags: - tags = " " + " ".join("%s=%s" % (name, value) - for name, value in tags.items()) - else: - tags = "" - print(("flume.%s %d %s %s" % (metric, ts, value, tags))) - - while True: - # Get the metrics - ts = int(time.time()) # In case last call took a while. - stats = flume_metrics(server) - - for metric in stats: - (component, name) = metric.split(".") - tags = {component.lower(): name} - for key,value in stats[metric].items(): - if key not in EXCLUDE: - printmetric(key.lower(), value, **tags) - - time.sleep(COLLECTION_INTERVAL) + if not (flume_conf and flume_conf.enabled() and flume_conf.get_settings()): + sys.exit(13) + + settings = flume_conf.get_settings() + + if settings['default_timeout']: + DEFAULT_TIMEOUT = settings['default_timeout'] + + if settings['default_timeout']: + COLLECTION_INTERVAL = settings['collection_interval'] + + if settings['flume_host']: + FLUME_HOST = settings['flume_host'] + + if settings['flume_port']: + FLUME_PORT = settings['flume_port'] + + utils.drop_privileges() + socket.setdefaulttimeout(DEFAULT_TIMEOUT) + server = HTTPConnection(FLUME_HOST, FLUME_PORT) + try: + server.connect() + except socket.error as exc: + if exc.errno == errno.ECONNREFUSED: + return 13 # No Flume server available, ask tcollector to not respawn us. + raise + if json is None: + utils.err("This collector requires the `json' Python module.") + return 13 # ask tcollector to not respawn us + + def printmetric(metric, value, **tags): + if tags: + tags = " " + " ".join("%s=%s" % (name, value) + for name, value in tags.items()) + else: + tags = "" + print(("flume.%s %d %s %s" % (metric, ts, value, tags))) + + while True: + # Get the metrics + ts = int(time.time()) # In case last call took a while. 
+ stats = flume_metrics(server) + + for metric in stats: + (component, name) = metric.split(".") + tags = {component.lower(): name} + for key, value in stats[metric].items(): + if key not in EXCLUDE: + printmetric(key.lower(), value, **tags) + + time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) diff --git a/collectors/0/g1gc.py b/collectors/0/g1gc.py index 593db037..26f88307 100755 --- a/collectors/0/g1gc.py +++ b/collectors/0/g1gc.py @@ -70,7 +70,6 @@ import traceback from datetime import datetime, timedelta -from subprocess import Popen, PIPE from collectors.lib import utils from collectors.etc import g1gc_conf @@ -128,11 +127,13 @@ } + # Utilities def get_file_end(file_handler): file_handler.seek(0, 2) return file_handler.tell() + def get_latest_gc_log(log_dir, log_name_pattern): sorted_gc_logs = sorted(glob.glob(os.path.join(log_dir, log_name_pattern))) if len(sorted_gc_logs) == 0: @@ -140,25 +141,30 @@ def get_latest_gc_log(log_dir, log_name_pattern): log_dir + '" with pattern: "' + log_name_pattern + '"') return sorted_gc_logs[-1] + def true_unix_timestamp(year, month, day, hour, minute, second, timezone): d = datetime(year, month, day, hour, minute, second) - timedelta(seconds=36 * timezone) return calendar.timegm(d.utctimetuple()) + def to_size_in_mb(data_size, unit): '''Convert size in given unit: GB or B to size in MB ''' if unit == 'G': return data_size * 1024 elif unit == 'B': return data_size / (1024 * 1024.0) else: return data_size + def match_pattern(line): for pattern_name, pattern in pattern_map.items(): m = pattern.match(line) if m: return (pattern_name, m) return (None, None) + def sec2milli(seconds): return 1000 * seconds + def flush_collector(collector): for metric_name, value in collector['data'].items(): print(metric_name % (collector['timestamp'], value)) @@ -166,6 +172,7 @@ def flush_collector(collector): collector['timestamp'] = None collector['data'] = {} + def collect_metric(metric_name, timestamp, value, collector): if collector['timestamp'] != timestamp: flush_collector(collector) @@ -173,6 +180,7 @@ def collect_metric(metric_name, timestamp, value, collector): collector['timestamp'] = timestamp collector['data'][metric_name] = collector['data'].get(metric_name, 0) + value + def collect_metric_with_prefix(prefix, metric_name, timestamp, value, collector): new_metric_name = metric_name p = '' if prefix is None else prefix.strip() @@ -180,38 +188,47 @@ def collect_metric_with_prefix(prefix, metric_name, timestamp, value, collector) new_metric_name = '.'.join([p, metric_name]) collect_metric(new_metric_name, timestamp, value, collector) + def unmatched_gc_log(line): pass + # Simple gc events, don't have inner gc events def concurrent_cleanup_handler(prefix, log_line, timestamp, collector, file_handler): concurrent_clean_up_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.concurrent_cleanup %s %s", timestamp, concurrent_clean_up_time, collector) + def concurrent_mark_handler(prefix, log_line, timestamp, collector, file_handler): concurrent_mark_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.concurrent_mark %s %s", timestamp, concurrent_mark_time, collector) + def concurrent_root_region_scan_handler(prefix, log_line, timestamp, collector, file_handler): concurrent_root_region_scan_time = 
sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.concurrent_root_region_scan %s %s", timestamp, concurrent_root_region_scan_time, collector) + def cleanup_handler(prefix, log_line, timestamp, collector, file_handler): clean_up_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=cleanup", timestamp, clean_up_time, collector) + def fullgc_handler(prefix, log_line, timestamp, collector, file_handler): full_gc_time = sec2milli(float(pattern_map[GC_PAUSE_PATTERN].match(log_line).group(1))) collect_metric_with_prefix(prefix, "gc.g1.fullgc.duration %s %s", timestamp, full_gc_time, collector) + # Inner gc events, which we should have a matcher object def parallel_time_handler(prefix, matcher, timestamp, collector, file_handler): parallel_time, num_of_gc_workers = float(matcher.group(1)), float(matcher.group(2)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=parallel-time", timestamp, parallel_time, collector) + def object_copy_handler(prefix, matcher, timestamp, collector, file_handler): min_time, avg_time, max_time = [float(matcher.group(i)) for i in range(1, 4)] collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=object-copy", timestamp, avg_time, collector) + def allocation_handler(prefix, matcher, timestamp, collector, file_handler): eden_before_in_size, eden_after_in_size = matcher.group(2), matcher.group(4) eden_before = to_size_in_mb(float(matcher.group(1)), eden_before_in_size) @@ -236,30 +253,37 @@ def allocation_handler(prefix, matcher, timestamp, collector, file_handler): collector['gensize']['survivor'] = survivor_after collector['gensize']['heap'] = heap_after_in_mb + def free_cset_handler(prefix, matcher, timestamp, collector, file_handler): free_cset_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=free-cset", timestamp, free_cset_time, collector) + def ref_enq_handler(prefix, matcher, timestamp, collector, file_handler): ref_enq_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=ref-enq", timestamp, ref_enq_time, collector) + def ref_proc_handler(prefix, matcher, timestamp, collector, file_handler): ref_proc_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=ref-proc", timestamp, ref_proc_time, collector) + def choose_cset_handler(prefix, matcher, timestamp, collector, file_handler): choose_cset_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=choose-cset", timestamp, choose_cset_time, collector) + def clear_ct_handler(prefix, matcher, timestamp, collector, file_handler): clear_ct_time = float(matcher.group(1)) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=clear-ct", timestamp, clear_ct_time, collector) + def scan_rs_handler(prefix, matcher, timestamp, collector, file_handler): min_time, avg_time, max_time = [float(matcher.group(i)) for i in range(1, 4)] collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=scan-rs", timestamp, avg_time, collector) + # Complex GC events: initial-mark, young-pause, mixed-pause and remark # These GC events contains several inner gc events and we must call match_remaining_log to parse remaining gc events def initial_mark_handler(prefix, log_line, timestamp, collector, file_handler): @@ -268,24 +292,28 @@ def initial_mark_handler(prefix, log_line, timestamp, collector, 
file_handler): collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=initial-mark", timestamp, initial_mark_pause_time, collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def young_pause_handler(prefix, log_line, timestamp, collector, file_handler): m = pattern_map[GC_PAUSE_PATTERN].match(log_line) young_pause_time = sec2milli(float(m.group(1))) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=young-pause", timestamp, young_pause_time, collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def mixed_pause_handler(prefix, log_line, timestamp, collector, file_handler): m = pattern_map[GC_PAUSE_PATTERN].match(log_line) mixed_pause_time = sec2milli(float(m.group(1))) collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=mixed-pause", timestamp, mixed_pause_time, collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def remark_handler(prefix, log_line, timestamp, collector, file_handler): m = pattern_map[REMARK_PATTERN].match(log_line) ref_process_time, remark_time = [sec2milli(float(m.group(i))) for i in range(1, 3)] collect_metric_with_prefix(prefix, "gc.g1.duration %s %s phase=remark", timestamp, remark_time, collector) match_remaining_log(prefix, timestamp, collector, file_handler) + def match_remaining_log(prefix, timestamp, collector, file_handler): while True: line = file_handler.readline() @@ -304,9 +332,11 @@ def match_remaining_log(prefix, timestamp, collector, file_handler): elif pattern_name == CLEAR_CT_PATTERN: clear_ct_handler(prefix, matcher, timestamp, collector, file_handler) else: unmatched_gc_log(line) + def isPause(type, cause): return 'GC pause' in cause and type in cause + def process_gc_record(prefix, file_handler, timestamp, cause, collector): # process simple gc events if 'concurrent-cleanup-end' in cause: concurrent_cleanup_handler(prefix, cause, timestamp, collector, file_handler) @@ -332,6 +362,7 @@ def process_gc_record(prefix, file_handler, timestamp, cause, collector): remark_handler(prefix, cause, timestamp, collector, file_handler) elif cause[-1] == ']': return + def process_gc_log(collector): prefix = collector['prefix'] @@ -385,6 +416,7 @@ def process_gc_log(collector): return 0 + def main(): interval = g1gc_conf.get_interval() @@ -407,5 +439,7 @@ def main(): sys.stdout.flush() time.sleep(interval) + if __name__ == '__main__': - exit(main()) + sys.exit(main()) + diff --git a/collectors/0/graphite_bridge.py b/collectors/0/graphite_bridge.py index 17cfde35..01664e11 100755 --- a/collectors/0/graphite_bridge.py +++ b/collectors/0/graphite_bridge.py @@ -19,25 +19,24 @@ from collectors.lib import utils import threading -try: - from socketserver import ThreadingTCPServer, BaseRequestHandler -except ImportError: - from SocketServer import ThreadingTCPServer, BaseRequestHandler +from socketserver import ThreadingTCPServer, BaseRequestHandler try: - from collectors.etc import graphite_bridge_conf + from collectors.etc import graphite_bridge_conf except ImportError: - graphite_bridge_conf = None + graphite_bridge_conf = None HOST = '127.0.0.1' PORT = 2003 SIZE = 8192 + class GraphiteServer(ThreadingTCPServer): allow_reuse_address = True print_lock = threading.Lock() + class GraphiteHandler(BaseRequestHandler): def handle_line(self, line): @@ -48,7 +47,6 @@ def handle_line(self, line): else: print(line_parts[0], line_parts[2], line_parts[1]) - def handle(self): data = '' while True: @@ -69,7 +67,7 @@ def handle(self): def main(): if not (graphite_bridge_conf and 
graphite_bridge_conf.enabled()): - sys.exit(13) + return 13 # ask tcollector to not respawn us utils.drop_privileges() server = GraphiteServer((HOST, PORT), GraphiteHandler) @@ -80,7 +78,6 @@ def main(): server.shutdown() server.server_close() -if __name__ == "__main__": - main() -sys.exit(0) +if __name__ == "__main__": + sys.exit(main()) diff --git a/collectors/0/gstat.py b/collectors/0/gstat.py index 0904ec22..37728095 100755 --- a/collectors/0/gstat.py +++ b/collectors/0/gstat.py @@ -58,19 +58,21 @@ except ImportError: gstat_conf = None -DEFAULT_COLLECTION_INTERVAL=15 - +DEFAULT_COLLECTION_INTERVAL = 15 signal_received = None + + def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """top main loop""" - collection_interval=DEFAULT_COLLECTION_INTERVAL - collection_filter=".*" - if(gstat_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + collection_filter = ".*" + if gstat_conf: config = gstat_conf.get_config() collection_interval=config['collection_interval'] collection_filter=config['collection_filter'] @@ -88,7 +90,7 @@ def main(): except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + return 13 # ask tcollector to not respawn us raise timestamp = 0 @@ -138,5 +140,6 @@ def main(): pass p_gstat.wait() + if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/collectors/0/hadoop_datanode.py b/collectors/0/hadoop_datanode.py index 45ac7318..56bf87fa 100755 --- a/collectors/0/hadoop_datanode.py +++ b/collectors/0/hadoop_datanode.py @@ -23,6 +23,7 @@ from collectors.lib import utils from collectors.lib.hadoop_http import HadoopHttp +COLLECTION_INTERVAL = 15 REPLACEMENTS = { "datanodeactivity-": ["activity"], @@ -57,14 +58,13 @@ def main(args): utils.drop_privileges() if json is None: utils.err("This collector requires the `json' Python module.") - return 13 # Ask tcollector not to respawn us + return 13 # ask tcollector not to respawn us datanode_service = HadoopDataNode() while True: datanode_service.emit() - time.sleep(15) + time.sleep(COLLECTION_INTERVAL) return 0 if __name__ == "__main__": sys.exit(main(sys.argv)) - diff --git a/collectors/0/hadoop_journalnode.py b/collectors/0/hadoop_journalnode.py index 0771edaa..50e326cc 100755 --- a/collectors/0/hadoop_journalnode.py +++ b/collectors/0/hadoop_journalnode.py @@ -23,6 +23,7 @@ from collectors.lib import utils from collectors.lib.hadoop_http import HadoopHttp +COLLECTION_INTERVAL = 90 REPLACEMENTS = { "rpcdetailedactivityforport": ["rpc_activity"], @@ -59,10 +60,9 @@ def main(args): journalnode_service = HadoopJournalNode() while True: journalnode_service.emit() - time.sleep(90) + time.sleep(COLLECTION_INTERVAL) return 0 if __name__ == "__main__": sys.exit(main(sys.argv)) - diff --git a/collectors/0/hadoop_namenode.py b/collectors/0/hadoop_namenode.py index 67ec9fac..cade73f6 100755 --- a/collectors/0/hadoop_namenode.py +++ b/collectors/0/hadoop_namenode.py @@ -23,6 +23,7 @@ from collectors.lib import utils from collectors.lib.hadoop_http import HadoopHttp +COLLECTION_INTERVAL = 90 REPLACEMENTS = { "rpcdetailedactivityforport": ["rpc_activity"], @@ -55,14 +56,13 @@ def main(args): utils.drop_privileges() if json is None: utils.err("This collector requires the `json' Python module.") - return 13 # Ask tcollector not to respawn us + return 13 # ask tcollector not to respawn us name_node_service = HadoopNameNode() while True: name_node_service.emit() - time.sleep(90) + 
time.sleep(COLLECTION_INTERVAL) return 0 if __name__ == "__main__": sys.exit(main(sys.argv)) - diff --git a/collectors/0/hadoop_yarn_node_manager.py b/collectors/0/hadoop_yarn_node_manager.py index 1df8b138..e61b0681 100755 --- a/collectors/0/hadoop_yarn_node_manager.py +++ b/collectors/0/hadoop_yarn_node_manager.py @@ -23,9 +23,9 @@ import time try: - import json + import json except ImportError: - json = None + json = None import argparse SRCDIR = os.path.join(os.path.dirname(__file__)) @@ -40,53 +40,53 @@ class HadoopYarnNodeManager(HadoopHttp): - """ - Class that will retrieve metrics from an Apache Hadoop Yarn Node Manager JMX API + """ + Class that will retrieve metrics from an Apache Hadoop Yarn Node Manager JMX API - Tested on Apache Hadoop 2.7 - """ + Tested on Apache Hadoop 2.7 + """ - def __init__(self, host='localhost', port=8042): - super(HadoopYarnNodeManager, self).__init__('hadoop', - 'yarn.node_manager', - host, - port) + def __init__(self, host='localhost', port=8042): + super(HadoopYarnNodeManager, self).__init__('hadoop', + 'yarn.node_manager', + host, + port) - def emit(self): - current_time = int(time.time()) - metrics = self.poll() - for context, metric_name, value in metrics: - for key, value in REPLACEMENTS.items(): - if any(_.startswith(key) for _ in context): - context = value - self.emit_metric(context, current_time, metric_name, value) + def emit(self): + current_time = int(time.time()) + metrics = self.poll() + for context, metric_name, value in metrics: + for key, value in REPLACEMENTS.items(): + if any(_.startswith(key) for _ in context): + context = value + self.emit_metric(context, current_time, metric_name, value) # args are useful for testing but no given by TCollector so will inherit defaults normally def main(args): - """ Calls HadoopYarnNodeManager at interval secs - and emits metrics to stdout for TCollector """ - if json is None: - utils.err("This collector requires the `json' Python module.") - return 13 # Ask tcollector not to respawn us - utils.drop_privileges() - parser = argparse.ArgumentParser() - parser.add_argument('-H', '--host', default='localhost', - help='Host to connect to (default: localhost)') - parser.add_argument('-P', '--port', default=8042, type=int, - help='Port to connect to (default: 8042)') - parser.add_argument('-i', '--interval', default=90, type=int, - help='Interval at which to emit metrics') - args = parser.parse_args(args[1:]) - host = args.host - port = args.port - interval = args.interval - yarn_service = HadoopYarnNodeManager(host=host, port=port) - while True: - yarn_service.emit() - time.sleep(interval) - return 0 + """ Calls HadoopYarnNodeManager at interval secs + and emits metrics to stdout for TCollector """ + if json is None: + utils.err("This collector requires the `json' Python module.") + return 13 # ask tcollector not to respawn us + utils.drop_privileges() + parser = argparse.ArgumentParser() + parser.add_argument('-H', '--host', default='localhost', + help='Host to connect to (default: localhost)') + parser.add_argument('-P', '--port', default=8042, type=int, + help='Port to connect to (default: 8042)') + parser.add_argument('-i', '--interval', default=90, type=int, + help='Interval at which to emit metrics') + args = parser.parse_args(args[1:]) + host = args.host + port = args.port + interval = args.interval + yarn_service = HadoopYarnNodeManager(host=host, port=port) + while True: + yarn_service.emit() + time.sleep(interval) + return 0 if __name__ == "__main__": - sys.exit(main(sys.argv)) + 
sys.exit(main(sys.argv)) diff --git a/collectors/0/hadoop_yarn_resource_manager.py b/collectors/0/hadoop_yarn_resource_manager.py index cc5621f2..bb3912e1 100755 --- a/collectors/0/hadoop_yarn_resource_manager.py +++ b/collectors/0/hadoop_yarn_resource_manager.py @@ -21,72 +21,70 @@ import os import sys import time - -try: - import json -except ImportError: - json = None +import json import argparse +from collectors.lib import utils +from collectors.lib.hadoop_http import HadoopHttp + + SRCDIR = os.path.join(os.path.dirname(__file__)) LIBDIR = os.path.join(SRCDIR, '..', 'lib') sys.path.append(LIBDIR) -# pylint: disable=wrong-import-position -from collectors.lib import utils -from collectors.lib.hadoop_http import HadoopHttp + REPLACEMENTS = { } class HadoopYarnResourceManager(HadoopHttp): - """ - Class that will retrieve metrics from an Apache Hadoop Yarn Resource Manager JMX API + """ + Class that will retrieve metrics from an Apache Hadoop Yarn Resource Manager JMX API - Tested on Apache Hadoop 2.7 - """ + Tested on Apache Hadoop 2.7 + """ - def __init__(self, host='localhost', port=8088): - super(HadoopYarnResourceManager, self).__init__('hadoop', - 'yarn.resource_manager', - host, - port) + def __init__(self, host='localhost', port=8088): + super(HadoopYarnResourceManager, self).__init__('hadoop', + 'yarn.resource_manager', + host, + port) - def emit(self): - current_time = int(time.time()) - metrics = self.poll() - for context, metric_name, value in metrics: - for key, value in REPLACEMENTS.items(): - if any(_.startswith(key) for _ in context): - context = value - self.emit_metric(context, current_time, metric_name, value) + def emit(self): + current_time = int(time.time()) + metrics = self.poll() + for context, metric_name, value in metrics: + for key, value in REPLACEMENTS.items(): + if any(_.startswith(key) for _ in context): + context = value + self.emit_metric(context, current_time, metric_name, value) # args are useful for testing but no given by TCollector so will inherit defaults normally def main(args): - """ Calls HadoopYarnResourceManager at interval secs - and emits metrics to stdout for TCollector """ - if json is None: - utils.err("This collector requires the `json' Python module.") - return 13 # Ask tcollector not to respawn us - utils.drop_privileges() - parser = argparse.ArgumentParser() - parser.add_argument('-H', '--host', default='localhost', - help='Host to connect to (default: localhost)') - parser.add_argument('-P', '--port', default=8088, type=int, - help='Port to connect to (default: 8088)') - parser.add_argument('-i', '--interval', default=90, type=int, - help='Interval at which to emit metrics') - args = parser.parse_args(args[1:]) - host = args.host - port = args.port - interval = args.interval - yarn_service = HadoopYarnResourceManager(host=host, port=port) - while True: - yarn_service.emit() - time.sleep(interval) - return 0 + """ Calls HadoopYarnResourceManager at interval secs + and emits metrics to stdout for TCollector """ + if json is None: + utils.err("This collector requires the `json' Python module.") + return 13 # ask tcollector not to respawn us + utils.drop_privileges() + parser = argparse.ArgumentParser() + parser.add_argument('-H', '--host', default='localhost', + help='Host to connect to (default: localhost)') + parser.add_argument('-P', '--port', default=8088, type=int, + help='Port to connect to (default: 8088)') + parser.add_argument('-i', '--interval', default=90, type=int, + help='Interval at which to emit metrics') + args = 
parser.parse_args(args[1:]) + host = args.host + port = args.port + interval = args.interval + yarn_service = HadoopYarnResourceManager(host=host, port=port) + while True: + yarn_service.emit() + time.sleep(interval) + return 0 if __name__ == "__main__": - sys.exit(main(sys.argv)) + sys.exit(main(sys.argv)) diff --git a/collectors/0/haproxy.py b/collectors/0/haproxy.py index 0fd82331..e131fa88 100755 --- a/collectors/0/haproxy.py +++ b/collectors/0/haproxy.py @@ -90,50 +90,53 @@ "srv_abrt": "server_aborted_data_transfers" } + def haproxy_pid(): - """Finds out the pid of haproxy process""" - try: - pid = subprocess.check_output(["pidof", "-s", "haproxy"]) - except subprocess.CalledProcessError: - return None - return pid.rstrip() + """Finds out the pid of haproxy process""" + try: + pid = subprocess.check_output(["pidof", "-s", "haproxy"]) + except subprocess.CalledProcessError: + return None + return pid.rstrip() + def find_conf_file(pid): - """Returns the conf file of haproxy.""" - try: - output = subprocess.check_output(["ps", "--no-headers", "-o", "cmd", pid]) - except subprocess.CalledProcessError as e: - utils.err("HAProxy (pid %s) went away? %s" % (pid, e)) - return None - return output.split("-f")[1].split()[0] + """Returns the conf file of haproxy.""" + try: + output = subprocess.check_output(["ps", "--no-headers", "-o", "cmd", pid]) + except subprocess.CalledProcessError as e: + utils.err("HAProxy (pid %s) went away? %s" % (pid, e)) + return None + return output.split("-f")[1].split()[0] + def find_sock_file(conf_file): - """Returns the unix socket file of haproxy.""" - try: - fd = open(conf_file) - except IOError as e: - utils.err("Error: %s. Config file path is relative: %s" % (e, conf_file)) - return None - try: - for line in fd: - if line.lstrip(" \t").startswith("stats socket"): - sock_file = line.split()[2] - if utils.is_sockfile(sock_file): - return sock_file - finally: - fd.close() + """Returns the unix socket file of haproxy.""" + try: + fd = open(conf_file) + except IOError as e: + utils.err("Error: %s. Config file path is relative: %s" % (e, conf_file)) + return None + try: + for line in fd: + if line.lstrip(" \t").startswith("stats socket"): + sock_file = line.split()[2] + if utils.is_sockfile(sock_file): + return sock_file + finally: + fd.close() def collect_stats(sock_file): """Collects stats from haproxy unix domain socket""" - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) try: - sock.settimeout(COLLECTION_INTERVAL) - sock.connect(sock_file) - sock.send("show stat\n") - statlines = sock.recv(10240).split('\n') + sock.settimeout(COLLECTION_INTERVAL) + sock.connect(sock_file) + sock.send("show stat\n") + statlines = sock.recv(10240).split('\n') finally: - sock.close() + sock.close() ts = time.time() # eat up any empty lines that may be present @@ -183,32 +186,32 @@ def print_metric(line, metric, timestamp): if not value: value = 0 print("haproxy.%s %i %s source=%s cluster=%s" - % (METRIC_NAMES[metric], - timestamp, - value, - line["svname"], - line["pxname"])) + % (METRIC_NAMES[metric], + timestamp, + value, + line["svname"], + line["pxname"])) def main(): - pid = haproxy_pid() - if not pid: - utils.err("Error: HAProxy is not running") - return 13 # Ask tcollector to not respawn us. + pid = haproxy_pid() + if not pid: + utils.err("Error: HAProxy is not running") + return 13 # ask tcollector to not respawn us. 
- conf_file = find_conf_file(pid) - if not conf_file: - return 13 + conf_file = find_conf_file(pid) + if not conf_file: + return 13 - sock_file = find_sock_file(conf_file) - if sock_file is None: - utils.err("Error: HAProxy is not listening on any unix domain socket") - return 13 + sock_file = find_sock_file(conf_file) + if sock_file is None: + utils.err("Error: HAProxy is not listening on any unix domain socket") + return 13 + while True: + collect_stats(sock_file) + time.sleep(COLLECTION_INTERVAL) - while True: - collect_stats(sock_file) - time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/hbase_master.py b/collectors/0/hbase_master.py index bf7f87d9..cb893bcb 100755 --- a/collectors/0/hbase_master.py +++ b/collectors/0/hbase_master.py @@ -15,11 +15,6 @@ import sys import time -try: - import json -except ImportError: - json = None - from collectors.lib import utils from collectors.lib.hadoop_http import HadoopHttp @@ -48,9 +43,6 @@ def emit(self): def main(args): utils.drop_privileges() - if json is None: - utils.err("This collector requires the `json' Python module.") - return 13 # Ask tcollector not to respawn us hbase_service = HBaseMaster() while True: hbase_service.emit() @@ -60,4 +52,3 @@ def main(args): if __name__ == "__main__": sys.exit(main(sys.argv)) - diff --git a/collectors/0/hbase_regionserver.py b/collectors/0/hbase_regionserver.py index e2b67b36..876783fd 100755 --- a/collectors/0/hbase_regionserver.py +++ b/collectors/0/hbase_regionserver.py @@ -14,20 +14,18 @@ import time import re - -try: - import json -except ImportError: - json = None +import sys from collectors.lib import utils from collectors.lib.hadoop_http import HadoopHttp +COLLECTION_INTERVAL = 15 EMIT_REGION = True EXCLUDED_CONTEXTS = ("master") REGION_METRIC_PATTERN = re.compile(r"[N|n]amespace_(.*)_table_(.*)_region_(.*)_metric_(.*)") + class HBaseRegionserver(HadoopHttp): def __init__(self): super(HBaseRegionserver, self).__init__("hbase", "regionserver", "localhost", 16030) @@ -70,16 +68,12 @@ def emit(self): def main(args): utils.drop_privileges() - if json is None: - utils.err("This collector requires the `json' Python module.") - return 13 # Ask tcollector not to respawn us hbase_service = HBaseRegionserver() while True: hbase_service.emit() - time.sleep(15) + time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": - import sys sys.exit(main(sys.argv)) - diff --git a/collectors/0/ifrate.py b/collectors/0/ifrate.py index 2f6d7f82..f22ebf06 100755 --- a/collectors/0/ifrate.py +++ b/collectors/0/ifrate.py @@ -48,23 +48,25 @@ except ImportError: ifrate_conf = None -DEFAULT_COLLECTION_INTERVAL=15 - +DEFAULT_COLLECTION_INTERVAL = 15 signal_received = None + + def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """top main loop""" collection_interval=DEFAULT_COLLECTION_INTERVAL - if(ifrate_conf): + if ifrate_conf: config = ifrate_conf.get_config() - collection_interval=config['collection_interval'] - interfaces=config['interfaces'] - report_packets=config['report_packets'] - merge_err_in_out=config['merge_err_in_out'] + collection_interval = config['collection_interval'] + interfaces = config['interfaces'] + report_packets = config['report_packets'] + merge_err_in_out = config['merge_err_in_out'] global signal_received @@ -80,20 +82,20 @@ def main(): ["netstat", "-I", intname, "-d", "-w", str(collection_interval)], stdout=subprocess.PIPE, )) - intnum+=1 + intnum += 1 else: - sys.exit(13) # we 
signal tcollector to not run us + return 13 # we signal tcollector to not run us except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + return 13 # we signal tcollector to not run us raise timestamp = 0 procnum = 0 while signal_received is None: - if (procnum >= intnum): + if procnum >= intnum: procnum=0 try: line = p_net[procnum].stdout.readline() @@ -109,14 +111,14 @@ def main(): if (re.match("^[0-9 ]+$",line)): fields = line.split() if len(fields) == 9: - if(procnum == 0): + if procnum == 0: timestamp = int(time.time()) print("ifrate.byt.in %s %s int=%s" % (timestamp, int(fields[3])/collection_interval, interfaces[procnum])) print("ifrate.byt.out %s %s int=%s" % (timestamp, int(fields[6])/collection_interval, interfaces[procnum])) - if(report_packets): + if report_packets: print("ifrate.pkt.in %s %s int=%s" % (timestamp, int(fields[0])/collection_interval, interfaces[procnum])) print("ifrate.pkt.out %s %s int=%s" % (timestamp, int(fields[4])/collection_interval, interfaces[procnum])) - if(merge_err_in_out): + if merge_err_in_out: print("ifrate.err %s %s int=%s" % (timestamp, (int(fields[1])+int(fields[5]))/collection_interval, interfaces[procnum])) print("ifrate.drp %s %s int=%s" % (timestamp, (int(fields[2])+int(fields[8]))/collection_interval, interfaces[procnum])) else: @@ -127,7 +129,7 @@ def main(): print("ifrate.col %s %s int=%s" % (timestamp, int(fields[7])/collection_interval, interfaces[procnum])) # analyze next process - procnum+=1 + procnum += 1 sys.stdout.flush() @@ -142,8 +144,9 @@ def main(): p_net[procnum].wait() # If no line at all has been proceeded (wrong interface name ?), we signal tcollector to not run us - if(timestamp == 0): - exit(13) + if timestamp == 0: + return 13 + if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/collectors/0/ifstat.py b/collectors/0/ifstat.py index 8e0aaf93..6a19865c 100755 --- a/collectors/0/ifstat.py +++ b/collectors/0/ifstat.py @@ -20,7 +20,7 @@ from collectors.lib import utils -interval = 15 # seconds +COLLECTION_INTERVAL = 15 # seconds # /proc/net/dev has 16 fields, 8 for receive and 8 for transmit, # defined below. 
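For reference on the 16-column layout noted above, a standalone sketch of reading /proc/net/dev (illustration only, independent of this collector; the first 8 numeric columns are receive counters and the last 8 are transmit counters):

    with open("/proc/net/dev") as f:
        for line in f:
            if ":" not in line:
                continue                      # skip the two header lines
            intf, data = line.split(":", 1)
            fields = data.split()             # 16 numeric columns
            rx, tx = fields[:8], fields[8:]
            print("%s rx_bytes=%s tx_bytes=%s" % (intf.strip(), rx[0], tx[0]))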
@@ -89,7 +89,8 @@ def direction(i): % (FIELDS[i], ts, stats[i], intf, direction(i))) sys.stdout.flush() - time.sleep(interval) + time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/iostat.py b/collectors/0/iostat.py index 64810676..be2d434d 100755 --- a/collectors/0/iostat.py +++ b/collectors/0/iostat.py @@ -261,7 +261,7 @@ def main(): % (metric, FIELDS_PART[i], ts, values[i + 3], device)) else: print("Cannot parse /proc/diskstats line: ", line, file=sys.stderr) - exit(13) # tcollector does not restart collectors with return code 13 + return 13 # tcollector does not restart collectors with return code 13 sys.stdout.flush() time.sleep(COLLECTION_INTERVAL) diff --git a/collectors/0/jolokia.py b/collectors/0/jolokia.py index 8d54ebac..d3fd96ac 100755 --- a/collectors/0/jolokia.py +++ b/collectors/0/jolokia.py @@ -176,7 +176,7 @@ def parse_attribute(self, attr, not_tags=[]): def main(): if not (jolokia_conf and jolokia_conf.enabled()): utils.err("Jolokia collector disable by config") - sys.exit(13) + return 13 # ask tcollector to not respawn us utils.drop_privileges() CONFIG = jolokia_conf.get_config() @@ -217,7 +217,6 @@ def main(): break # End while True -if __name__ == "__main__": - main() -sys.exit(0) +if __name__ == "__main__": + sys.exit(main()) diff --git a/collectors/0/mapr_metrics.py b/collectors/0/mapr_metrics.py index eae7fa8f..dc0745ba 100755 --- a/collectors/0/mapr_metrics.py +++ b/collectors/0/mapr_metrics.py @@ -11,188 +11,193 @@ from collectors.etc import mapr_metrics_conf from collectors.lib import utils - try: - import requests + import requests except ImportError: - print >>sys.stderr, "Please install the requests module." - sys.exit(1) + print >> sys.stderr, "Please install the requests module." + sys.exit(1) try: - from collectors.etc import mapr_metrics_conf + from collectors.etc import mapr_metrics_conf except ImportError: - utils.err("No mapr_metrics configuration found!") - sys.exit(13) + utils.err("No mapr_metrics configuration found!") + sys.exit(13) CONFIG = mapr_metrics_conf.get_config() def get_metrics(webserver_url, username, password, params): - try: - r = requests.get(webserver_url, auth=(username,password), verify=False, params=params) - except requests.exceptions.ConnectionError as error: - print >>sys.stderr, "Error connecting: %s" % error - utils.err("Connection error: %s" % error) - raise - - try: - r.raise_for_status() - except requests.exceptions.HTTPError as error: - print >>sys.stderr, "Request was not successful: %s" % error - utils.err("HTTP error getting metrics from '%s' - %s" % (webserver_url, error)) - return 13 # tell tcollector to not respawn - - response = r.json() - try: - data = response['data'] - except KeyError as e: - print >>sys.stderr, "Did not get a 'data' key in the response." 
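One hedged observation on mapr_metrics.py below: both the import guards and the reindented functions keep the Python 2-only chevron form print >>sys.stderr, .... If this module ever adopts from __future__ import print_function, as the other collectors in this change do, the equivalent spelling would be:

    from __future__ import print_function
    import sys

    print("Please install the requests module.", file=sys.stderr)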
- print >>sys.stderr, response - raise - return data + try: + r = requests.get(webserver_url, auth=(username, password), verify=False, params=params) + except requests.exceptions.ConnectionError as error: + print >> sys.stderr, "Error connecting: %s" % error + utils.err("Connection error: %s" % error) + raise + + try: + r.raise_for_status() + except requests.exceptions.HTTPError as error: + print >> sys.stderr, "Request was not successful: %s" % error + utils.err("HTTP error getting metrics from '%s' - %s" % (webserver_url, error)) + return 13 # tell tcollector to not respawn + + response = r.json() + try: + data = response['data'] + except KeyError as e: + print >> sys.stderr, "Did not get a 'data' key in the response." + print >> sys.stderr, response + raise + return data + def main(): - schema = "https" + schema = "https" - username = CONFIG['username'] - password = CONFIG['password'] - webserver = CONFIG['webserver'] - port = CONFIG['port'] - if CONFIG['no_ssl']: - schema = "http" - webserver_url = "%s://%s:%d/rest/node/metrics" % (schema, webserver, port) + username = CONFIG['username'] + password = CONFIG['password'] + webserver = CONFIG['webserver'] + port = CONFIG['port'] + if CONFIG['no_ssl']: + schema = "http" + webserver_url = "%s://%s:%d/rest/node/metrics" % (schema, webserver, port) + + m = Metrics2TSD(webserver_url, username, password) + m.run() - m = Metrics2TSD(webserver_url, username, password) - m.run() class Metrics2TSD: - def __init__(self, webserver_url, username='mapr', password='mapr'): - self.metric_template = Template('mapr.$grouping.$metric') - self.webserver_url = webserver_url - self.username = username - self.password = password - self.failed_attempts = 0 - self.last_values = { } - - self.cluster_name = self.get_cluster_name() - - def get_cluster_name(self): - cluster_name = None - with open('/opt/mapr/conf/mapr-clusters.conf', 'r') as clusters_conf: - firstline = clusters_conf.readline() - cluster_name = re.split('\s+', firstline)[0] - return re.sub('\.', '_', cluster_name) - - def run(self): - seconds_delay = CONFIG['interval'] - - while True: - end = datetime.datetime.now() - start = end - timedelta(seconds=seconds_delay) - ms_start = int(start.strftime('%s')) * 1000 - ms_end = int(end.strftime('%s')) * 1000 - nodename = platform.node().split('.')[0] # if node() returns the fqdn, the metrics can't be retrieved - params = { 'nodes': nodename, 'start': ms_start, 'end': ms_end } - - try: - all_metrics = get_metrics(self.webserver_url, self.username, self.password, params) + def __init__(self, webserver_url, username='mapr', password='mapr'): + self.metric_template = Template('mapr.$grouping.$metric') + self.webserver_url = webserver_url + self.username = username + self.password = password self.failed_attempts = 0 - except requests.exceptions.ConnectionError as error: - self.failed_attempts += 1 - utils.err("Error connecting to %s, have experienced %d errors so far." % (self.webserver_url, self.failed_attempts)) - if self.failed_attempts > 5: - print >>sys.stderr, "Failed 5 times, exiting." 
- return 13 - continue - - if len(all_metrics) > 0: - for d in all_metrics[-1:]: - node = d['NODE'] - timestamp = int(d['TIMESTAMP']) / 1000 - tags = { - 'node': node, - 'cluster': self.cluster_name - } - - for group in ('DISKS','CPUS','NETWORK'): - if group in d: - self.group_metrics(group, self.last_values, d, tags=tags) - try: - self.send_gauge('mapr.memory.used', int(d['MEMORYUSED']) * (1024*1024), timestamp, tags=tags) - except KeyError as e: - utils.err('%s not in metrics data.' % e) - - try: - self.send_gauge('mapr.mfs.available', int(d['SERVAVAILSIZEMB']) * (1024 * 1024), timestamp, tags=tags) - except KeyError as e: - utils.err('%s not in metrics data.' % e) - - try: - self.send_gauge('mapr.mfs.used', int(d['SERVUSEDSIZEMB']) * (1024 * 1024), timestamp, tags=tags) - except KeyError as e: - utils.err('%s not in metrics data.' % e) - - try: - rpccount_metric = self.metric_template.substitute(grouping='rpc', metric='count') - if rpccount_metric in self.last_values: - self.send_counter(rpccount_metric, self.last_values[rpccount_metric], d['RPCCOUNT'], timestamp, tags=tags) - self.last_values[rpccount_metric] = d['RPCCOUNT'] - except KeyError as e: - utils.err('%s is not in metrics data.' % e) - - try: - rpcinbytes_metric = self.metric_template.substitute(grouping='rpc', metric='inbytes') - if rpcinbytes_metric in self.last_values: - self.send_counter(rpcinbytes_metric, self.last_values[rpcinbytes_metric], d['RPCINBYTES'], timestamp, tags=tags) - self.last_values[rpcinbytes_metric] = d['RPCINBYTES'] - except KeyError as e: - utils.err('%s is not in metrics data.' % e) - - try: - rpcoutbytes_metric = self.metric_template.substitute(grouping='rpc', metric='outbytes') - if rpcoutbytes_metric in self.last_values: - self.send_counter(rpcoutbytes_metric, self.last_values[rpcoutbytes_metric], d['RPCOUTBYTES'], timestamp, tags=tags) - self.last_values[rpcoutbytes_metric] = d['RPCOUTBYTES'] - except KeyError as e: - utils.err('%s is not in metrics data.' 
% e) - time.sleep(seconds_delay) - - - def group_metrics(self, group, last_values, all_metrics, tags={}): - node = all_metrics['NODE'] - timestamp = int(all_metrics['TIMESTAMP']) / 1000 - - for (obj, obj_metrics) in all_metrics[group].items(): - for (metric_name, value) in obj_metrics.items(): - t = tags.copy() - if group == 'DISKS': - t['disk'] = obj - if metric_name.endswith('KB'): - metric_name = re.sub("KB", "BYTES", metric_name) - value = value * 1024 - if group == 'CPUS': - t['cpu'] = obj - if group == 'NETWORK': - t['interface'] = obj - metric = self.metric_template.substitute(grouping=group.lower(), metric=metric_name) - self.print_opentsdb_message(metric, timestamp, value, t) - - def print_opentsdb_message(self, metric, timestamp, value, tags): - tag_string = " ".join(map(lambda x: "%s=%s" % x, tags.items())) - print("%s %i %d %s" % (metric, timestamp, value, tag_string)) - - def send_gauge(self, metric, value, timestamp, tags={}): - self.print_opentsdb_message(metric, timestamp, value, tags) - - def send_counter(self, metric, last_value, value, timestamp, tags={}): - delta = value - last_value - self.print_opentsdb_message(metric, timestamp, delta, tags) + self.last_values = {} + + self.cluster_name = self.get_cluster_name() + + def get_cluster_name(self): + cluster_name = None + with open('/opt/mapr/conf/mapr-clusters.conf', 'r') as clusters_conf: + firstline = clusters_conf.readline() + cluster_name = re.split('\s+', firstline)[0] + return re.sub('\.', '_', cluster_name) + + def run(self): + seconds_delay = CONFIG['interval'] + + while True: + end = datetime.datetime.now() + start = end - timedelta(seconds=seconds_delay) + ms_start = int(start.strftime('%s')) * 1000 + ms_end = int(end.strftime('%s')) * 1000 + nodename = platform.node().split('.')[0] # if node() returns the fqdn, the metrics can't be retrieved + params = {'nodes': nodename, 'start': ms_start, 'end': ms_end} + + try: + all_metrics = get_metrics(self.webserver_url, self.username, self.password, params) + self.failed_attempts = 0 + except requests.exceptions.ConnectionError as error: + self.failed_attempts += 1 + utils.err("Error connecting to %s, have experienced %d errors so far." % ( + self.webserver_url, self.failed_attempts)) + if self.failed_attempts > 5: + print >> sys.stderr, "Failed 5 times, exiting." + return 13 + continue + + if len(all_metrics) > 0: + for d in all_metrics[-1:]: + node = d['NODE'] + timestamp = int(d['TIMESTAMP']) / 1000 + tags = { + 'node': node, + 'cluster': self.cluster_name + } + + for group in ('DISKS', 'CPUS', 'NETWORK'): + if group in d: + self.group_metrics(group, self.last_values, d, tags=tags) + try: + self.send_gauge('mapr.memory.used', int(d['MEMORYUSED']) * (1024 * 1024), timestamp, tags=tags) + except KeyError as e: + utils.err('%s not in metrics data.' % e) + + try: + self.send_gauge('mapr.mfs.available', int(d['SERVAVAILSIZEMB']) * (1024 * 1024), timestamp, + tags=tags) + except KeyError as e: + utils.err('%s not in metrics data.' % e) + + try: + self.send_gauge('mapr.mfs.used', int(d['SERVUSEDSIZEMB']) * (1024 * 1024), timestamp, tags=tags) + except KeyError as e: + utils.err('%s not in metrics data.' % e) + + try: + rpccount_metric = self.metric_template.substitute(grouping='rpc', metric='count') + if rpccount_metric in self.last_values: + self.send_counter(rpccount_metric, self.last_values[rpccount_metric], d['RPCCOUNT'], + timestamp, tags=tags) + self.last_values[rpccount_metric] = d['RPCCOUNT'] + except KeyError as e: + utils.err('%s is not in metrics data.' 
% e) + + try: + rpcinbytes_metric = self.metric_template.substitute(grouping='rpc', metric='inbytes') + if rpcinbytes_metric in self.last_values: + self.send_counter(rpcinbytes_metric, self.last_values[rpcinbytes_metric], d['RPCINBYTES'], + timestamp, tags=tags) + self.last_values[rpcinbytes_metric] = d['RPCINBYTES'] + except KeyError as e: + utils.err('%s is not in metrics data.' % e) + + try: + rpcoutbytes_metric = self.metric_template.substitute(grouping='rpc', metric='outbytes') + if rpcoutbytes_metric in self.last_values: + self.send_counter(rpcoutbytes_metric, self.last_values[rpcoutbytes_metric], + d['RPCOUTBYTES'], timestamp, tags=tags) + self.last_values[rpcoutbytes_metric] = d['RPCOUTBYTES'] + except KeyError as e: + utils.err('%s is not in metrics data.' % e) + time.sleep(seconds_delay) + + def group_metrics(self, group, last_values, all_metrics, tags={}): + node = all_metrics['NODE'] + timestamp = int(all_metrics['TIMESTAMP']) / 1000 + + for (obj, obj_metrics) in all_metrics[group].items(): + for (metric_name, value) in obj_metrics.items(): + t = tags.copy() + if group == 'DISKS': + t['disk'] = obj + if metric_name.endswith('KB'): + metric_name = re.sub("KB", "BYTES", metric_name) + value = value * 1024 + if group == 'CPUS': + t['cpu'] = obj + if group == 'NETWORK': + t['interface'] = obj + metric = self.metric_template.substitute(grouping=group.lower(), metric=metric_name) + self.print_opentsdb_message(metric, timestamp, value, t) + + def print_opentsdb_message(self, metric, timestamp, value, tags): + tag_string = " ".join(map(lambda x: "%s=%s" % x, tags.items())) + print("%s %i %d %s" % (metric, timestamp, value, tag_string)) + + def send_gauge(self, metric, value, timestamp, tags={}): + self.print_opentsdb_message(metric, timestamp, value, tags) + + def send_counter(self, metric, last_value, value, timestamp, tags={}): + delta = value - last_value + self.print_opentsdb_message(metric, timestamp, delta, tags) if __name__ == "__main__": - if mapr_metrics_conf.enabled(): - sys.stdin.close() - sys.exit(main()) - else: - utils.err("Enable the mapr_metrics collector if you want MapR stats.") - sys.exit(13) + if mapr_metrics_conf.enabled(): + sys.stdin.close() + sys.exit(main()) + else: + utils.err("Enable the mapr_metrics collector if you want MapR stats.") + sys.exit(13) diff --git a/collectors/0/mongo.py b/collectors/0/mongo.py index 0f640ff5..1f58c571 100755 --- a/collectors/0/mongo.py +++ b/collectors/0/mongo.py @@ -62,11 +62,12 @@ ('opcounters', ('command', 'delete', 'getmore', 'insert', 'query', 'update')), ) + def main(): utils.drop_privileges() if pymongo is None: - print("error: Python module `pymongo' is missing", file=sys.stderr) - return 13 + print("error: Python module `pymongo' is missing", file=sys.stderr) + return 13 # ask tcollector to not respawn us c = pymongo.Connection(host=HOST, port=PORT) @@ -90,5 +91,6 @@ def main(): sys.stdout.flush() time.sleep(INTERVAL) + if __name__ == '__main__': sys.exit(main()) diff --git a/collectors/0/mongo3.py b/collectors/0/mongo3.py index 2fb56564..a17d4034 100755 --- a/collectors/0/mongo3.py +++ b/collectors/0/mongo3.py @@ -18,7 +18,6 @@ import sys import time -import os try: import pymongo except ImportError: @@ -205,6 +204,7 @@ def runServerStatus(c): for k, v in cur.items(): print('mongo.%s %d %s mode=%s' % (metric, ts, v, k)) + def runDbStats(c): for db_name in DB_NAMES: res = c[db_name].command('dbStats') @@ -233,6 +233,7 @@ def runDbStats(c): continue print('mongo.rs.%s %d %s replica=%s db=%s' % (metric, ts, cur, 
replica_name, db_name)) + def runReplSetGetStatus(c): res = c.admin.command('replSetGetStatus') ts = int(time.time()) @@ -258,6 +259,7 @@ def runReplSetGetStatus(c): continue print('mongo.replica.%s %d %s replica_set=%s replica=%s replica_state=%s replica_health=%s' % (metric, ts, cur, replica_set_name, replica_name, replica_state, replica_health)) + def loadEnv(): global USER, PASS, INTERVAL, DB_NAMES, CONFIG_CONN, MONGOS_CONN, REPLICA_CONN for item in mongodb3_conf.get_settings()['db'].split(','): @@ -281,13 +283,14 @@ def loadEnv(): PASS = mongodb3_conf.get_settings()['password'] INTERVAL = mongodb3_conf.get_settings()['interval'] + def main(): loadEnv() utils.drop_privileges() if pymongo is None: print("error: Python module `pymongo' is missing", file=sys.stderr) - return 13 + return 13 # ask tcollector to not respawn us for index, item in enumerate(CONFIG_CONN, start=0): conn = pymongo.MongoClient(host=item['host'], port=item['port']) @@ -320,5 +323,6 @@ def main(): sys.stdout.flush() time.sleep(INTERVAL) + if __name__ == '__main__': sys.exit(main()) diff --git a/collectors/0/mountstats.py b/collectors/0/mountstats.py index ad31bddf..01b1e1be 100755 --- a/collectors/0/mountstats.py +++ b/collectors/0/mountstats.py @@ -79,22 +79,11 @@ # proc.mountstats.bytes.writepages 1464196613 2477054 nfshost=fls1.sys.lab1.syseng.tmcs nfsvol=/vol/vol0 """ -import os -import socket import sys import time -PY3 = sys.version_info[0] > 2 -if PY3: - from hashlib import md5 +from hashlib import md5 - def md5_digest(line): - return md5(line.encode("utf8")).digest() -else: - import md5 # pylint: disable=import-error - - def md5_digest(line): - return md5.new(line).digest() COLLECTION_INTERVAL = 10 # seconds @@ -107,12 +96,17 @@ def md5_digest(line): # RPC_FIELDS is the individual metric fields on the RPC metric lines RPC_FIELDS = ['ops', 'txs', 'timeouts', 'txbytes', 'rxbytes', 'qtime', 'rttime', 'totaltime'] + +def md5_digest(line): + return md5(line.encode("utf8")).digest() + + def main(): """nfsstats main loop.""" try: f_nfsstats = open("/proc/self/mountstats", "r") except: - sys.exit(13) + return 13 while True: device = None @@ -174,7 +168,6 @@ def main(): for i in range(1, len(RPC_FIELDS) + 1): rpc_metrics[device]['other'][RPC_FIELDS[i-1]] += int(values[i]) - for device in rpc_metrics: # Skip the duplicates if 'dupe' in rpc_metrics[device]: @@ -195,6 +188,5 @@ def main(): time.sleep(COLLECTION_INTERVAL) - if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/collectors/0/mysql.py b/collectors/0/mysql.py index c0064c85..15fe5452 100755 --- a/collectors/0/mysql.py +++ b/collectors/0/mysql.py @@ -20,19 +20,15 @@ import sys import time -PY3 = sys.version_info[0] > 2 -if PY3: - INTEGER_TYPES = (int, ) -else: - INTEGER_TYPES = (int, long) # pylint: disable=undefined-variable +from collectors.etc import mysqlconf +from collectors.lib import utils + +INTEGER_TYPES = (int,) try: - import MySQLdb + import MySQLdb except ImportError: - MySQLdb = None # This is handled gracefully in main() - -from collectors.etc import mysqlconf -from collectors.lib import utils + MySQLdb = None # This is handled gracefully in main() COLLECTION_INTERVAL = 15 # seconds CONNECT_TIMEOUT = 2 # seconds @@ -40,368 +36,373 @@ DB_REFRESH_INTERVAL = 60 # seconds # Usual locations where to find the default socket file. DEFAULT_SOCKFILES = set([ - "/tmp/mysql.sock", # MySQL's own default. - "/var/lib/mysql/mysql.sock", # RH-type / RPM systems. - "/var/run/mysqld/mysqld.sock", # Debian-type systems. 
+ "/tmp/mysql.sock", # MySQL's own default. + "/var/lib/mysql/mysql.sock", # RH-type / RPM systems. + "/var/run/mysqld/mysqld.sock", # Debian-type systems. ]) # Directories under which to search additional socket files. SEARCH_DIRS = [ - "/var/lib/mysql", + "/var/lib/mysql", ] -class DB(object): - """Represents a MySQL server (as we can monitor more than 1 MySQL).""" - def __init__(self, sockfile, dbname, db, cursor, version): - """Constructor. - - Args: - sockfile: Path to the socket file. - dbname: Name of the database for that socket file. - db: A MySQLdb connection opened to that socket file. - cursor: A cursor acquired from that connection. - version: What version is this MySQL running (from `SELECT VERSION()'). - """ - self.sockfile = sockfile - self.dbname = dbname - self.db = db - self.cursor = cursor - self.version = version - self.master = None - self.slave_bytes_executed = None - self.relay_bytes_relayed = None - - version = version.split(".") - try: - self.major = int(version[0]) - self.medium = int(version[1]) - except (ValueError, IndexError) as e: - self.major = self.medium = 0 - - def __str__(self): - return "DB(%r, %r, version=%r)" % (self.sockfile, self.dbname, - self.version) - - def __repr__(self): - return self.__str__() - - def isShowGlobalStatusSafe(self): - """Returns whether or not SHOW GLOBAL STATUS is safe to run.""" - # We can't run SHOW GLOBAL STATUS on versions prior to 5.1 because it - # locks the entire database for too long and severely impacts traffic. - return self.major > 5 or (self.major == 5 and self.medium >= 1) - - def query(self, sql): - """Executes the given SQL statement and returns a sequence of rows.""" - assert self.cursor, "%s already closed?" % (self,) - try: - self.cursor.execute(sql) - except MySQLdb.OperationalError as exc: - if exc.errno != 2006: # "MySQL server has gone away" # pylint:disable=no-member - raise - self._reconnect() - return self.cursor.fetchall() - - def close(self): - """Closes the connection to this MySQL server.""" - if self.cursor: - self.cursor.close() - self.cursor = None - if self.db: - self.db.close() - self.db = None - - def _reconnect(self): - """Reconnects to this MySQL server.""" - self.close() - self.db = mysql_connect(self.sockfile) - self.cursor = self.db.cursor() +class DB(object): + """Represents a MySQL server (as we can monitor more than 1 MySQL).""" + + def __init__(self, sockfile, dbname, db, cursor, version): + """Constructor. + + Args: + sockfile: Path to the socket file. + dbname: Name of the database for that socket file. + db: A MySQLdb connection opened to that socket file. + cursor: A cursor acquired from that connection. + version: What version is this MySQL running (from `SELECT VERSION()'). + """ + self.sockfile = sockfile + self.dbname = dbname + self.db = db + self.cursor = cursor + self.version = version + self.master = None + self.slave_bytes_executed = None + self.relay_bytes_relayed = None + + version = version.split(".") + try: + self.major = int(version[0]) + self.medium = int(version[1]) + except (ValueError, IndexError) as e: + self.major = self.medium = 0 + + def __str__(self): + return "DB(%r, %r, version=%r)" % (self.sockfile, self.dbname, + self.version) + + def __repr__(self): + return self.__str__() + + def isShowGlobalStatusSafe(self): + """Returns whether or not SHOW GLOBAL STATUS is safe to run.""" + # We can't run SHOW GLOBAL STATUS on versions prior to 5.1 because it + # locks the entire database for too long and severely impacts traffic. 
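        An illustrative reading of the version gate on the next line, not part of the patch itself:

            # "5.5.62" -> major=5, medium=5 -> True  (SHOW GLOBAL STATUS is collected)
            # "5.0.96" -> major=5, medium=0 -> False (the expensive statement is skipped)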
+ return self.major > 5 or (self.major == 5 and self.medium >= 1) + + def query(self, sql): + """Executes the given SQL statement and returns a sequence of rows.""" + assert self.cursor, "%s already closed?" % (self,) + try: + self.cursor.execute(sql) + except MySQLdb.OperationalError as exc: + if exc.errno != 2006: # "MySQL server has gone away" # pylint:disable=no-member + raise + self._reconnect() + return self.cursor.fetchall() + + def close(self): + """Closes the connection to this MySQL server.""" + if self.cursor: + self.cursor.close() + self.cursor = None + if self.db: + self.db.close() + self.db = None + + def _reconnect(self): + """Reconnects to this MySQL server.""" + self.close() + self.db = mysql_connect(self.sockfile) + self.cursor = self.db.cursor() def mysql_connect(sockfile): - """Connects to the MySQL server using the specified socket file.""" - user, passwd = mysqlconf.get_user_password(sockfile) - return MySQLdb.connect(unix_socket=sockfile, - connect_timeout=CONNECT_TIMEOUT, - user=user, passwd=passwd) + """Connects to the MySQL server using the specified socket file.""" + user, passwd = mysqlconf.get_user_password(sockfile) + return MySQLdb.connect(unix_socket=sockfile, + connect_timeout=CONNECT_TIMEOUT, + user=user, passwd=passwd) def todict(db, row): - """Transforms a row (returned by DB.query) into a dict keyed by column names. - - Args: - db: The DB instance from which this row was obtained. - row: A row as returned by DB.query - """ - d = {} - for i, field in enumerate(db.cursor.description): - column = field[0].lower() # Lower-case to normalize field names. - d[column] = row[i] - return d + """Transforms a row (returned by DB.query) into a dict keyed by column names. + + Args: + db: The DB instance from which this row was obtained. + row: A row as returned by DB.query + """ + d = {} + for i, field in enumerate(db.cursor.description): + column = field[0].lower() # Lower-case to normalize field names. + d[column] = row[i] + return d + def get_dbname(sockfile): - """Returns the name of the DB based on the path to the socket file.""" - if sockfile in DEFAULT_SOCKFILES: - return "default" - m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile) - if not m: - utils.err("error: couldn't guess the name of the DB for " + sockfile) - return None - return m.group(1) + """Returns the name of the DB based on the path to the socket file.""" + if sockfile in DEFAULT_SOCKFILES: + return "default" + m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile) + if not m: + utils.err("error: couldn't guess the name of the DB for " + sockfile) + return None + return m.group(1) def find_sockfiles(): - """Returns a list of paths to socket files to monitor.""" - paths = [] - # Look for socket files. - for dir in SEARCH_DIRS: - if not os.path.isdir(dir) or not os.access(dir, os.R_OK): - continue - for name in os.listdir(dir): - subdir = os.path.join(dir, name) - if not os.path.isdir(subdir) or not os.access(subdir, os.R_OK): - continue - for subname in os.listdir(subdir): - path = os.path.join(subdir, subname) - if utils.is_sockfile(path): - paths.append(path) - break # We only expect 1 socket file per DB, so get out. - # Try the default locations. - for sockfile in DEFAULT_SOCKFILES: - if not utils.is_sockfile(sockfile): - continue - paths.append(sockfile) - return paths + """Returns a list of paths to socket files to monitor.""" + paths = [] + # Look for socket files. 
+ for dir in SEARCH_DIRS: + if not os.path.isdir(dir) or not os.access(dir, os.R_OK): + continue + for name in os.listdir(dir): + subdir = os.path.join(dir, name) + if not os.path.isdir(subdir) or not os.access(subdir, os.R_OK): + continue + for subname in os.listdir(subdir): + path = os.path.join(subdir, subname) + if utils.is_sockfile(path): + paths.append(path) + break # We only expect 1 socket file per DB, so get out. + # Try the default locations. + for sockfile in DEFAULT_SOCKFILES: + if not utils.is_sockfile(sockfile): + continue + paths.append(sockfile) + return paths def find_databases(dbs=None): - """Returns a map of dbname (string) to DB instances to monitor. - - Args: - dbs: A map of dbname (string) to DB instances already monitored. - This map will be modified in place if it's not None. - """ - sockfiles = find_sockfiles() - if dbs is None: - dbs = {} - for sockfile in sockfiles: - dbname = get_dbname(sockfile) - if dbname in dbs: - continue - if not dbname: - continue - try: - db = mysql_connect(sockfile) - cursor = db.cursor() - cursor.execute("SELECT VERSION()") - except (EnvironmentError, EOFError, RuntimeError, socket.error, - MySQLdb.MySQLError) as e: - utils.err("Couldn't connect to %s: %s" % (sockfile, e)) - continue - version = cursor.fetchone()[0] - dbs[dbname] = DB(sockfile, dbname, db, cursor, version) - return dbs + """Returns a map of dbname (string) to DB instances to monitor. + + Args: + dbs: A map of dbname (string) to DB instances already monitored. + This map will be modified in place if it's not None. + """ + sockfiles = find_sockfiles() + if dbs is None: + dbs = {} + for sockfile in sockfiles: + dbname = get_dbname(sockfile) + if dbname in dbs: + continue + if not dbname: + continue + try: + db = mysql_connect(sockfile) + cursor = db.cursor() + cursor.execute("SELECT VERSION()") + except (EnvironmentError, EOFError, RuntimeError, socket.error, + MySQLdb.MySQLError) as e: + utils.err("Couldn't connect to %s: %s" % (sockfile, e)) + continue + version = cursor.fetchone()[0] + dbs[dbname] = DB(sockfile, dbname, db, cursor, version) + return dbs def now(): - return int(time.time()) + return int(time.time()) def isyes(s): - if s.lower() == "yes": - return 1 - return 0 + if s.lower() == "yes": + return 1 + return 0 def collectInnodbStatus(db): - """Collects and prints InnoDB stats about the given DB instance.""" - ts = now() - def printmetric(metric, value, tags=""): - print("mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)) - - innodb_status = db.query("SHOW ENGINE INNODB STATUS")[0][2] - m = re.search("^(\d{6}\s+\d{1,2}:\d\d:\d\d) INNODB MONITOR OUTPUT$", - innodb_status, re.M) - if m: # If we have it, try to use InnoDB's own timestamp. 
- ts = int(time.mktime(time.strptime(m.group(1), "%y%m%d %H:%M:%S"))) - - line = None - def match(regexp): - return re.match(regexp, line) - - for line in innodb_status.split("\n"): - # SEMAPHORES - m = match("OS WAIT ARRAY INFO: reservation count (\d+), signal count (\d+)") - if m: - printmetric("innodb.oswait_array.reservation_count", m.group(1)) - printmetric("innodb.oswait_array.signal_count", m.group(2)) - continue - m = match("Mutex spin waits (\d+), rounds (\d+), OS waits (\d+)") - if m: - printmetric("innodb.locks.spin_waits", m.group(1), " type=mutex") - printmetric("innodb.locks.rounds", m.group(2), " type=mutex") - printmetric("innodb.locks.os_waits", m.group(3), " type=mutex") - continue - m = match("RW-shared spins (\d+), OS waits (\d+);" - " RW-excl spins (\d+), OS waits (\d+)") - if m: - printmetric("innodb.locks.spin_waits", m.group(1), " type=rw-shared") - printmetric("innodb.locks.os_waits", m.group(2), " type=rw-shared") - printmetric("innodb.locks.spin_waits", m.group(3), " type=rw-exclusive") - printmetric("innodb.locks.os_waits", m.group(4), " type=rw-exclusive") - continue - # GG 20141015 - RW-shared and RW-excl got separate lines and rounds in 5.5+ - m = match("RW-shared spins (\d+), rounds (\d+), OS waits (\d+)") - if m: - printmetric("locks.spin_waits", m.group(1), " type=rw-shared") - printmetric("locks.rounds", m.group(2), " type=rw-shared") - printmetric("locks.os_waits", m.group(3), " type=rw-shared") - continue - m = match("RW-excl spins (\d+), rounds (\d+), OS waits (\d+)") - if m: - printmetric("locks.spin_waits", m.group(1), " type=rw-exclusive") - printmetric("locks.rounds", m.group(2), " type=rw-exclusive") - printmetric("locks.os_waits", m.group(3), " type=rw-exclusive") - continue - # INSERT BUFFER AND ADAPTIVE HASH INDEX - # TODO(tsuna): According to the code in ibuf0ibuf.c, this line and - # the following one can appear multiple times. I've never seen this. - # If that happens, we need to aggregate the values here instead of - # printing them directly. - m = match("Ibuf: size (\d+), free list len (\d+), seg size (\d+),") - if m: - printmetric("innodb.ibuf.size", m.group(1)) - printmetric("innodb.ibuf.free_list_len", m.group(2)) - printmetric("innodb.ibuf.seg_size", m.group(3)) - continue - m = match("(\d+) inserts, (\d+) merged recs, (\d+) merges") - if m: - printmetric("innodb.ibuf.inserts", m.group(1)) - printmetric("innodb.ibuf.merged_recs", m.group(2)) - printmetric("innodb.ibuf.merges", m.group(3)) - continue - # ROW OPERATIONS - m = match("\d+ queries inside InnoDB, (\d+) queries in queue") - if m: - printmetric("innodb.queries_queued", m.group(1)) - continue - m = match("(\d+) read views open inside InnoDB") - if m: - printmetric("innodb.opened_read_views", m.group(1)) - continue - # TRANSACTION - m = match("History list length (\d+)") - if m: - printmetric("innodb.history_list_length", m.group(1)) - continue + """Collects and prints InnoDB stats about the given DB instance.""" + ts = now() + + def printmetric(metric, value, tags=""): + print("mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)) + + innodb_status = db.query("SHOW ENGINE INNODB STATUS")[0][2] + m = re.search("^(\d{6}\s+\d{1,2}:\d\d:\d\d) INNODB MONITOR OUTPUT$", + innodb_status, re.M) + if m: # If we have it, try to use InnoDB's own timestamp. 
+ ts = int(time.mktime(time.strptime(m.group(1), "%y%m%d %H:%M:%S"))) + + line = None + + def match(regexp): + return re.match(regexp, line) + + for line in innodb_status.split("\n"): + # SEMAPHORES + m = match("OS WAIT ARRAY INFO: reservation count (\d+), signal count (\d+)") + if m: + printmetric("innodb.oswait_array.reservation_count", m.group(1)) + printmetric("innodb.oswait_array.signal_count", m.group(2)) + continue + m = match("Mutex spin waits (\d+), rounds (\d+), OS waits (\d+)") + if m: + printmetric("innodb.locks.spin_waits", m.group(1), " type=mutex") + printmetric("innodb.locks.rounds", m.group(2), " type=mutex") + printmetric("innodb.locks.os_waits", m.group(3), " type=mutex") + continue + m = match("RW-shared spins (\d+), OS waits (\d+);" + " RW-excl spins (\d+), OS waits (\d+)") + if m: + printmetric("innodb.locks.spin_waits", m.group(1), " type=rw-shared") + printmetric("innodb.locks.os_waits", m.group(2), " type=rw-shared") + printmetric("innodb.locks.spin_waits", m.group(3), " type=rw-exclusive") + printmetric("innodb.locks.os_waits", m.group(4), " type=rw-exclusive") + continue + # GG 20141015 - RW-shared and RW-excl got separate lines and rounds in 5.5+ + m = match("RW-shared spins (\d+), rounds (\d+), OS waits (\d+)") + if m: + printmetric("locks.spin_waits", m.group(1), " type=rw-shared") + printmetric("locks.rounds", m.group(2), " type=rw-shared") + printmetric("locks.os_waits", m.group(3), " type=rw-shared") + continue + m = match("RW-excl spins (\d+), rounds (\d+), OS waits (\d+)") + if m: + printmetric("locks.spin_waits", m.group(1), " type=rw-exclusive") + printmetric("locks.rounds", m.group(2), " type=rw-exclusive") + printmetric("locks.os_waits", m.group(3), " type=rw-exclusive") + continue + # INSERT BUFFER AND ADAPTIVE HASH INDEX + # TODO(tsuna): According to the code in ibuf0ibuf.c, this line and + # the following one can appear multiple times. I've never seen this. + # If that happens, we need to aggregate the values here instead of + # printing them directly. + m = match("Ibuf: size (\d+), free list len (\d+), seg size (\d+),") + if m: + printmetric("innodb.ibuf.size", m.group(1)) + printmetric("innodb.ibuf.free_list_len", m.group(2)) + printmetric("innodb.ibuf.seg_size", m.group(3)) + continue + m = match("(\d+) inserts, (\d+) merged recs, (\d+) merges") + if m: + printmetric("innodb.ibuf.inserts", m.group(1)) + printmetric("innodb.ibuf.merged_recs", m.group(2)) + printmetric("innodb.ibuf.merges", m.group(3)) + continue + # ROW OPERATIONS + m = match("\d+ queries inside InnoDB, (\d+) queries in queue") + if m: + printmetric("innodb.queries_queued", m.group(1)) + continue + m = match("(\d+) read views open inside InnoDB") + if m: + printmetric("innodb.opened_read_views", m.group(1)) + continue + # TRANSACTION + m = match("History list length (\d+)") + if m: + printmetric("innodb.history_list_length", m.group(1)) + continue def collect(db): - """Collects and prints stats about the given DB instance.""" - - ts = now() - def printmetric(metric, value, tags=""): - print("mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)) - - has_innodb = False - if db.isShowGlobalStatusSafe(): - for metric, value in db.query("SHOW GLOBAL STATUS"): - try: - if "." 
in value: - value = float(value) - else: - value = int(value) - except ValueError: - continue - metric = metric.lower() - has_innodb = has_innodb or metric.startswith("innodb") - printmetric(metric, value) - - if has_innodb: - collectInnodbStatus(db) - - if has_innodb and False: # Disabled because it's too expensive for InnoDB. - waits = {} # maps a mutex name to the number of waits + """Collects and prints stats about the given DB instance.""" + ts = now() - for engine, mutex, status in db.query("SHOW ENGINE INNODB MUTEX"): - if not status.startswith("os_waits"): - continue - m = re.search("&(\w+)(?:->(\w+))?$", mutex) - if not m: - continue - mutex, kind = m.groups() - if kind: - mutex += "." + kind - wait_count = int(status.split("=", 1)[1]) - waits[mutex] = waits.get(mutex, 0) + wait_count - for mutex, wait_count in waits.items(): - printmetric("innodb.locks", wait_count, " mutex=" + mutex) - - ts = now() - - mysql_slave_status = db.query("SHOW SLAVE STATUS") - if mysql_slave_status: - slave_status = todict(db, mysql_slave_status[0]) - master_host = slave_status["master_host"] - else: - master_host = None - - if master_host and master_host != "None": - sbm = slave_status.get("seconds_behind_master") - if isinstance(sbm, INTEGER_TYPES): - printmetric("slave.seconds_behind_master", sbm) - printmetric("slave.bytes_executed", slave_status["exec_master_log_pos"]) - printmetric("slave.bytes_relayed", slave_status["read_master_log_pos"]) - printmetric("slave.thread_io_running", - isyes(slave_status["slave_io_running"])) - printmetric("slave.thread_sql_running", - isyes(slave_status["slave_sql_running"])) - - states = {} # maps a connection state to number of connections in that state - for row in db.query("SHOW PROCESSLIST"): - id, user, host, db_, cmd, time, state = row[:7] - states[cmd] = states.get(cmd, 0) + 1 - for state, count in states.items(): - state = state.lower().replace(" ", "_") - printmetric("connection_states", count, " state=%s" % state) + def printmetric(metric, value, tags=""): + print("mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)) + + has_innodb = False + if db.isShowGlobalStatusSafe(): + for metric, value in db.query("SHOW GLOBAL STATUS"): + try: + if "." in value: + value = float(value) + else: + value = int(value) + except ValueError: + continue + metric = metric.lower() + has_innodb = has_innodb or metric.startswith("innodb") + printmetric(metric, value) + + if has_innodb: + collectInnodbStatus(db) + + if has_innodb and False: # Disabled because it's too expensive for InnoDB. + waits = {} # maps a mutex name to the number of waits + ts = now() + for engine, mutex, status in db.query("SHOW ENGINE INNODB MUTEX"): + if not status.startswith("os_waits"): + continue + m = re.search("&(\w+)(?:->(\w+))?$", mutex) + if not m: + continue + mutex, kind = m.groups() + if kind: + mutex += "." + kind + wait_count = int(status.split("=", 1)[1]) + waits[mutex] = waits.get(mutex, 0) + wait_count + for mutex, wait_count in waits.items(): + printmetric("innodb.locks", wait_count, " mutex=" + mutex) -def main(args): - """Collects and dumps stats from a MySQL server.""" - if not find_sockfiles(): # Nothing to monitor. - return 13 # Ask tcollector to not respawn us. 
- if MySQLdb is None: - utils.err("error: Python module `MySQLdb' is missing") - return 1 - - last_db_refresh = now() - dbs = find_databases() - while True: ts = now() - if ts - last_db_refresh >= DB_REFRESH_INTERVAL: - find_databases(dbs) - last_db_refresh = ts - - errs = [] - for dbname, db in dbs.items(): - try: - collect(db) - except (EnvironmentError, EOFError, RuntimeError, socket.error, - MySQLdb.MySQLError) as e: - if isinstance(e, IOError) and e[0] == errno.EPIPE: - # Exit on a broken pipe. There's no point in continuing - # because no one will read our stdout anyway. - return 2 - utils.err("error: failed to collect data from %s: %s" % (db, e)) - errs.append(dbname) - - for dbname in errs: - del dbs[dbname] - - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) + + mysql_slave_status = db.query("SHOW SLAVE STATUS") + if mysql_slave_status: + slave_status = todict(db, mysql_slave_status[0]) + master_host = slave_status["master_host"] + else: + master_host = None + + if master_host and master_host != "None": + sbm = slave_status.get("seconds_behind_master") + if isinstance(sbm, INTEGER_TYPES): + printmetric("slave.seconds_behind_master", sbm) + printmetric("slave.bytes_executed", slave_status["exec_master_log_pos"]) + printmetric("slave.bytes_relayed", slave_status["read_master_log_pos"]) + printmetric("slave.thread_io_running", + isyes(slave_status["slave_io_running"])) + printmetric("slave.thread_sql_running", + isyes(slave_status["slave_sql_running"])) + + states = {} # maps a connection state to number of connections in that state + for row in db.query("SHOW PROCESSLIST"): + id, user, host, db_, cmd, time, state = row[:7] + states[cmd] = states.get(cmd, 0) + 1 + for state, count in states.items(): + state = state.lower().replace(" ", "_") + printmetric("connection_states", count, " state=%s" % state) + + +def main(args): + """Collects and dumps stats from a MySQL server.""" + if not find_sockfiles(): # Nothing to monitor. + return 13 # ask tcollector to not respawn us. + if MySQLdb is None: + utils.err("error: Python module `MySQLdb' is missing") + return 1 + + last_db_refresh = now() + dbs = find_databases() + while True: + ts = now() + if ts - last_db_refresh >= DB_REFRESH_INTERVAL: + find_databases(dbs) + last_db_refresh = ts + + errs = [] + for dbname, db in dbs.items(): + try: + collect(db) + except (EnvironmentError, EOFError, RuntimeError, socket.error, + MySQLdb.MySQLError) as e: + if isinstance(e, IOError) and e[0] == errno.EPIPE: + # Exit on a broken pipe. There's no point in continuing + # because no one will read our stdout anyway. 
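A hedged note on the EPIPE check above (unchanged by this patch): indexing the exception as e[0] works on Python 2 but raises TypeError on Python 3, where exceptions are not subscriptable; the portable spelling would be:

    if isinstance(e, IOError) and e.errno == errno.EPIPE:   # rather than e[0] == errno.EPIPE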
+ return 2 + utils.err("error: failed to collect data from %s: %s" % (db, e)) + errs.append(dbname) + + for dbname in errs: + del dbs[dbname] + + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main(sys.argv)) + sys.stdin.close() + sys.exit(main(sys.argv)) diff --git a/collectors/0/netfilter.py b/collectors/0/netfilter.py index 1dced88c..ab5f860b 100755 --- a/collectors/0/netfilter.py +++ b/collectors/0/netfilter.py @@ -38,33 +38,34 @@ basedir = "/proc/sys/net/netfilter" + def main(): """netfilter main loop""" utils.drop_privileges() - if (os.path.isdir(basedir)): + if os.path.isdir(basedir): while True: ts = int(time.time()) - - for s in STATS: - try: - f = open(basedir + "/" + s, 'r') - value = f.readline().rstrip() - print("proc.sys.net.netfilter.%s %d %s" % (s, ts, value)) - f.close() + + for s in STATS: + try: + f = open(basedir + "/" + s, 'r') + value = f.readline().rstrip() + print("proc.sys.net.netfilter.%s %d %s" % (s, ts, value)) + f.close() except: - # brute'ish, but should keep the collector reasonably future - # proof if some of the stats disappear between kernel module - # versions - continue + # brute'ish, but should keep the collector reasonably future + # proof if some of the stats disappear between kernel module + # versions + continue sys.stdout.flush() time.sleep(interval) - else: - print ("%s does not exist - ip_conntrack probably missing") - sys.exit(13) # we signal tcollector to not run us - + else: + print("%s does not exist - ip_conntrack probably missing") + return 13 # we signal tcollector to not run us + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/netstat.py b/collectors/0/netstat.py index 2f3487e7..8a91217e 100755 --- a/collectors/0/netstat.py +++ b/collectors/0/netstat.py @@ -390,5 +390,6 @@ def parse_stats(stats, filename): sys.stdout.flush() time.sleep(interval) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/nfsstat.py b/collectors/0/nfsstat.py index 36a9c5d8..91c8ba8e 100755 --- a/collectors/0/nfsstat.py +++ b/collectors/0/nfsstat.py @@ -90,5 +90,6 @@ def main(): sys.stdout.flush() time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/0/ntpstat.py b/collectors/0/ntpstat.py index d07ed5e0..98111430 100755 --- a/collectors/0/ntpstat.py +++ b/collectors/0/ntpstat.py @@ -19,8 +19,6 @@ from __future__ import print_function -import os -import socket import subprocess import sys import time @@ -29,19 +27,20 @@ from collectors.lib import utils try: - from collectors.etc import ntpstat_conf + from collectors.etc import ntpstat_conf except ImportError: - ntpstat_conf = None + ntpstat_conf = None + +DEFAULT_COLLECTION_INTERVAL = 60 -DEFAULT_COLLECTION_INTERVAL=60 def main(): """ntpstats main loop""" - collection_interval=DEFAULT_COLLECTION_INTERVAL - if(ntpstat_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + if (ntpstat_conf): config = ntpstat_conf.get_config() - collection_interval=config['collection_interval'] + collection_interval = config['collection_interval'] utils.drop_privileges() @@ -52,7 +51,7 @@ def main(): except OSError as e: if e.errno == errno.ENOENT: # looks like ntpdc is not available, stop using this collector - sys.exit(13) # we signal tcollector to stop using this + return 13 # we signal tcollector to stop using this raise stdout, _ = ntp_proc.communicate() @@ -64,7 +63,7 @@ def main(): if len(fields) <= 0: continue if fields[0].startswith("*"): - offset=fields[8] + offset = 
fields[8] continue print("ntp.offset %d %s" % (ts, offset)) else: @@ -73,5 +72,6 @@ def main(): sys.stdout.flush() time.sleep(collection_interval) + if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/collectors/0/postgresql.py b/collectors/0/postgresql.py index ab003acc..c4824fc3 100755 --- a/collectors/0/postgresql.py +++ b/collectors/0/postgresql.py @@ -22,87 +22,89 @@ is set in postgresql.conf. """ +from collectors.lib import utils +from collectors.lib import postgresqlutils + import sys -import os import time import socket import errno -COLLECTION_INTERVAL = 15 # seconds +COLLECTION_INTERVAL = 15 # seconds -from collectors.lib import utils -from collectors.lib import postgresqlutils def collect(db): - """ - Collects and prints stats. - - Here we collect only general info, for full list of data for collection - see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html - """ - - try: - cursor = db.cursor() - - # general statics - cursor.execute("SELECT pg_stat_database.*, pg_database_size" - " (pg_database.datname) AS size FROM pg_database JOIN" - " pg_stat_database ON pg_database.datname =" - " pg_stat_database.datname WHERE pg_stat_database.datname" - " NOT IN ('template0', 'template1', 'postgres')") - ts = time.time() - stats = cursor.fetchall() - -# datid | datname | numbackends | xact_commit | xact_rollback | blks_read | blks_hit | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files | temp_bytes | deadlocks | blk_read_time | blk_write_time | stats_reset | size - result = {} - for stat in stats: - database = stat[1] - result[database] = stat - - for database in result: - for i in range(2,len(cursor.description)): - metric = cursor.description[i].name - value = result[database][i] - try: - if metric in ("stats_reset"): - continue - print("postgresql.%s %i %s database=%s" - % (metric, ts, value, database)) - except: - utils.err("got here") - continue - - # connections - cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity" - " GROUP BY pg_stat_activity.datname") - ts = time.time() - connections = cursor.fetchall() - - for database, connection in connections: - print("postgresql.connections %i %s database=%s" - % (ts, connection, database)) - - except (EnvironmentError, EOFError, RuntimeError, socket.error) as e: - if isinstance(e, IOError) and e[0] == errno.EPIPE: - # exit on a broken pipe. There is no point in continuing - # because no one will read our stdout anyway. - return 2 - utils.err("error: failed to collect data: %s" % e) + """ + Collects and prints stats. 
+ + Here we collect only general info, for full list of data for collection + see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html + """ + + try: + cursor = db.cursor() + + # general statics + cursor.execute("SELECT pg_stat_database.*, pg_database_size" + " (pg_database.datname) AS size FROM pg_database JOIN" + " pg_stat_database ON pg_database.datname =" + " pg_stat_database.datname WHERE pg_stat_database.datname" + " NOT IN ('template0', 'template1', 'postgres')") + ts = time.time() + stats = cursor.fetchall() + + # datid | datname | numbackends | xact_commit | xact_rollback | blks_read | blks_hit | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files | temp_bytes | deadlocks | blk_read_time | blk_write_time | stats_reset | size + result = {} + for stat in stats: + database = stat[1] + result[database] = stat + + for database in result: + for i in range(2, len(cursor.description)): + metric = cursor.description[i].name + value = result[database][i] + try: + if metric in ("stats_reset"): + continue + print("postgresql.%s %i %s database=%s" + % (metric, ts, value, database)) + except: + utils.err("got here") + continue + + # connections + cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity" + " GROUP BY pg_stat_activity.datname") + ts = time.time() + connections = cursor.fetchall() + + for database, connection in connections: + print("postgresql.connections %i %s database=%s" + % (ts, connection, database)) + + except (EnvironmentError, EOFError, RuntimeError, socket.error) as e: + if isinstance(e, IOError) and e[0] == errno.EPIPE: + # exit on a broken pipe. There is no point in continuing + # because no one will read our stdout anyway. + return 2 + utils.err("error: failed to collect data: %s" % e) + def main(args): - """Collects and dumps stats from a PostgreSQL server.""" + """Collects and dumps stats from a PostgreSQL server.""" + + try: + db = postgresqlutils.connect() + except (Exception) as e: + utils.err("error: Could not initialize collector : %s" % (e)) + return 13 # Ask tcollector to not respawn us - try: - db = postgresqlutils.connect() - except (Exception) as e: - utils.err("error: Could not initialize collector : %s" % (e)) - return 13 # Ask tcollector to not respawn us + while True: + collect(db) + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) - while True: - collect(db) - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main(sys.argv)) \ No newline at end of file + sys.stdin.close() + sys.exit(main(sys.argv)) diff --git a/collectors/0/postgresql_replication.py b/collectors/0/postgresql_replication.py index d5e99221..274771ff 100755 --- a/collectors/0/postgresql_replication.py +++ b/collectors/0/postgresql_replication.py @@ -22,96 +22,98 @@ is set in postgresql.conf. """ +from collectors.lib import utils +from collectors.lib import postgresqlutils + import sys -import os import time import socket import errno import re import subprocess -COLLECTION_INTERVAL = 5 # seconds +COLLECTION_INTERVAL = 5 # seconds -from collectors.lib import utils -from collectors.lib import postgresqlutils def collect(db): - """ - Collects and prints replication statistics. 
- """ - - try: - cursor = db.cursor() - - # Replication lag time (could be slave only or a master / slave combo) - cursor.execute("SELECT " - "CASE WHEN pg_is_in_recovery() THEN (EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) * 1000)::BIGINT ELSE NULL END AS replication_lag_time, " - "pg_xlog_location_diff(pg_last_xlog_receive_location(), pg_last_xlog_replay_location()) AS replication_lag_bytes, " - "pg_is_in_recovery() AS in_recovery;") - ts = time.time() - stats = cursor.fetchall() - - if (stats[0][0] is not None): - print("postgresql.replication.upstream.lag.time %i %s" - % (ts, stats[0][0])) - - if (stats[0][1] is not None): - print("postgresql.replication.upstream.lag.bytes %i %s" - % (ts, stats[0][1])) - - print("postgresql.replication.recovering %i %i" - % (ts, stats[0][2])) - - # WAL receiver process running (could be slave only or master / slave combo) - ps_out = subprocess.check_output(["/bin/ps", "aux"] , stderr=subprocess.STDOUT) - ps_out = ps_out.split("\n") - ts = time.time() - - wal_receiver_running = 0 - for l in ps_out: - l = l.strip() - if (re.match (".*wal\\sreceiver.*", l)): - wal_receiver_running = 1; - break - - print("postgresql.replication.walreceiver.running %i %s" - % (ts, wal_receiver_running)) - - # WAL sender process info (could be master only or master / slave combo) - cursor.execute("SELECT client_addr, client_port, " - "pg_xlog_location_diff(sent_location, replay_location) AS lag_bytes " - "FROM pg_stat_replication;") - ts = time.time() - stats = cursor.fetchall() - - print("postgresql.replication.downstream.count %i %i" - % (ts, len(stats))) - - for stat in stats: - print("postgresql.replication.downstream.lag.bytes %i %i client_ip=%s client_port=%s" - % (ts, stat[2], stat[0], stat[1])) - - except (EnvironmentError, EOFError, RuntimeError, socket.error) as e: - if isinstance(e, IOError) and e[0] == errno.EPIPE: - # exit on a broken pipe. There is no point in continuing - # because no one will read our stdout anyway. - return 2 - utils.err("error: failed to collect data: %s" % e) + """ + Collects and prints replication statistics. 
+ """ + + try: + cursor = db.cursor() + + # Replication lag time (could be slave only or a master / slave combo) + cursor.execute("SELECT " + "CASE WHEN pg_is_in_recovery() THEN (EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) * 1000)::BIGINT ELSE NULL END AS replication_lag_time, " + "pg_xlog_location_diff(pg_last_xlog_receive_location(), pg_last_xlog_replay_location()) AS replication_lag_bytes, " + "pg_is_in_recovery() AS in_recovery;") + ts = time.time() + stats = cursor.fetchall() + + if (stats[0][0] is not None): + print("postgresql.replication.upstream.lag.time %i %s" + % (ts, stats[0][0])) + + if (stats[0][1] is not None): + print("postgresql.replication.upstream.lag.bytes %i %s" + % (ts, stats[0][1])) + + print("postgresql.replication.recovering %i %i" + % (ts, stats[0][2])) + + # WAL receiver process running (could be slave only or master / slave combo) + ps_out = subprocess.check_output(["/bin/ps", "aux"], stderr=subprocess.STDOUT) + ps_out = ps_out.split("\n") + ts = time.time() + + wal_receiver_running = 0 + for l in ps_out: + l = l.strip() + if (re.match(".*wal\\sreceiver.*", l)): + wal_receiver_running = 1; + break + + print("postgresql.replication.walreceiver.running %i %s" + % (ts, wal_receiver_running)) + + # WAL sender process info (could be master only or master / slave combo) + cursor.execute("SELECT client_addr, client_port, " + "pg_xlog_location_diff(sent_location, replay_location) AS lag_bytes " + "FROM pg_stat_replication;") + ts = time.time() + stats = cursor.fetchall() + + print("postgresql.replication.downstream.count %i %i" + % (ts, len(stats))) + + for stat in stats: + print("postgresql.replication.downstream.lag.bytes %i %i client_ip=%s client_port=%s" + % (ts, stat[2], stat[0], stat[1])) + + except (EnvironmentError, EOFError, RuntimeError, socket.error) as e: + if isinstance(e, IOError) and e[0] == errno.EPIPE: + # exit on a broken pipe. There is no point in continuing + # because no one will read our stdout anyway. + return 2 + utils.err("error: failed to collect data: %s" % e) + def main(args): - """Collects and dumps stats from a PostgreSQL server.""" + """Collects and dumps stats from a PostgreSQL server.""" + + try: + db = postgresqlutils.connect() + except (Exception) as e: + utils.err("error: Could not initialize collector : %s" % (e)) + return 13 # ask tcollector to not respawn us - try: - db = postgresqlutils.connect() - except (Exception) as e: - utils.err("error: Could not initialize collector : %s" % (e)) - return 13 # Ask tcollector to not respawn us + while True: + collect(db) + sys.stdout.flush() + time.sleep(COLLECTION_INTERVAL) - while True: - collect(db) - sys.stdout.flush() - time.sleep(COLLECTION_INTERVAL) if __name__ == "__main__": - sys.stdin.close() - sys.exit(main(sys.argv)) + sys.stdin.close() + sys.exit(main(sys.argv)) diff --git a/collectors/0/procnettcp.py b/collectors/0/procnettcp.py index 6b7dac9b..33044bfb 100755 --- a/collectors/0/procnettcp.py +++ b/collectors/0/procnettcp.py @@ -174,7 +174,7 @@ def main(unused_args): raise except IOError as e: print("Failed to open input file: %s" % (e,), file=sys.stderr) - return 13 # Ask tcollector to not re-start us immediately. + return 13 # ask tcollector to not re-start us immediately. 
utils.drop_privileges() while True: @@ -233,5 +233,6 @@ def main(unused_args): sys.stdout.flush() time.sleep(interval) + if __name__ == "__main__": sys.exit(main(sys.argv)) diff --git a/collectors/0/procstats.py b/collectors/0/procstats.py index 0dd8e042..39991999 100755 --- a/collectors/0/procstats.py +++ b/collectors/0/procstats.py @@ -23,8 +23,8 @@ from collectors.lib import utils -INTERRUPTS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds -SOFTIRQS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds +INTERRUPTS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds +SOFTIRQS_INTVL_MULT = 4 # query softirqs every SOFTIRQS_INT_MULT * COLLECTION_INTERVAL seconds # Modern Linux: CPUSET_PATH = "/sys/fs/cgroup/cpuset" if os.path.isdir("/dev/cpuset"): @@ -40,7 +40,7 @@ def find_sysfs_numa_stats(): nodes = os.listdir(NUMADIR) except OSError as exc: if exc.errno == 2: # No such file or directory - return [] # We don't have NUMA stats. + return [] # We don't have NUMA stats. raise nodes = [node for node in nodes if node.startswith("node")] @@ -59,34 +59,35 @@ def print_numa_stats(numafiles): """From a list of files names, opens file, extracts and prints NUMA stats.""" for numafilename in numafiles: numafile = open(numafilename) - node_id = int(numafile.name[numafile.name.find("/node/node")+10:-9]) + node_id = int(numafile.name[numafile.name.find("/node/node") + 10:-9]) ts = int(time.time()) stats = dict(line.split() for line in numafile.read().splitlines()) - for stat, tag in (# hit: process wanted memory from this node and got it - ("numa_hit", "hit"), - # miss: process wanted another node and got it from - # this one instead. - ("numa_miss", "miss")): + for stat, tag in ( # hit: process wanted memory from this node and got it + ("numa_hit", "hit"), + # miss: process wanted another node and got it from + # this one instead. + ("numa_miss", "miss")): print("sys.numa.zoneallocs %d %s node=%d type=%s" - % (ts, stats[stat], node_id, tag)) + % (ts, stats[stat], node_id, tag)) # Count this one as a separate metric because we can't sum up hit + # miss + foreign, this would result in double-counting of all misses. # See `zone_statistics' in the code of the kernel. # foreign: process wanted memory from this node but got it from # another node. So maybe this node is out of free pages. print("sys.numa.foreign_allocs %d %s node=%d" - % (ts, stats["numa_foreign"], node_id)) + % (ts, stats["numa_foreign"], node_id)) # When is memory allocated to a node that's local or remote to where # the process is running. for stat, tag in (("local_node", "local"), ("other_node", "remote")): print("sys.numa.allocation %d %s node=%d type=%s" - % (ts, stats[stat], node_id, tag)) + % (ts, stats[stat], node_id, tag)) # Pages successfully allocated with the interleave policy. print("sys.numa.interleave %d %s node=%d type=hit" - % (ts, stats["interleave_hit"], node_id)) + % (ts, stats["interleave_hit"], node_id)) numafile.close() + def expand_numlist(s): """return a list of numbers from a list with ranges, e.g. 
4,5-10,14-16""" @@ -95,51 +96,53 @@ def expand_numlist(s): if '-' not in i: r.append(int(i)) else: - l,h = map(int, i.split('-')) - r+= range(l,h+1) + l, h = map(int, i.split('-')) + r += range(l, h + 1) return r + def cpus_csets(cpuset_path): """Return a hash of cpu_id_as_string->cset_name""" try: csets = os.listdir(cpuset_path) except OSError as e: - if e.errno == errno.ENOENT: # No such file or directory - return {} # We don't have csets + if e.errno == errno.ENOENT: # No such file or directory + return {} # We don't have csets raise csets = [cset for cset in csets if os.path.isdir(os.path.join(cpuset_path, cset))] cpu2cset = {} for cset in csets: - cpuspath = os.path.join(cpuset_path, cset, 'cpuset.cpus') - if not os.path.isfile(cpuspath): - cpuspath = os.path.join(cpuset_path, cset, 'cpus') - if not os.path.isfile(cpuspath): - # No such file?? Ignore csets - sys.stderr.write("No 'cpuset.cpus' or 'cpus' file in %s!" % os.path.join(cpuset_path, cset)) - continue - - try: - f_cpus = open(cpuspath) - except: - # Ignore that one and continue - sys.stderr.write("Could not open %s!" % cpuspath) - continue - - format_errors = 0 - for line in f_cpus: - m = re.match('^[-0-9,]+$', line) - if m: - for c in expand_numlist(line): - cpu2cset[str(c)] = cset - else: - format_errors += 1 - if format_errors > 0: - sys.stderr.write("%d line(s) of %s were not in the expected format" % (format_errors, cpuspath)) + cpuspath = os.path.join(cpuset_path, cset, 'cpuset.cpus') + if not os.path.isfile(cpuspath): + cpuspath = os.path.join(cpuset_path, cset, 'cpus') + if not os.path.isfile(cpuspath): + # No such file?? Ignore csets + sys.stderr.write("No 'cpuset.cpus' or 'cpus' file in %s!" % os.path.join(cpuset_path, cset)) + continue + + try: + f_cpus = open(cpuspath) + except: + # Ignore that one and continue + sys.stderr.write("Could not open %s!" 
% cpuspath) + continue + + format_errors = 0 + for line in f_cpus: + m = re.match('^[-0-9,]+$', line) + if m: + for c in expand_numlist(line): + cpu2cset[str(c)] = cset + else: + format_errors += 1 + if format_errors > 0: + sys.stderr.write("%d line(s) of %s were not in the expected format" % (format_errors, cpuspath)) return cpu2cset + def main(): """procstats main loop""" @@ -152,19 +155,19 @@ def main(): f_interrupts = open("/proc/interrupts", "r") f_scaling = "/sys/devices/system/cpu/cpu%s/cpufreq/%s_freq" - f_scaling_min = dict([]) - f_scaling_max = dict([]) - f_scaling_cur = dict([]) + f_scaling_min = dict([]) + f_scaling_max = dict([]) + f_scaling_cur = dict([]) f_softirqs = open("/proc/softirqs", "r") for cpu in glob.glob("/sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_cur_freq"): m = re.match("/sys/devices/system/cpu/cpu([0-9]*)/cpufreq/scaling_cur_freq", cpu) if not m: continue cpu_no = m.group(1) - sys.stderr.write(f_scaling % (cpu_no,"min")) - f_scaling_min[cpu_no] = open(f_scaling % (cpu_no,"cpuinfo_min"), "r") - f_scaling_max[cpu_no] = open(f_scaling % (cpu_no,"cpuinfo_max"), "r") - f_scaling_cur[cpu_no] = open(f_scaling % (cpu_no,"scaling_cur"), "r") + sys.stderr.write(f_scaling % (cpu_no, "min")) + f_scaling_min[cpu_no] = open(f_scaling % (cpu_no, "cpuinfo_min"), "r") + f_scaling_max[cpu_no] = open(f_scaling % (cpu_no, "cpuinfo_max"), "r") + f_scaling_cur[cpu_no] = open(f_scaling % (cpu_no, "scaling_cur"), "r") numastats = find_sysfs_numa_stats() utils.drop_privileges() @@ -194,7 +197,7 @@ def main(): value = m.group(2) name = re.sub("\W", "_", m.group(1)).lower().strip("_") print("proc.meminfo.%s %d %s" - % (name, ts, value)) + % (name, ts, value)) # proc.vmstat f_vmstat.seek(0) @@ -229,15 +232,15 @@ def main(): tags = '' fields = m.group(2).split() cpu_types = ['user', 'nice', 'system', 'idle', 'iowait', - 'irq', 'softirq', 'guest', 'guest_nice'] + 'irq', 'softirq', 'guest', 'guest_nice'] # We use zip to ignore fields that don't exist. 
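            # (Editorial example, not part of the patch: zip() pairs each value with a name
            #  and stops at the shorter sequence, e.g.
            #  list(zip(["100", "7"], ["user", "nice", "system"])) == [("100", "user"), ("7", "nice")],
            #  so a cpu line with fewer fields simply emits fewer datapoints.)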
for value, field_name in zip(fields, cpu_types): print("proc.stat.cpu%s %d %s type=%s%s" % (metric_percpu, - ts, value, field_name, tags)) + ts, value, field_name, tags)) elif m.group(1) == "intr": print(("proc.stat.intr %d %s" - % (ts, m.group(2).split()[0]))) + % (ts, m.group(2).split()[0]))) elif m.group(1) == "ctxt": print("proc.stat.ctxt %d %s" % (ts, m.group(2))) elif m.group(1) == "processes": @@ -335,7 +338,7 @@ def print_interrupts(f_interrupts): interrupt_dict[k] = int(val) for k in interrupt_dict: - print ("proc.interrupts %s %d %s" % (ts, interrupt_dict[k], k)) + print("proc.interrupts %s %d %s" % (ts, interrupt_dict[k], k)) def print_irqs(f_softirqs): @@ -356,12 +359,11 @@ def print_irqs(f_softirqs): if not val.isdigit(): # something is weird, there should only be digit values sys.stderr.write("Unexpected softirq value %r in" - " %r: " % (val, cols)) + " %r: " % (val, cols)) break - print ("proc.softirqs %s %s type=%s cpu=%s" - % (ts, val, irq_type, i)) + print("proc.softirqs %s %s type=%s cpu=%s" + % (ts, val, irq_type, i)) if __name__ == "__main__": - main() - + sys.exit(main()) diff --git a/collectors/0/prometheus.py b/collectors/0/prometheus.py index 3b6bb81f..078247c0 100755 --- a/collectors/0/prometheus.py +++ b/collectors/0/prometheus.py @@ -15,19 +15,22 @@ import sys import time +from http.client import HTTPConnection +import json -import schedule -from prometheus_client.parser import text_string_to_metric_families +from collectors.lib import utils try: - import json + import schedule except ImportError: - json = None + utils.err("schedule library is not installed") + sys.exit(13) # ask tcollector to not re-start us try: - from http.client import HTTPConnection + from prometheus_client.parser import text_string_to_metric_families except ImportError: - from httplib import HTTPConnection + utils.err("prometheus_client.parser is not installed") + sys.exit(13) # ask tcollector to not re-start us try: from collectors.etc import prometheus_conf @@ -41,6 +44,7 @@ BASE_LABELS = "" SETTINGS = {} + class PrometheusCollector(object): def __init__(self, service, daemon, host, port, uri="/metrics"): self.service = service diff --git a/collectors/0/pxc-collector.py b/collectors/0/pxc-collector.py index 4a77c0fc..451a899b 100755 --- a/collectors/0/pxc-collector.py +++ b/collectors/0/pxc-collector.py @@ -22,66 +22,73 @@ ATTENTION: Only tested on Debian/Ubuntu systems. """ -import MySQLdb as mysql # pylint: disable=import-error +import MySQLdb as mysql # pylint: disable=import-error import time import sys import os.path -from collectors.etc import pxcconf + from collectors.lib import utils +from collectors.etc import pxcconf -__author__ = "Kai Laufer" -__version__ = "1.0.1" -__email__ = "mail@kai-laufer.de" +__author__ = "Kai Laufer" +__version__ = "1.0.1" +__email__ = "mail@kai-laufer.de" """ You can find these functions and additional information in etc/pxcconf.py """ -prefix = pxcconf.getPrefix() or "pxc" # Prefix for the collector, e.g. pxc -> pxc.wsrep_replicated_bytes -interval = pxcconf.getInterval() or 1 # Interval for checking MySQL statistics -galeraFile = pxcconf.getGaleraFile() or "/usr/lib/libgalera_smm.so" # Path to a galera specific file for ensuring that check won't run with a usual MySQL server. 
Default: "/usr/lib/libgalera_smm.so" -login = pxcconf.getUserPassword() # MySQL-User, MySQL-Password and MySQL-Host (localhost) -myMap = pxcconf.getKeyMap() or ( "wsrep_last_committed", "wsrep_replicated", "wsrep_repl_keys", "wsrep_local_commits" ) # Status variables which should be read -mysqlUser = login[0] or "root" +prefix = pxcconf.getPrefix() or "pxc" # Prefix for the collector, e.g. pxc -> pxc.wsrep_replicated_bytes +interval = pxcconf.getInterval() or 1 # Interval for checking MySQL statistics +galeraFile = pxcconf.getGaleraFile() or "/usr/lib/libgalera_smm.so" # Path to a galera specific file for ensuring that check won't run with a usual MySQL server. Default: "/usr/lib/libgalera_smm.so" +login = pxcconf.getUserPassword() # MySQL-User, MySQL-Password and MySQL-Host (localhost) +myMap = pxcconf.getKeyMap() or ("wsrep_last_committed", "wsrep_replicated", "wsrep_repl_keys", + "wsrep_local_commits") # Status variables which should be read +mysqlUser = login[0] or "root" mysqlPasswd = login[1] or "" -mysqlHost = login[2] or "localhost" +mysqlHost = login[2] or "localhost" + def getRow(): - """ Test connection """ - try: - db = mysql.connect(host=mysqlHost, user=mysqlUser, passwd=mysqlPasswd) - cursor = db.cursor() - cursor.execute("SHOW STATUS LIKE '%wsrep%'") - result = cursor.fetchall() + """ Test connection """ + try: + db = mysql.connect(host=mysqlHost, user=mysqlUser, passwd=mysqlPasswd) + cursor = db.cursor() + cursor.execute("SHOW STATUS LIKE '%wsrep%'") + result = cursor.fetchall() - except: - print("Error: unable to fetch data - Check your configuration!") - sys.exit(13) # Don't respawn collector + except: + utils.err("Error: unable to fetch data - Check your configuration!") + sys.exit(13) # # ask tcollector to not respawn us + + db.close() + return result - db.close() - return result class TSDResult(object): - """ Create TSD output """ - def __init__(self, key, value, prefix, timestamp): - self.key = key - self.value = value - self.prefix = prefix - self.timestamp = timestamp + """ Create TSD output """ + + def __init__(self, key, value, prefix, timestamp): + self.key = key + self.value = value + self.prefix = prefix + self.timestamp = timestamp + + def TSDRow(self): + return "%s.%s %s %s" % (self.prefix, self.key, self.timestamp, self.value) - def TSDRow(self): - return "%s.%s %s %s" % (self.prefix, self.key, self.timestamp, self.value) def main(): - if os.path.isfile(galeraFile) is True: - while True: - rows = getRow() - for row in rows: - timestamp = int(time.time()) - if row[0] in myMap: - result = TSDResult(row[0], row[1], prefix, timestamp) - print(result.TSDRow()) - time.sleep(interval) - return 0 - else: - return 2 + if os.path.isfile(galeraFile) is True: + while True: + rows = getRow() + for row in rows: + timestamp = int(time.time()) + if row[0] in myMap: + result = TSDResult(row[0], row[1], prefix, timestamp) + print(result.TSDRow()) + time.sleep(interval) + return 0 + else: + return 2 + if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/riak.py b/collectors/0/riak.py index 006914c7..ae43b8a1 100755 --- a/collectors/0/riak.py +++ b/collectors/0/riak.py @@ -44,14 +44,11 @@ import os import sys import time +from urllib.request import urlopen from collectors.etc import riak_conf from collectors.lib import utils -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen CONFIG = riak_conf.get_default_config() @@ -113,7 +110,7 @@ def main(): # don't run if we're not a riak node if not 
os.path.exists("/usr/lib/riak"): - sys.exit(13) + return 13 utils.drop_privileges() sys.stdin.close() diff --git a/collectors/0/smart_stats.py b/collectors/0/smart_stats.py index c268f5c7..3785367a 100755 --- a/collectors/0/smart_stats.py +++ b/collectors/0/smart_stats.py @@ -31,7 +31,7 @@ except ImportError: smart_stats_conf = None -DEFAULT_COLLECTION_INTERVAL=120 +DEFAULT_COLLECTION_INTERVAL = 120 TWCLI = "/usr/sbin/tw_cli" ARCCONF = "/usr/local/bin/arcconf" @@ -45,199 +45,201 @@ # Common smart attributes, add more to this list if you start seeing # numbers instead of attribute names in TSD results. ATTRIBUTE_MAP = { - "1": "raw_read_error_rate", - "2": "throughput_performance", - "3": "spin_up_time", - "4": "start_stop_count", - "5": "reallocated_sector_ct", - "7": "seek_error_rate", - "8": "seek_time_performance", - "9": "power_on_hours", - "10": "spin_retry_count", - "11": "recalibration_retries", - "12": "power_cycle_count", - "13": "soft_read_error_rate", - "175": "program_fail_count_chip", - "176": "erase_fail_count_chip", - "177": "wear_leveling_count", - "178": "used_rsvd_blk_cnt_chip", - "179": "used_rsvd_blk_cnt_tot", - "180": "unused_rsvd_blk_cnt_tot", - "181": "program_fail_cnt_total", - "182": "erase_fail_count_total", - "183": "runtime_bad_block", - "184": "end_to_end_error", - "187": "reported_uncorrect", - "188": "command_timeout", - "189": "high_fly_writes", - "190": "airflow_temperature_celsius", - "191": "g_sense_error_rate", - "192": "power-off_retract_count", - "193": "load_cycle_count", - "194": "temperature_celsius", - "195": "hardware_ecc_recovered", - "196": "reallocated_event_count", - "197": "current_pending_sector", - "198": "offline_uncorrectable", - "199": "udma_crc_error_count", - "200": "write_error_rate", - "233": "media_wearout_indicator", - "240": "transfer_error_rate", - "241": "total_lba_writes", - "242": "total_lba_read", - } + "1": "raw_read_error_rate", + "2": "throughput_performance", + "3": "spin_up_time", + "4": "start_stop_count", + "5": "reallocated_sector_ct", + "7": "seek_error_rate", + "8": "seek_time_performance", + "9": "power_on_hours", + "10": "spin_retry_count", + "11": "recalibration_retries", + "12": "power_cycle_count", + "13": "soft_read_error_rate", + "175": "program_fail_count_chip", + "176": "erase_fail_count_chip", + "177": "wear_leveling_count", + "178": "used_rsvd_blk_cnt_chip", + "179": "used_rsvd_blk_cnt_tot", + "180": "unused_rsvd_blk_cnt_tot", + "181": "program_fail_cnt_total", + "182": "erase_fail_count_total", + "183": "runtime_bad_block", + "184": "end_to_end_error", + "187": "reported_uncorrect", + "188": "command_timeout", + "189": "high_fly_writes", + "190": "airflow_temperature_celsius", + "191": "g_sense_error_rate", + "192": "power-off_retract_count", + "193": "load_cycle_count", + "194": "temperature_celsius", + "195": "hardware_ecc_recovered", + "196": "reallocated_event_count", + "197": "current_pending_sector", + "198": "offline_uncorrectable", + "199": "udma_crc_error_count", + "200": "write_error_rate", + "233": "media_wearout_indicator", + "240": "transfer_error_rate", + "241": "total_lba_writes", + "242": "total_lba_read", +} class Alarm(RuntimeError): - pass + pass def alarm_handler(signum, frame): - print("Program took too long to run, " - "consider increasing its timeout.", file=sys.stderr) - raise Alarm() + print("Program took too long to run, " + "consider increasing its timeout.", file=sys.stderr) + raise Alarm() def smart_is_broken(drives): - """Determines whether SMART can be used. 
+ """Determines whether SMART can be used. - Args: - drives: A list of device names on which we intend to use SMART. + Args: + drives: A list of device names on which we intend to use SMART. - Returns: - True if SMART is available, False otherwise. - """ - if os.path.exists(ARCCONF): - return is_adaptec_driver_broken() - if os.path.exists(TWCLI): - return is_3ware_driver_broken(drives) - return False + Returns: + True if SMART is available, False otherwise. + """ + if os.path.exists(ARCCONF): + return is_adaptec_driver_broken() + if os.path.exists(TWCLI): + return is_3ware_driver_broken(drives) + return False def is_adaptec_driver_broken(): - signal.alarm(COMMAND_TIMEOUT) - arcconf = subprocess.Popen("%s %s" % (ARCCONF, ARCCONF_ARGS), - shell=True, - stdout=subprocess.PIPE) - arcconf_output = arcconf.communicate()[0] - signal.alarm(0) - if arcconf.returncode != 0: - if arcconf_output and arcconf_output.startswith(NO_CONTROLLER): - # No controller => no problem. - return False - if arcconf.returncode == 127: - # arcconf doesn't even work on this system, so assume we're safe - return False - print("arcconf unexpected error %s" % arcconf.returncode, file=sys.stderr) - return True - for line in arcconf_output.split("\n"): - fields = [x for x in line.split(" ") if x] - if fields[0] == "Driver" and fields[2] in BROKEN_DRIVER_VERSIONS: - print("arcconf indicates broken driver version %s" - % fields[2], file=sys.stderr) - return True - return False - -def is_3ware_driver_broken(drives): - # Apparently 3ware controllers can't report SMART stats from SAS drives. WTF. - # See also http://sourceforge.net/apps/trac/smartmontools/ticket/161 - for i in reversed(range(len(drives))): - drive = drives[i] signal.alarm(COMMAND_TIMEOUT) - smart_ctl = subprocess.Popen(SMART_CTL + " -i /dev/" + drive, - shell=True, stdout=subprocess.PIPE) - smart_output = smart_ctl.communicate()[0] - if "supports SMART and is Disabled" in smart_output: - print("SMART is disabled for %s" % drive, file=sys.stderr) - del drives[i] # We're iterating from the end of the list so this is OK. + arcconf = subprocess.Popen("%s %s" % (ARCCONF, ARCCONF_ARGS), + shell=True, + stdout=subprocess.PIPE) + arcconf_output = arcconf.communicate()[0] signal.alarm(0) - if not drives: - print("None of the drives support SMART. Are they SAS drives?", file=sys.stderr) - return True - return False + if arcconf.returncode != 0: + if arcconf_output and arcconf_output.startswith(NO_CONTROLLER): + # No controller => no problem. + return False + if arcconf.returncode == 127: + # arcconf doesn't even work on this system, so assume we're safe + return False + print("arcconf unexpected error %s" % arcconf.returncode, file=sys.stderr) + return True + for line in arcconf_output.split("\n"): + fields = [x for x in line.split(" ") if x] + if fields[0] == "Driver" and fields[2] in BROKEN_DRIVER_VERSIONS: + print("arcconf indicates broken driver version %s" + % fields[2], file=sys.stderr) + return True + return False + + +def is_3ware_driver_broken(drives): + # Apparently 3ware controllers can't report SMART stats from SAS drives. WTF. 
+ # See also http://sourceforge.net/apps/trac/smartmontools/ticket/161 + for i in reversed(range(len(drives))): + drive = drives[i] + signal.alarm(COMMAND_TIMEOUT) + smart_ctl = subprocess.Popen(SMART_CTL + " -i /dev/" + drive, + shell=True, stdout=subprocess.PIPE) + smart_output = smart_ctl.communicate()[0] + if "supports SMART and is Disabled" in smart_output: + print("SMART is disabled for %s" % drive, file=sys.stderr) + del drives[i] # We're iterating from the end of the list so this is OK. + signal.alarm(0) + if not drives: + print("None of the drives support SMART. Are they SAS drives?", file=sys.stderr) + return True + return False def process_output(drive, smart_output): - """Print formatted SMART output for the drive""" - ts = int(time.time()) - smart_output = smart_output.split("\n") - # Set data_marker to 0, so we skip stuff until we see a line - # beginning with ID# in the output. Start processing rows after - # that point. - data_marker = False - is_seagate = False - - for line in smart_output: - if data_marker: - fields = line.split() - if len(fields) < 2: - continue - field = fields[0] - if len(fields) > 2 and field in ATTRIBUTE_MAP: - metric = ATTRIBUTE_MAP[field] - value = fields[9].split()[0] - print("smart.%s %d %s disk=%s" % (metric, ts, value, drive)) - if is_seagate and metric in ("seek_error_rate", "raw_read_error_rate"): - # It appears that some Seagate drives (and possibly some Western - # Digital ones too) use the first 16 bits to store error counts, - # and the low 32 bits to store operation counts, out of these 48 - # bit values. So try to be helpful and extract these here. - value = int(value) - print("smart.%s %d %d disk=%s" - % (metric.replace("error_rate", "count"), ts, - value & 0xFFFFFFFF, drive)) - print("smart.%s %d %d disk=%s" - % (metric.replace("error_rate", "errors"), ts, - (value & 0xFFFF00000000) >> 32, drive)) - elif line.startswith("ID#"): - data_marker = True - elif line.startswith("Device Model:"): - model = line.split(None, 2)[2] - # Rough approximation to detect Seagate drives. - is_seagate = model.startswith("ST") + """Print formatted SMART output for the drive""" + ts = int(time.time()) + smart_output = smart_output.split("\n") + # Set data_marker to 0, so we skip stuff until we see a line + # beginning with ID# in the output. Start processing rows after + # that point. + data_marker = False + is_seagate = False + + for line in smart_output: + if data_marker: + fields = line.split() + if len(fields) < 2: + continue + field = fields[0] + if len(fields) > 2 and field in ATTRIBUTE_MAP: + metric = ATTRIBUTE_MAP[field] + value = fields[9].split()[0] + print("smart.%s %d %s disk=%s" % (metric, ts, value, drive)) + if is_seagate and metric in ("seek_error_rate", "raw_read_error_rate"): + # It appears that some Seagate drives (and possibly some Western + # Digital ones too) use the first 16 bits to store error counts, + # and the low 32 bits to store operation counts, out of these 48 + # bit values. So try to be helpful and extract these here. + value = int(value) + print("smart.%s %d %d disk=%s" + % (metric.replace("error_rate", "count"), ts, + value & 0xFFFFFFFF, drive)) + print("smart.%s %d %d disk=%s" + % (metric.replace("error_rate", "errors"), ts, + (value & 0xFFFF00000000) >> 32, drive)) + elif line.startswith("ID#"): + data_marker = True + elif line.startswith("Device Model:"): + model = line.split(None, 2)[2] + # Rough approximation to detect Seagate drives. 
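            # (Editorial examples, not from the patch: Seagate model strings such as
            #  "ST3000DM001" begin with "ST".  For the 48-bit split above, an assumed raw
            #  value of 0x000300001A2B gives 6699 operations (value & 0xFFFFFFFF) and
            #  3 errors ((value & 0xFFFF00000000) >> 32).)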
+ is_seagate = model.startswith("ST") def main(): - """main loop for SMART collector""" - - collection_interval=DEFAULT_COLLECTION_INTERVAL - if(smart_stats_conf): - config = smart_stats_conf.get_config() - collection_interval=config['collection_interval'] - - # Get the list of block devices. - drives = [dev[5:] for dev in glob.glob("/dev/[hs]d[a-z]")] - # Try FreeBSD drives if no block devices found - if not drives: - drives = [dev[5:] for dev in glob.glob("/dev/da[0-9]")+glob.glob("/dev/da[0-9][0-9]")+glob.glob("/dev/ada[0-9]")+glob.glob("/dev/ada[0-9][0-9]")] - # Exit gracefully if no block devices found - if not drives: - sys.exit(13) - - - # to make sure we are done with smartctl in COMMAND_TIMEOUT seconds - signal.signal(signal.SIGALRM, alarm_handler) - - if smart_is_broken(drives): - sys.exit(13) - - while True: - for drive in drives: - signal.alarm(COMMAND_TIMEOUT) - smart_ctl = subprocess.Popen(SMART_CTL + " -i -A /dev/" + drive, - shell=True, stdout=subprocess.PIPE) - smart_output = smart_ctl.communicate()[0] - signal.alarm(0) - if smart_ctl.returncode != 0: - if smart_ctl.returncode == 127: - sys.exit(13) - else: - print("Command exited with: %d" % smart_ctl.returncode, file=sys.stderr) - process_output(drive, smart_output) - - sys.stdout.flush() - time.sleep(collection_interval) + """main loop for SMART collector""" + + collection_interval = DEFAULT_COLLECTION_INTERVAL + if smart_stats_conf: + config = smart_stats_conf.get_config() + collection_interval = config['collection_interval'] + + # Get the list of block devices. + drives = [dev[5:] for dev in glob.glob("/dev/[hs]d[a-z]")] + # Try FreeBSD drives if no block devices found + if not drives: + drives = [dev[5:] for dev in + glob.glob("/dev/da[0-9]") + glob.glob("/dev/da[0-9][0-9]") + glob.glob("/dev/ada[0-9]") + glob.glob( + "/dev/ada[0-9][0-9]")] + # Exit gracefully if no block devices found + if not drives: + return 13 + + # to make sure we are done with smartctl in COMMAND_TIMEOUT seconds + signal.signal(signal.SIGALRM, alarm_handler) + + if smart_is_broken(drives): + return 13 + + while True: + for drive in drives: + signal.alarm(COMMAND_TIMEOUT) + smart_ctl = subprocess.Popen(SMART_CTL + " -i -A /dev/" + drive, + shell=True, stdout=subprocess.PIPE) + smart_output = smart_ctl.communicate()[0] + signal.alarm(0) + if smart_ctl.returncode != 0: + if smart_ctl.returncode == 127: + return 13 + else: + print("Command exited with: %d" % smart_ctl.returncode, file=sys.stderr) + process_output(drive, smart_output) + + sys.stdout.flush() + time.sleep(collection_interval) if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/collectors/0/sysload.py b/collectors/0/sysload.py index 8e24959c..4c9328d2 100755 --- a/collectors/0/sysload.py +++ b/collectors/0/sysload.py @@ -13,7 +13,7 @@ # see . 
# -''' +""" CPU detailed statistics for TSDB This plugin tracks, for all CPUs: @@ -34,7 +34,7 @@ - memory statistics (bytes) (active, inact, wired, cache, buf, free) - arc statistics (bytes) (total, mru, mfu, anon, header, other) - swap statistics (bytes) (total, free, inuse, in/s, out/s) -''' +""" import errno import sys @@ -47,16 +47,15 @@ from collectors.lib import utils -PY3 = sys.version_info[0] > 2 -if PY3: - long = int +long = int try: from collectors.etc import sysload_conf except ImportError: sysload_conf = None -DEFAULT_COLLECTION_INTERVAL=15 +DEFAULT_COLLECTION_INTERVAL = 15 + def convert_to_bytes(string): """Take a string in the form 1234K, and convert to bytes""" @@ -75,17 +74,19 @@ def convert_to_bytes(string): return long(number) return long(string) + signal_received = None def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """top main loop""" - collection_interval=DEFAULT_COLLECTION_INTERVAL - collect_every_cpu=True - if(sysload_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + collect_every_cpu = True + if sysload_conf: config = sysload_conf.get_config() collection_interval=config['collection_interval'] collect_every_cpu=config['collect_every_cpu'] @@ -97,7 +98,7 @@ def main(): try: if platform.system() == "FreeBSD": - if(collect_every_cpu): + if collect_every_cpu: p_top = subprocess.Popen( ["top", "-S", "-P", "-n", "-s"+str(collection_interval), "-dinfinity", "0"], stdout=subprocess.PIPE, @@ -108,7 +109,7 @@ def main(): stdout=subprocess.PIPE, ) else: - if(collect_every_cpu): + if collect_every_cpu: p_top = subprocess.Popen( ["mpstat", "-P", "ALL", str(collection_interval)], stdout=subprocess.PIPE, @@ -121,7 +122,7 @@ def main(): except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + sys.exit(13) # we signal tcollector to not run us raise timestamp = 0 @@ -292,5 +293,6 @@ def main(): pass p_top.wait() + if __name__ == "__main__": - main() + sys.exit(main()) diff --git a/collectors/0/tcp_bridge.py b/collectors/0/tcp_bridge.py index bd3de3b7..63b6f12e 100755 --- a/collectors/0/tcp_bridge.py +++ b/collectors/0/tcp_bridge.py @@ -21,15 +21,12 @@ import time from collectors.lib import utils -try: - from _thread import * -except ImportError: - from thread import * +from _thread import * try: from collectors.etc import tcp_bridge_conf except ImportError: - print('unable to import tcp_bridge_conf', file=sys.stderr) + utils.err('unable to import tcp_bridge_conf', file=sys.stderr) tcp_bridge_conf = None HOST = '127.0.0.1' @@ -46,6 +43,7 @@ # buffered stdout seems to break metrics out = os.fdopen(sys.stdout.fileno(), 'w', 0) + def main(): if not (tcp_bridge_conf and tcp_bridge_conf.enabled()): print('not enabled, or tcp_bridge_conf unavilable', file=sys.stderr) @@ -130,7 +128,6 @@ def removePut(line): finally: sock.close() -if __name__ == "__main__": - main() -sys.exit(0) +if __name__ == "__main__": + sys.exit(main()) diff --git a/collectors/0/udp_bridge.py b/collectors/0/udp_bridge.py index 2daf5b31..7a337f99 100755 --- a/collectors/0/udp_bridge.py +++ b/collectors/0/udp_bridge.py @@ -19,17 +19,18 @@ from collectors.lib import utils try: - from collectors.etc import udp_bridge_conf + from collectors.etc import udp_bridge_conf except ImportError: - udp_bridge_conf = None + udp_bridge_conf = None HOST = '127.0.0.1' PORT = 8953 SIZE = 8192 + def main(): if not (udp_bridge_conf and udp_bridge_conf.enabled()): - sys.exit(13) + sys.exit(13) 
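# (Editorial sketch, not part of the patch: udp_bridge listens on the HOST/PORT constants
#  above and strips a leading "put " before re-emitting each line on stdout, so a client
#  can push a datapoint with a single datagram.  The metric name and tag are assumed.)
import socket
import time

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
line = "put example.metric %d 1 host=web1\n" % int(time.time())
sock.sendto(line.encode("ascii"), ("127.0.0.1", 8953))
sock.close()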
utils.drop_privileges() def removePut(line): @@ -40,9 +41,9 @@ def removePut(line): try: if (udp_bridge_conf and udp_bridge_conf.usetcp()): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) else: - sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.bind((HOST, PORT)) except socket.error as msg: utils.err('could not open socket: %s' % msg) @@ -75,7 +76,6 @@ def removePut(line): finally: sock.close() -if __name__ == "__main__": - main() -sys.exit(0) +if __name__ == "__main__": + sys.exit(main()) diff --git a/collectors/0/varnishstat.py b/collectors/0/varnishstat.py index 33541dde..2ca6fdbe 100755 --- a/collectors/0/varnishstat.py +++ b/collectors/0/varnishstat.py @@ -24,7 +24,7 @@ from collectors.lib import utils -interval = 10 # seconds +interval = 10 # seconds # If you would rather use the timestamp returned by varnishstat instead of a # local timestamp, then change this value to "True" @@ -41,6 +41,7 @@ # Collect all metrics vstats = "all" + # Collect metrics a la carte # vstats = frozenset([ # "client_conn", @@ -52,47 +53,48 @@ # ]) def main(): - utils.drop_privileges() - bad_regex = re.compile("[,()]+") # avoid forbidden by TSD symbols - - while True: - try: - if vstats == "all": - stats = subprocess.Popen( - ["varnishstat", "-1", "-j"], - stdout=subprocess.PIPE, - ) - else: - fields = ",".join(vstats) - stats = subprocess.Popen( - ["varnishstat", "-1", "-f" + fields, "-j"], - stdout=subprocess.PIPE, - ) - except OSError as e: - # Die and signal to tcollector not to run this script. - sys.stderr.write("Error: %s\n" % e) - sys.exit(13) - - metrics = "" - for line in stats.stdout.readlines(): - metrics += line - metrics = json.loads(metrics) - - timestamp = "" - if use_varnishstat_timestamp: - pattern = "%Y-%m-%dT%H:%M:%S" - timestamp = int(time.mktime(time.strptime(metrics['timestamp'], pattern))) - else: - timestamp = time.time() - - for k, v in metrics.items(): - if k != "timestamp" and None == bad_regex.search(k): - metric_name = metric_prefix + "." + k - print("%s %d %s %s" % \ - (metric_name, timestamp, v['value'], ",".join(tags))) - - sys.stdout.flush() - time.sleep(interval) + utils.drop_privileges() + bad_regex = re.compile("[,()]+") # avoid forbidden by TSD symbols + + while True: + try: + if vstats == "all": + stats = subprocess.Popen( + ["varnishstat", "-1", "-j"], + stdout=subprocess.PIPE, + ) + else: + fields = ",".join(vstats) + stats = subprocess.Popen( + ["varnishstat", "-1", "-f" + fields, "-j"], + stdout=subprocess.PIPE, + ) + except OSError as e: + # Die and signal to tcollector not to run this script. + sys.stderr.write("Error: %s\n" % e) + sys.exit(13) + + metrics = "" + for line in stats.stdout.readlines(): + metrics += line + metrics = json.loads(metrics) + + timestamp = "" + if use_varnishstat_timestamp: + pattern = "%Y-%m-%dT%H:%M:%S" + timestamp = int(time.mktime(time.strptime(metrics['timestamp'], pattern))) + else: + timestamp = time.time() + + for k, v in metrics.items(): + if k != "timestamp" and None == bad_regex.search(k): + metric_name = metric_prefix + "." 
+ k + print("%s %d %s %s" % \ + (metric_name, timestamp, v['value'], ",".join(tags))) + + sys.stdout.flush() + time.sleep(interval) + if __name__ == "__main__": - sys.exit(main()) + sys.exit(main()) diff --git a/collectors/0/zfsiostats.py b/collectors/0/zfsiostats.py index a3c0e243..e37805d7 100755 --- a/collectors/0/zfsiostats.py +++ b/collectors/0/zfsiostats.py @@ -13,7 +13,7 @@ # see . # -''' +""" ZFS I/O and disk space statistics for TSDB This plugin tracks, for all pools: @@ -29,7 +29,7 @@ Disk space usage is given in kbytes. Throughput is given in operations/s and bytes/s. -''' +""" import errno import sys @@ -39,9 +39,7 @@ import signal import os -PY3 = sys.version_info[0] > 2 -if PY3: - long = int +long = int from collectors.lib import utils @@ -50,19 +48,20 @@ except ImportError: zfsiostats_conf = None -DEFAULT_COLLECTION_INTERVAL=15 -DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES=20 -DEFAULT_REPORT_DISKS_IN_VDEVS=False +DEFAULT_COLLECTION_INTERVAL = 15 +DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES = 20 +DEFAULT_REPORT_DISKS_IN_VDEVS = False + def convert_to_bytes(string): """Take a string in the form 1234K, and convert to bytes""" factors = { - "K": 1024, - "M": 1024 * 1024, - "G": 1024 * 1024 * 1024, - "T": 1024 * 1024 * 1024 * 1024, - "P": 1024 * 1024 * 1024 * 1024 * 1024, - "E": 1024 * 1024 * 1024 * 1024 * 1024 * 1024, + "K": 1024, + "M": 1024 * 1024, + "G": 1024 * 1024 * 1024, + "T": 1024 * 1024 * 1024 * 1024, + "P": 1024 * 1024 * 1024 * 1024 * 1024, + "E": 1024 * 1024 * 1024 * 1024 * 1024 * 1024, } if string == "-": return -1 for f, fm in factors.items(): @@ -72,15 +71,16 @@ def convert_to_bytes(string): return long(number) return long(string) + def convert_wo_prefix(string): """Take a string in the form 1234K, and convert without metric prefix""" factors = { - "K": 1000, - "M": 1000 * 1000, - "G": 1000 * 1000 * 1000, - "T": 1000 * 1000 * 1000 * 1000, - "P": 1000 * 1000 * 1000 * 1000 * 1000, - "E": 1000 * 1000 * 1000 * 1000 * 1000 * 1000, + "K": 1000, + "M": 1000 * 1000, + "G": 1000 * 1000 * 1000, + "T": 1000 * 1000 * 1000 * 1000, + "P": 1000 * 1000 * 1000 * 1000 * 1000, + "E": 1000 * 1000 * 1000 * 1000 * 1000 * 1000, } if string == "-": return -1 for f, fm in factors.items(): @@ -90,11 +90,12 @@ def convert_wo_prefix(string): return long(number) return long(string) -def extract_info(line,report_disks_in_vdevs): + +def extract_info(line, report_disks_in_vdevs): (poolname, - alloc, free, - read_issued, write_issued, - read_throughput, write_throughput) = line.split() + alloc, free, + read_issued, write_issued, + read_throughput, write_throughput) = line.split() s_io = {} # magnitudeless variable @@ -112,11 +113,12 @@ def extract_info(line,report_disks_in_vdevs): s_df["free"] = convert_to_bytes(free) / 1024 if ((s_df["used"] < 0) or (s_df["free"] < 0)): s_df = {} - if(not report_disks_in_vdevs): + if (not report_disks_in_vdevs): s_io = {} return poolname, s_df, s_io + T_START = 1 T_HEADERS = 2 T_SEPARATOR = 3 @@ -126,22 +128,25 @@ def extract_info(line,report_disks_in_vdevs): T_LEG = 7 signal_received = None + + def handlesignal(signum, stack): global signal_received signal_received = signum + def main(): """zfsiostats main loop""" global signal_received - collection_interval=DEFAULT_COLLECTION_INTERVAL - report_capacity_every_x_times=DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES - report_disks_in_vdevs=DEFAULT_REPORT_DISKS_IN_VDEVS - if(zfsiostats_conf): + collection_interval = DEFAULT_COLLECTION_INTERVAL + report_capacity_every_x_times = DEFAULT_REPORT_CAPACITY_EVERY_X_TIMES + report_disks_in_vdevs 
= DEFAULT_REPORT_DISKS_IN_VDEVS + if (zfsiostats_conf): config = zfsiostats_conf.get_config() - collection_interval=config['collection_interval'] - report_capacity_every_x_times=config['report_capacity_every_x_times'] - report_disks_in_vdevs=config['report_disks_in_vdevs'] + collection_interval = config['collection_interval'] + report_capacity_every_x_times = config['report_capacity_every_x_times'] + report_disks_in_vdevs = config['report_disks_in_vdevs'] signal.signal(signal.SIGTERM, handlesignal) signal.signal(signal.SIGINT, handlesignal) @@ -154,11 +159,11 @@ def main(): except OSError as e: if e.errno == errno.ENOENT: # it makes no sense to run this collector here - sys.exit(13) # we signal tcollector to not run us + sys.exit(13) # we signal tcollector to not run us raise firstloop = True - report_capacity = (report_capacity_every_x_times-1) + report_capacity = (report_capacity_every_x_times - 1) lastleg = 0 ltype = None timestamp = int(time.time()) @@ -207,7 +212,7 @@ def main(): ltype = T_DEVICE else: # must be a pool name - #assert ltype == T_SEPARATOR, \ + # assert ltype == T_SEPARATOR, \ # "expecting last state T_SEPARATOR, now got %s" % ltype if ltype == T_SEPARATOR: parentpoolname = "" @@ -215,19 +220,19 @@ def main(): if ltype == T_START: for x in ( - capacity_stats_pool, capacity_stats_device, - io_stats_pool, io_stats_device, - ): + capacity_stats_pool, capacity_stats_device, + io_stats_pool, io_stats_device, + ): x.clear() timestamp = int(time.time()) elif ltype == T_POOL: line = line.strip() - poolname, s_df, s_io = extract_info(line,report_disks_in_vdevs) + poolname, s_df, s_io = extract_info(line, report_disks_in_vdevs) if parentpoolname == "": parentpoolname = poolname else: - poolname=parentpoolname+"."+poolname + poolname = parentpoolname + "." + poolname capacity_stats_pool[poolname] = s_df io_stats_pool[poolname] = s_io # marker for leg @@ -236,13 +241,13 @@ def main(): elif ltype == T_LEG: last_leg = last_leg + 1 line = line.strip() - devicename, s_df, s_io = extract_info(line,report_disks_in_vdevs) + devicename, s_df, s_io = extract_info(line, report_disks_in_vdevs) capacity_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_df io_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_io elif ltype == T_DEVICE: line = line.strip() - devicename, s_df, s_io = extract_info(line,report_disks_in_vdevs) + devicename, s_df, s_io = extract_info(line, report_disks_in_vdevs) capacity_stats_device["%s %s" % (poolname, devicename)] = s_df io_stats_device["%s %s" % (poolname, devicename)] = s_io @@ -250,7 +255,7 @@ def main(): if report_capacity_every_x_times > 0: report_capacity += 1 if report_capacity == report_capacity_every_x_times: - report_capacity=0 + report_capacity = 0 for poolname, stats in capacity_stats_pool.items(): fm = "zfs.df.pool.kb.%s %d %s pool=%s" for statname, statnumber in stats.items(): @@ -287,6 +292,6 @@ def main(): pass p_zpool.wait() -if __name__ == "__main__": - main() +if __name__ == "__main__": + sys.exit(main()) diff --git a/collectors/0/zfsolkernstats.py b/collectors/0/zfsolkernstats.py index 6d580d99..aa655c99 100755 --- a/collectors/0/zfsolkernstats.py +++ b/collectors/0/zfsolkernstats.py @@ -33,6 +33,7 @@ # and the allocation sizes for the slabs # /proc/spl/kstat/zfs/arcstats is a table. 
we only care about the data column + def main(): """zfsstat main loop""" interval = 15 @@ -83,6 +84,6 @@ def main(): sys.stdout.flush() time.sleep(interval) -if __name__ == "__main__": - main() +if __name__ == "__main__": + sys.exit(main()) diff --git a/collectors/0/zookeeper.py b/collectors/0/zookeeper.py index cd5fd4cc..5386426d 100755 --- a/collectors/0/zookeeper.py +++ b/collectors/0/zookeeper.py @@ -50,6 +50,7 @@ "zk_open_file_descriptor_count", ]) + def scan_zk_instances(): """ Finding out all the running instances of zookeeper @@ -106,10 +107,12 @@ def scan_zk_instances(): fd.close() return instances + def print_stat(metric, ts, value, tags=""): if value is not None: print("zookeeper.%s %i %s %s" % (metric, ts, value, tags)) + def connect_socket(tcp_version, port): sock = None if tcp_version == "tcp6": @@ -124,6 +127,7 @@ def connect_socket(tcp_version, port): utils.err(err) return sock + def main(): if USER != "root": utils.drop_privileges(user=USER) @@ -139,7 +143,7 @@ def main(): last_scan = ts if not instances: - return 13 # Ask tcollector not to respawn us + return 13 # ask tcollector not to respawn us # Iterate over every zookeeper instance and get statistics for ip, port, tcp_version in instances: @@ -161,5 +165,6 @@ def main(): sys.stdout.flush() time.sleep(COLLECTION_INTERVAL) + if __name__ == "__main__": sys.exit(main()) diff --git a/collectors/300/aws_cloudwatch_stats.py b/collectors/300/aws_cloudwatch_stats.py index 6db53024..b26fc462 100755 --- a/collectors/300/aws_cloudwatch_stats.py +++ b/collectors/300/aws_cloudwatch_stats.py @@ -5,7 +5,6 @@ import datetime import re import json -from collections import OrderedDict import threading from time import mktime from collectors.lib import utils