diff --git a/cmd/exporters/prometheus/httpd.go b/cmd/exporters/prometheus/httpd.go
index 4defc09aa..17a175168 100644
--- a/cmd/exporters/prometheus/httpd.go
+++ b/cmd/exporters/prometheus/httpd.go
@@ -116,15 +116,26 @@ func (me *Prometheus) ServeMetrics(w http.ResponseWriter, r *http.Request) {
 	if md, err := e.Render(e.Metadata); err == nil {
 		data = append(data, md...)
-	}*/
+	}
+	*/
+
+	if me.addMetaTags {
+		data = filterMetaTags(data)
+	}
 
 	w.WriteHeader(200)
 	w.Header().Set("content-type", "text/plain")
 	_, err := w.Write(bytes.Join(data, []byte("\n")))
 	if err != nil {
-		me.Logger.Error().Stack().Err(err).Msg("error")
+		me.Logger.Error().Stack().Err(err).Msg("write metrics")
 	}
 
+	// make sure stream ends with newline
+	if _, err = w.Write([]byte("\n")); err != nil {
+		me.Logger.Error().Stack().Err(err).Msg("write ending newline")
+	}
+
+	// update metadata
 	me.Metadata.Reset()
 	err = me.Metadata.LazySetValueInt64("time", "http", time.Since(start).Microseconds())
 	if err != nil {
@@ -136,6 +147,38 @@ func (me *Prometheus) ServeMetrics(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+// filterMetaTags removes duplicate TYPE/HELP tags from the metrics.
+// Note: this is a workaround; normally Render() adds only one
+// TYPE/HELP pair per metric. However, since some metrics
+// (e.g. metadata_collector_count) are submitted by multiple
+// collectors, we end up with duplicates in the final batch delivered
+// over HTTP.
+func filterMetaTags(metrics [][]byte) [][]byte {
+
+	filtered := make([][]byte, 0)
+
+	metricsWithTags := make(map[string]bool)
+
+	for i, m := range metrics {
+		if bytes.HasPrefix(m, []byte("# ")) {
+			if fields := strings.Fields(string(m)); len(fields) > 3 {
+				name := fields[2]
+				if !metricsWithTags[name] {
+					metricsWithTags[name] = true
+					filtered = append(filtered, m)
+					if i+1 < len(metrics) {
+						filtered = append(filtered, metrics[i+1])
+						i++
+					}
+				}
+			}
+		} else {
+			filtered = append(filtered, m)
+		}
+	}
+	return filtered
+}
+
 // ServeInfo provides a human-friendly overview of metric types and source collectors
 // this is done in a very inefficient way, by "reverse engineering" the metrics.
 // That's probably ok, since we don't expect this to be called often.
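For reference, below is a minimal, standalone Go sketch of the HELP/TYPE dedup idea that filterMetaTags implements. It is illustrative only: the function name dedupMetaTags and the sample metric lines are made up, and the sketch assumes (as the exporter does) that a TYPE line immediately follows its HELP line.

package main

import (
	"bytes"
	"fmt"
	"strings"
)

// dedupMetaTags keeps only the first HELP/TYPE pair seen for each metric
// name and passes every non-comment line through unchanged.
func dedupMetaTags(lines [][]byte) [][]byte {
	seen := make(map[string]bool)
	out := make([][]byte, 0, len(lines))

	for i := 0; i < len(lines); i++ {
		line := lines[i]
		if !bytes.HasPrefix(line, []byte("# ")) {
			out = append(out, line)
			continue
		}
		// "# HELP <name> <text>" or "# TYPE <name> <type>"
		fields := strings.Fields(string(line))
		if len(fields) < 4 {
			continue
		}
		name := fields[2]
		if seen[name] {
			continue // drop duplicate HELP/TYPE pair
		}
		seen[name] = true
		out = append(out, line)
		// the matching TYPE line is assumed to follow immediately
		if i+1 < len(lines) && bytes.HasPrefix(lines[i+1], []byte("# ")) {
			out = append(out, lines[i+1])
			i++
		}
	}
	return out
}

func main() {
	batch := [][]byte{
		[]byte(`# HELP metadata_collector_count number of metrics collected`),
		[]byte(`# TYPE metadata_collector_count gauge`),
		[]byte(`metadata_collector_count{collector="Zapi"} 120`),
		[]byte(`# HELP metadata_collector_count number of metrics collected`),
		[]byte(`# TYPE metadata_collector_count gauge`),
		[]byte(`metadata_collector_count{collector="ZapiPerf"} 340`),
	}
	fmt.Println(string(bytes.Join(dedupMetaTags(batch), []byte("\n"))))
}

Running this prints a single HELP/TYPE pair for metadata_collector_count followed by both metric samples, which is the shape of output the workaround above is after.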
diff --git a/cmd/exporters/prometheus/prometheus.go b/cmd/exporters/prometheus/prometheus.go
index 80cb20438..cff7f9551 100644
--- a/cmd/exporters/prometheus/prometheus.go
+++ b/cmd/exporters/prometheus/prometheus.go
@@ -274,7 +274,7 @@ func (me *Prometheus) render(data *matrix.Matrix) ([][]byte, error) {
 		tagged                                            *set.Set
 		labels_to_include, keys_to_include, global_labels []string
 		prefix                                            string
-		include_all_labels                                bool
+		err                                               error
 	)
 
 	rendered = make([][]byte, 0)
@@ -296,10 +296,19 @@ func (me *Prometheus) render(data *matrix.Matrix) ([][]byte, error) {
 		me.Logger.Debug().Msgf("requested keys_labels : %v", keys_to_include)
 	}
 
-	if options.GetChildContentS("include_all_labels") == "true" {
-		include_all_labels = true
-	} else {
-		include_all_labels = false
+	include_all_labels := false
+	require_instance_keys := true
+
+	if x := options.GetChildContentS("include_all_labels"); x != "" {
+		if include_all_labels, err = strconv.ParseBool(x); err != nil {
+			me.Logger.Error().Stack().Err(err).Msg("parameter: include_all_labels")
+		}
+	}
+
+	if x := options.GetChildContentS("require_instance_keys"); x != "" {
+		if require_instance_keys, err = strconv.ParseBool(x); err != nil {
+			me.Logger.Error().Stack().Err(err).Msg("parameter: require_instance_keys")
+		}
 	}
 
 	prefix = me.globalPrefix + data.Object
@@ -318,18 +327,26 @@ func (me *Prometheus) render(data *matrix.Matrix) ([][]byte, error) {
 		me.Logger.Trace().Msgf("rendering instance [%s] (%v)", key, instance.GetLabels())
 
 		instance_keys := make([]string, len(global_labels))
-		instance_labels := make([]string, 0)
 		copy(instance_keys, global_labels)
+		instance_keys_ok := false
+		instance_labels := make([]string, 0)
 
 		if include_all_labels {
 			for label, value := range instance.GetLabels().Map() {
-				instance_keys = append(instance_keys, fmt.Sprintf("%s=\"%s\"", label, value))
+				// temporary fix for the rarely occurring duplicate labels;
+				// known case is: ZapiPerf -> 7mode -> disk.yaml
+				// the actual cause is the Aggregator plugin, which adds node as an
+				// instance label (even though it's already a global label on 7-mode)
+				if !data.GetGlobalLabels().Has(label) {
+					instance_keys = append(instance_keys, fmt.Sprintf("%s=\"%s\"", label, value))
+				}
 			}
 		} else {
 			for _, key := range keys_to_include {
 				value := instance.GetLabel(key)
-				if value != "" {
-					instance_keys = append(instance_keys, fmt.Sprintf("%s=\"%s\"", key, value))
+				instance_keys = append(instance_keys, fmt.Sprintf("%s=\"%s\"", key, value))
+				if !instance_keys_ok && value != "" {
+					instance_keys_ok = true
 				}
 				me.Logger.Trace().Msgf("++ key [%s] (%s) found=%v", key, value, value != "")
 			}
@@ -341,7 +358,7 @@ func (me *Prometheus) render(data *matrix.Matrix) ([][]byte, error) {
 		}
 
 		// @TODO, probably be strict, and require all keys to be present
-		if len(instance_keys) == 0 && options.GetChildContentS("require_instance_keys") != "False" {
+		if !instance_keys_ok && require_instance_keys {
 			me.Logger.Trace().Msgf("skip instance, no keys parsed (%v) (%v)", instance_keys, instance_labels)
 			continue
 		}
diff --git a/cmd/exporters/prometheus/validator.py b/cmd/exporters/prometheus/validator.py
new file mode 100755
index 000000000..2b3938769
--- /dev/null
+++ b/cmd/exporters/prometheus/validator.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python3
+
+"""
+Copyright NetApp Inc, 2021 All rights reserved
+
+Utility to validate the integrity of Prometheus metrics generated
+by Harvest's Prometheus exporter. This utility takes into account
+the parsing rules of the Prometheus DB, as well as those of other
+collectors, such as InfluxData's Telegraf.
+
+"""
+
+import argparse
+import regex
+import signal
+import sys
+import time
+import urllib.request
+
+# error summary
+errors = {
+    'corrupt_metrics'     : 0,
+    'corrupt_labels'      : 0,
+    'corrupt_metatags'    : 0,
+    'inconsistent_labels' : 0,
+    'duplicate_labels'    : 0,
+    'duplicate_metatags'  : 0,
+    'missing_metatags'    : 0,
+    'missing_newlines'    : 0,
+    }
+
+# cache label keys of seen metrics to check for consistency
+label_cache = {}  # str -> set
+
+# regular expression to match a metric
+metric_pattern = regex.compile(r'^(\w+)\{(.+)\} \d+(\.\d+(e[-+]\d+)?)?$')
+# pattern to match HELP/TYPE metatags
+tag_pattern = regex.compile(r'^# (\w+) (\w+) .*$')
+# label name must start with an alphabetical char
+# see: https://github.com/prometheus/common/blob/main/model/labels.go#L94
+label_pattern = regex.compile(r'^([_a-zA-Z]\w*)="[^"]*?"$', flags=regex.ASCII)
+
+# tty colors
+END = '\033[0m'
+BOLD = '\033[1m'
+RED = '\033[91m'
+GREEN = '\033[92m'
+YELLOW = '\033[93m'
+PINK = '\033[95m'
+
+def main():
+    # parse arguments
+    a = get_args()
+
+    # make sure to print errors before exiting
+    signal.signal(signal.SIGINT, terminate)
+
+    # run the scrapes
+    for i in range(a.scrapes):
+
+        # cache metrics for which we have seen metatags
+        help_cache = {}  # str -> bool
+        type_cache = {}  # str -> bool
+
+        metrics = get_batch_metrics(a.addr, a.port)
+        print('{}-> scrape #{:<4} - scraped metrics/lines: {}{}'.format(BOLD, i+1, len(metrics.splitlines()), END))
+
+        if metrics == '':
+            # sleep until next scrape
+            time.sleep(a.interval)
+
+        if not metrics.endswith('\n'):
+            errors['missing_newlines'] += 1
+            print('    {}missing newline at the end of metric batch{}'.format(PINK, END))
+
+        for m in metrics.splitlines():
+
+            # skip newline
+            if m == '\n' or m == '':
+                continue
+
+            # handle metatag
+            if len(m) and m[0] == '#':
+                ok, tag, metric_name = check_metatag(m)
+                if not ok:
+                    errors['corrupt_metatags'] += 1
+                    print('    corrupt {} metatag:'.format(tag))
+                    print('    [{}{}{}]'.format(RED, m, END))
+                elif tag == 'HELP':
+                    if help_cache.get(metric_name, False):
+                        errors['duplicate_metatags'] += 1  # count only once
+                        print('    duplicate HELP tag for metric {}'.format(metric_name))
+                    help_cache[metric_name] = True
+                elif tag == 'TYPE':
+                    if type_cache.get(metric_name, False):
+                        print('    duplicate TYPE tag for metric {}'.format(metric_name))
+                    type_cache[metric_name] = True
+                continue
+
+            # check general metric integrity and parse the raw labels substring
+            ok, metric_name, raw_labels = check_metric(m)
+
+            if not ok:
+                errors['corrupt_metrics'] += 1
+                print('    corrupt metric format:')
+                print('    [{}{}{}]'.format(RED, m, END))
+                continue
+
+            # check label integrity
+            ok, labels = parse_labels(raw_labels)  # list
+            if not ok:
+                errors['corrupt_metrics'] += 1
+                print('    corrupt metric format (labels):')
+                print('    [{}{}{}]'.format(RED, m, END))
+                continue
+
+            # check for duplicate labels
+            duplicates = set([l for l in labels if labels.count(l) > 1])
+            if duplicates:
+                errors['duplicate_labels'] += 1
+                print('    duplicate labels ({}):'.format(', '.join(duplicates)))
+                print('    [{}{}{}]'.format(RED, m, END))
+
+            labels = set(labels)
+
+            # compare with cached labels for consistency
+            cached_labels = label_cache.get(metric_name, None)
+            if cached_labels == None:
+                label_cache[metric_name] = labels
+            else:
+                missing = cached_labels - labels
+                added = labels - cached_labels
+                if missing or added:
+                    errors['inconsistent_labels'] += 1
+                    print('    inconsistent labels (cached: {}):'.format(' '.join(cached_labels)))
+                    if missing:
+                        print('    - missing ({})'.format(', '.join(missing)))
+                    if added:
+                        print('    - added ({})'.format(', '.join(added)))
+                    print('    [{}{}{}]'.format(RED, m, END))
+
+            # optionally check for metatags
+            # each metric should include a HELP/TYPE metatag at least once
+            if a.metatags:
+                has_help = help_cache.get(metric_name, False)
+                has_type = type_cache.get(metric_name, False)
+                if not has_help or not has_type:
+                    errors['missing_metatags'] += 1
+                    print('    {}missing metatags for metric [{}]{}'.format(RED, metric_name, END))
+                    if not has_help:
+                        print('    - HELP tag not detected')
+                    if not has_type:
+                        print('    - TYPE tag not detected')
+
+        # sleep until next scrape
+        time.sleep(a.interval)
+
+    print_errors()
+    # DONE
+
+# Scrape an HTTP endpoint and return data
+def get_batch_metrics(addr: str, port: int) -> str:
+    try:
+        return urllib.request.urlopen('http://{}:{}/metrics'.format(addr, port)).read().decode()
+    except urllib.error.URLError as err:
+        print(err)
+        return ''
+
+# validate metric format (without labels), extract name and labels substring
+def check_metric(metric: str) -> (bool, str, str):
+    match = metric_pattern.match(metric)
+    if match:
+        try:
+            return True, match.captures(1)[0], match.captures(2)[0]
+        except Exception as ex:
+            print('regex exception: {}'.format(ex))
+    return False, '', ''
+
+def check_metatag(metric: str) -> (bool, str, str):
+    match = tag_pattern.match(metric)
+    if match:
+        try:
+            return True, match.captures(1)[0], match.captures(2)[0]
+        except Exception as ex:
+            print('regex exception: {}'.format(ex))
+    return False, '', ''
+
+# parse label keys from the raw labels substring
+def parse_labels(labels: str) -> (bool, [str]):
+    keys = []
+    for pair in labels.split(','):
+        match = label_pattern.match(pair)
+        if not match:
+            print('    - failed to parse label pair ({})'.format(pair))
+            return False, keys
+        keys.append(match.captures(1)[0])
+
+    return True, keys
+
+def terminate(signum, frame):
+    print('\n{}-> terminating validation session{}'.format(YELLOW, END))
+    print_errors()
+    sys.exit()
+
+def print_errors():
+    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
+    print('-> {} unique metrics validated'.format(len(label_cache)))
+    total = sum(errors.values())
+    if total == 0:
+        print('{}-> OK - no errors detected{}'.format(GREEN, END))
+    else:
+        print('{}-> FAIL - {} errors detected{}'.format(RED, total, END))
+
+    for k, v in errors.items():
+        print('{:<30} - {:>8}'.format(k, v))
+    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
+
+# Parse CLI arguments
+def get_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(
+        formatter_class = argparse.RawTextHelpFormatter,
+        description = """Open Metric Validator using an HTTP endpoint
+
+SYNOPSIS:
+    Run this tool specifying the port of the Prometheus exporter. Then,
+    start a Harvest poller that will serve the metrics on that port.
+    (Start the tool first, so no metatags are missed).
+
+VALIDATION:
+    Tool will validate integrity of the rendered metrics:
+    - metric format
+    - label integrity
+    - label consistency
+    - label duplicates
+    - HELP/TYPE metatags (optional)"""
+        )
+    p.add_argument('-a', '--addr',
+                   help     = 'Address of the HTTP endpoint (default: localhost)',
+                   dest     = 'addr',
+                   type     = str,
+                   default  = 'localhost'
+                   )
+    p.add_argument('-p', '--port',
+                   help     = 'Port of the HTTP endpoint',
+                   dest     = 'port',
+                   type     = int,
+                   required = True
+                   )
+    p.add_argument('-i', '--interval',
+                   help     = 'Interval between scrapes (in seconds, default: 60)',
+                   dest     = 'interval',
+                   type     = int,
+                   default  = 60
+                   )
+    p.add_argument('-s', '--scrapes',
+                   help     = 'Number of scrapes to run (default: 5)',
+                   dest     = 'scrapes',
+                   type     = int,
+                   default  = 5
+                   )
+    p.add_argument('-m', '--metatags',
+                   help     = 'Check TYPE/HELP metatags (default: false)',
+                   dest     = 'metatags',
+                   action   = 'store_true',
+                   default  = False
+                   )
+    return p.parse_args()
+
+if __name__ == '__main__':
+    main()
diff --git a/conf/zapiperf/7mode/8.2.5/fcp.yaml b/conf/zapiperf/7mode/8.2.5/fcp.yaml
index fe205c92e..ae8e4ec73 100644
--- a/conf/zapiperf/7mode/8.2.5/fcp.yaml
+++ b/conf/zapiperf/7mode/8.2.5/fcp.yaml
@@ -23,7 +23,6 @@ plugins:
 
 export_options:
   instance_keys:
-    - node
     - port
    - speed
 graphite_leafs:
diff --git a/conf/zapiperf/7mode/8.2.5/nfsv3_node.yaml b/conf/zapiperf/7mode/8.2.5/nfsv3_node.yaml
index 4b7696256..5011c8611 100644
--- a/conf/zapiperf/7mode/8.2.5/nfsv3_node.yaml
+++ b/conf/zapiperf/7mode/8.2.5/nfsv3_node.yaml
@@ -8,7 +8,7 @@ global_labels:
 
 counters:
   - instance_name => node
-  - nfsv3_ops => ops
+  - nfsv3_ops => total_ops
   - nfsv3_read_ops => read_ops
   - nfsv3_write_ops => write_ops
   - nfsv3_read_latency => read_avg_latency
@@ -16,8 +16,7 @@ counters:
   - nfsv3_avg_op_latency => latency
 
 export_options:
-  instance_keys:
-    - nfsv
+  require_instance_keys: false
 
 graphite_leafs:
   - node.{node}.nfsv3
diff --git a/conf/zapiperf/7mode/8.2.5/nfsv4_node.yaml b/conf/zapiperf/7mode/8.2.5/nfsv4_node.yaml
index 88d37ab91..8d8d31658 100644
--- a/conf/zapiperf/7mode/8.2.5/nfsv4_node.yaml
+++ b/conf/zapiperf/7mode/8.2.5/nfsv4_node.yaml
@@ -9,14 +9,13 @@ global_labels:
 counters:
   - instance_name => node
   - nfsv3_avg_latency => latency
-  - nfsv4_ops => ops
+  - nfsv4_ops => total_ops
   - nfsv4_ready_latency => read_avg_latency
   - nfsv4_write_latency => write_avg_latency
   - nfsv4_read_ops => read_ops
   - nfsv4_write_ops => write_ops
 
 export_options:
-  instance_keys:
-    - nfsv
+  require_instance_keys: false
 
 graphite_leafs:
   - node.{node}.nfsv4
diff --git a/conf/zapiperf/7mode/8.2.5/processor.yaml b/conf/zapiperf/7mode/8.2.5/processor.yaml
index 3b16f4af5..5254b7606 100644
--- a/conf/zapiperf/7mode/8.2.5/processor.yaml
+++ b/conf/zapiperf/7mode/8.2.5/processor.yaml
@@ -13,7 +13,7 @@ counters:
 
 plugins:
   Aggregator:
-    - node<>node_cpu
+    - node<>node_cpu
 
 # only export node-level averages from plugin
 # set this true or comment, to get data for each cpu
diff --git a/conf/zapiperf/cdot/9.8.0/nfsv3.yaml b/conf/zapiperf/cdot/9.8.0/nfsv3.yaml
index 15c2f620e..879cf3e5c 100644
--- a/conf/zapiperf/cdot/9.8.0/nfsv3.yaml
+++ b/conf/zapiperf/cdot/9.8.0/nfsv3.yaml
@@ -91,4 +91,3 @@ rewrite_as_label:
 export_options:
   instance_keys:
     - svm
-    - nfsv
diff --git a/conf/zapiperf/cdot/9.8.0/nfsv3_node.yaml b/conf/zapiperf/cdot/9.8.0/nfsv3_node.yaml
index 7f0ace38f..dab93031a 100644
--- a/conf/zapiperf/cdot/9.8.0/nfsv3_node.yaml
+++ b/conf/zapiperf/cdot/9.8.0/nfsv3_node.yaml
@@ -8,7 +8,7 @@ global_labels:
 
 counters:
   - instance_name => node
-  - nfsv3_ops => ops
+  - nfsv3_ops => total_ops      # "nfs_ops" already used in system_node.yaml
   - nfsv3_read_ops => read_ops
   - nfsv3_write_ops => write_ops
   - nfsv3_throughput => throughput
@@ -91,5 +91,3 @@ override:
 export_options:
   instance_keys:
     - node
-    - nfsv
-
diff --git a/conf/zapiperf/cdot/9.8.0/nfsv4.yaml b/conf/zapiperf/cdot/9.8.0/nfsv4.yaml
index ea24637e4..7a03d5f52 100644
--- a/conf/zapiperf/cdot/9.8.0/nfsv4.yaml
+++ b/conf/zapiperf/cdot/9.8.0/nfsv4.yaml
@@ -94,11 +94,6 @@ rewrite_as_label:
   - _avg_latency => request_latency: latency_type
   - _total => request_total: request_type
 
-export_options:
-  instance_keys:
-    - svm
-    - nfsv
-
 override:
   - access_total: rate
   - close_total: rate
@@ -139,3 +134,8 @@ override:
   - setclientid_total: rate
   - verify_total: rate
   - write_total: rate
+
+export_options:
+  instance_keys:
+    - svm
+
diff --git a/conf/zapiperf/cdot/9.8.0/nfsv4_1.yaml b/conf/zapiperf/cdot/9.8.0/nfsv4_1.yaml
index 88f30050e..ddc91ed40 100644
--- a/conf/zapiperf/cdot/9.8.0/nfsv4_1.yaml
+++ b/conf/zapiperf/cdot/9.8.0/nfsv4_1.yaml
@@ -125,7 +125,6 @@ rewrite_as_label:
 export_options:
   instance_keys:
     - svm
-    - nfsv
 
 override:
   - access_total: rate
diff --git a/conf/zapiperf/cdot/9.8.0/nfsv4_1_node.yaml b/conf/zapiperf/cdot/9.8.0/nfsv4_1_node.yaml
index ec9861b6f..0c154ecc9 100644
--- a/conf/zapiperf/cdot/9.8.0/nfsv4_1_node.yaml
+++ b/conf/zapiperf/cdot/9.8.0/nfsv4_1_node.yaml
@@ -9,7 +9,7 @@ global_labels:
 counters:
   - instance_name => node
   - latency
-  - total_ops => ops
+  - total_ops
   - nfs41_read_throughput => read_throughput
   - nfs41_throughput => throughput
   - nfs41_write_throughput => write_throughput
@@ -118,11 +118,6 @@ counters:
   - want_delegation_avg_latency
   - write_avg_latency
 
-export_options:
-  instance_keys:
-    - node
-    - nfsv
-
 override:
   - access_total: rate
   - backchannel_ctl_total: rate
@@ -177,3 +172,7 @@ override:
   - verify_total: rate
   - want_delegation_total: rate
   - write_total: rate
+
+export_options:
+  instance_keys:
+    - node
diff --git a/conf/zapiperf/cdot/9.8.0/nfsv4_node.yaml b/conf/zapiperf/cdot/9.8.0/nfsv4_node.yaml
index c27044c3d..52c701399 100644
--- a/conf/zapiperf/cdot/9.8.0/nfsv4_node.yaml
+++ b/conf/zapiperf/cdot/9.8.0/nfsv4_node.yaml
@@ -9,7 +9,7 @@ global_labels:
 counters:
   - instance_name => node
   - latency
-  - total_ops => ops
+  - total_ops
   - nfs4_read_throughput => read_throughput
   - nfs4_throughput => throughput
   - nfs4_write_throughput => write_throughput
@@ -94,11 +94,6 @@ rewrite_as_label:
   - _avg_latency => request_latency: latency_type
   - _total => request_total: request_type
 
-export_options:
-  instance_keys:
-    - node
-    - nfsv
-
 override:
   - access_total: rate
   - close_total: rate
@@ -139,3 +134,7 @@ override:
   - setclientid_total: rate
   - verify_total: rate
   - write_total: rate
+
+export_options:
+  instance_keys:
+    - node
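A side note on the render() changes in prometheus.go above: boolean exporter options are now parsed with strconv.ParseBool (so "False", "false", "0", etc. are all honored, where the old code only compared against the literal string "False"), and instance labels that repeat a global label are skipped. The sketch below only illustrates those two patterns under assumed inputs; parseBoolOption, renderLabelPairs, and the sample labels are not Harvest APIs.

package main

import (
	"fmt"
	"strconv"
)

// parseBoolOption parses a boolean exporter option, falling back to a
// default when the option is absent or not a valid boolean.
func parseBoolOption(raw string, def bool) bool {
	if raw == "" {
		return def
	}
	v, err := strconv.ParseBool(raw)
	if err != nil {
		fmt.Printf("invalid boolean option %q, using default %v\n", raw, def)
		return def
	}
	return v
}

// renderLabelPairs builds the label pairs for one instance and skips any
// instance label that is already emitted as a global label.
func renderLabelPairs(global, instance map[string]string) []string {
	pairs := make([]string, 0, len(global)+len(instance))
	for k, v := range global {
		pairs = append(pairs, fmt.Sprintf("%s=%q", k, v))
	}
	for k, v := range instance {
		if _, isGlobal := global[k]; isGlobal {
			continue // e.g. "node" is both a global and an instance label on 7-mode systems
		}
		pairs = append(pairs, fmt.Sprintf("%s=%q", k, v))
	}
	return pairs
}

func main() {
	// option missing -> default; "false"/"False"/"0" are all accepted
	fmt.Println(parseBoolOption("", true))      // true
	fmt.Println(parseBoolOption("false", true)) // false

	global := map[string]string{"datacenter": "dc1", "node": "na01"}
	instance := map[string]string{"node": "na01", "disk": "1.1.13"}
	fmt.Println(renderLabelPairs(global, instance)) // "node" appears only once
}

This is also why the 7-mode templates above can set require_instance_keys: false and have it take effect: with the previous comparison against "False", a lowercase "false" would still have been treated as true.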