From ae6299a18461b5c775c19fbd163e05ab94d0b33c Mon Sep 17 00:00:00 2001 From: Jose Luis Lucas Date: Tue, 12 Feb 2019 18:05:18 +0100 Subject: [PATCH] Improve QED dashboard, and add metrics --- .../provisioning/dashboards/QED.json | 873 +++++++++++------- tests/e2e/test_service.go | 23 +- 2 files changed, 560 insertions(+), 336 deletions(-) diff --git a/deploy/aws/modules/prometheus/provisioning/dashboards/QED.json b/deploy/aws/modules/prometheus/provisioning/dashboards/QED.json index aa2b106c7..60317cb04 100644 --- a/deploy/aws/modules/prometheus/provisioning/dashboards/QED.json +++ b/deploy/aws/modules/prometheus/provisioning/dashboards/QED.json @@ -15,9 +15,21 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 2, "links": [], "panels": [ + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 34, + "panels": [], + "title": "General", + "type": "row" + }, { "aliasColors": {}, "bars": false, @@ -28,7 +40,7 @@ "h": 5, "w": 14, "x": 0, - "y": 0 + "y": 1 }, "id": 23, "legend": { @@ -87,6 +99,13 @@ "intervalFactor": 1, "legendFormat": "publishers", "refId": "D" + }, + { + "expr": "sum(qed_store_instances_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "store", + "refId": "F" } ], "thresholds": [], @@ -151,7 +170,7 @@ "h": 5, "w": 5, "x": 14, - "y": 0 + "y": 1 }, "id": 4, "interval": null, @@ -230,7 +249,7 @@ "h": 5, "w": 5, "x": 19, - "y": 0 + "y": 1 }, "id": 20, "interval": null, @@ -289,173 +308,366 @@ "valueName": "avg" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "fill": 1, + "collapsed": true, "gridPos": { - "h": 9, - "w": 8, + "h": 1, + "w": 24, "x": 0, - "y": 5 + "y": 6 }, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "id": 32, + "panels": [ { - "expr": "sum(rate(qed_balloon_add_total[30s])) by (job)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{job}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Add", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(qed_balloon_add_total[30s])) by (job)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Add", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(qed_balloon_membership_total[30s])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Membership", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(qed_balloon_incremental_total[30s])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Incremental", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(qed_sender_batches_sent_total[1m])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Batches sent by Sender", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "title": "Servers", + "type": "row" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "fill": 1, "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 5 - }, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(qed_balloon_membership_total[30s])) by (job)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{job}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Membership", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "h": 1, + "w": 24, + "x": 0, + "y": 7 }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "id": 30, + "title": "Agents", + "type": "row" }, { "aliasColors": {}, @@ -465,11 +677,11 @@ "fill": 1, "gridPos": { "h": 9, - "w": 8, - "x": 16, - "y": 5 + "w": 12, + "x": 0, + "y": 8 }, - "id": 12, + "id": 17, "legend": { "avg": false, "current": false, @@ -493,102 +705,32 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(qed_balloon_incremental_total[30s])) by (job)", + "expr": "sum(rate(qed_monitor_batches_received_total[1m])) by (job)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{job}}", + "legendFormat": "monitor", "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Incremental", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "fill": 1, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 14 - }, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "expr": "sum(rate(qed_auditor_batches_received_total[1m])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{job}}", + "refId": "B" + }, { - "expr": "sum(rate(qed_sender_batches_sent_total[1m])) by (job)", + "expr": "sum(rate(qed_publisher_batches_received_total[1m])) by (job)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{job}}", - "refId": "A" + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Batches sent by Sender", + "title": "Batches received", "tooltip": { "shared": true, "sort": 0, @@ -633,11 +775,11 @@ "fill": 1, "gridPos": { "h": 9, - "w": 8, - "x": 8, - "y": 14 + "w": 12, + "x": 12, + "y": 8 }, - "id": 17, + "id": 18, "legend": { "avg": false, "current": false, @@ -661,21 +803,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(qed_monitor_batches_received_total[1m])) by (job)", + "expr": "sum(rate(qed_monitor_batches_process_seconds[1m])) by (job)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{job}}", "refId": "A" }, { - "expr": "sum(rate(qed_auditor_batches_received_total[1m])) by (job)", + "expr": "sum(rate(qed_auditor_batches_process_seconds[1m])) by (job)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{job}}", "refId": "B" }, { - "expr": "sum(rate(qed_publisher_batches_received_total[1m])) by (job)", + "expr": "sum(rate(qed_auditor_batches_process_seconds[1m])) by (job)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{job}}", @@ -686,7 +828,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Batches received", + "title": "Batches process duration", "tooltip": { "shared": true, "sort": 0, @@ -724,105 +866,178 @@ } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "fill": 1, "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 14 + "h": 1, + "w": 24, + "x": 0, + "y": 17 }, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + "id": 28, + "title": "Store", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true }, - "lines": true, - "linewidth": 1, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 25, + "interval": null, "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", "targets": [ { - "expr": "sum(rate(qed_monitor_batches_process_seconds[1m])) by (job)", + "expr": "sum(qed_store_snapshots_received_total)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{job}}", "refId": "A" - }, + } + ], + "thresholds": "", + "title": "Snapshots saved", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ { - "expr": "sum(rate(qed_auditor_batches_process_seconds[1m])) by (job)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{job}}", - "refId": "B" + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 26, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 }, { - "expr": "sum(rate(qed_auditor_batches_process_seconds[1m])) by (job)", + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(qed_store_alerts_received_total)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{job}}", - "refId": "C" + "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Batches process duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, + "thresholds": "", + "title": "Alerts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "op": "=", + "text": "N/A", + "value": "null" } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "valueName": "avg" } ], - "refresh": "10s", + "refresh": false, "schemaVersion": 16, "style": "dark", "tags": [], @@ -830,8 +1045,8 @@ "list": [] }, "time": { - "from": "now-30m", - "to": "now" + "from": "2019-02-12T16:25:07.504Z", + "to": "2019-02-12T16:27:24.213Z" }, "timepicker": { "refresh_intervals": [ diff --git a/tests/e2e/test_service.go b/tests/e2e/test_service.go index 22980eff2..851ba061b 100644 --- a/tests/e2e/test_service.go +++ b/tests/e2e/test_service.go @@ -43,21 +43,29 @@ var ( Qed_store_snapshots_received_total = prometheus.NewCounter( prometheus.CounterOpts{ Name: "qed_store_snapshots_received_total", - Help: "Amount of snapshots received.", + Help: "Amount of snapshots received (POST from publishers).", }, ) - Qed_store_alerts_received_total = prometheus.NewCounter( + Qed_store_snapshots_retrieved_total = prometheus.NewCounter( prometheus.CounterOpts{ - Name: "qed_store_alerts_received_total", - Help: "Duration of alerts received.", + Name: "qed_store_snapshots_retrieved_total", + Help: "Amount of snapshots retrieved (GET from auditors).", + }, + ) + + Qed_store_alerts_generated_total = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "qed_store_alerts_generated_total", + Help: "Amount of alerts generated.", }, ) metricsList = []prometheus.Collector{ Qed_store_instances_count, Qed_store_snapshots_received_total, - Qed_store_alerts_received_total, + Qed_store_snapshots_retrieved_total, + Qed_store_alerts_generated_total, } ) @@ -193,6 +201,7 @@ func (s *Service) getSnapshotHandler() func(http.ResponseWriter, *http.Request) atomic.AddUint64(&s.stats.count[RPS], 1) atomic.AddUint64(&s.stats.count[SNAP], 1) if r.Method == "GET" { + Qed_store_snapshots_retrieved_total.Inc() q := r.URL.Query() version, err := strconv.ParseInt(q.Get("v"), 10, 64) if err != nil { @@ -235,7 +244,7 @@ func (s *Service) alertHandler() func(http.ResponseWriter, *http.Request) { } return } else if r.Method == "POST" { - Qed_store_alerts_received_total.Inc() + Qed_store_alerts_generated_total.Inc() buf, err := ioutil.ReadAll(r.Body) if err != nil { @@ -301,7 +310,7 @@ func (s *Service) Start(foreground bool) { router.HandleFunc("/snapshot", s.getSnapshotHandler()) router.HandleFunc("/alert", s.alertHandler()) - s.httpServer = &http.Server{Addr: "127.0.0.1:8888", Handler: router} + s.httpServer = &http.Server{Addr: ":8888", Handler: router} fmt.Println("Starting test service...") go func() { ticker := time.NewTicker(1 * time.Second)