diff --git a/build/grafana/dashboard-allocations.yaml b/build/grafana/dashboard-allocations.yaml new file mode 100644 index 0000000000..225341bdff --- /dev/null +++ b/build/grafana/dashboard-allocations.yaml @@ -0,0 +1,381 @@ +# Copyright 2018 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# configs map used by grafana +apiVersion: v1 +kind: ConfigMap +metadata: + name: agones-allocations + namespace: metrics + labels: + grafana_dashboard: "1" +data: + dashboard-agones-allocations.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Agones gameserver allocations", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 8, + "iteration": 1549050426140, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "agones_fleet_allocations_count{fleet_name=~\"$fleet\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "used {{fleet_name}}", + "refId": "A" + }, + { + "expr": "agones_gameservers_count{fleet_name=~\"$fleet\",type=\"Ready\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "free {{fleet_name}}", + "refId": "B" + }, + { + "expr": "", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GameServer allocations count by fleet", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(agones_fleet_allocations_count{fleet_name=~\"$fleet\"}[$interval])", + "format": "time_series", + 
"intervalFactor": 1, + "legendFormat": "{{fleet_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GameServer allocations rate by fleet", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "agones", + "nodes", + "controller", + "gameservers", + "allocations", + "fleets" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "definition": "label_values(agones_gameservers_count, fleet_name)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "fleet", + "options": [], + "query": "label_values(agones_gameservers_count, fleet_name)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "text": "1m", + "value": "1m" + }, + "hide": 0, + "label": null, + "name": "interval", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + 
"selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Agones Allocations", + "uid": "TfUv6ylmk", + "version": 13 + } \ No newline at end of file diff --git a/build/grafana/dashboard-gameservers.yaml b/build/grafana/dashboard-gameservers.yaml index 4bcb35ad36..2c1172678c 100644 --- a/build/grafana/dashboard-gameservers.yaml +++ b/build/grafana/dashboard-gameservers.yaml @@ -26,7 +26,6 @@ data: "annotations": { "list": [ { - "$$hashKey": "object:2841", "builtIn": 1, "datasource": "-- Grafana --", "enable": true, @@ -41,31 +40,83 @@ data: "editable": true, "gnetId": null, "graphTooltip": 0, - "iteration": 1547331274009, + "iteration": 1549049758587, "links": [], "panels": [ + { + "aliasColors": { + "Allocated": "#1f78c1", + "Creating": "#cca300", + "PortAllocation": "#e0f9d7", + "Ready": "#508642", + "Scheduled": "#f2c96d", + "Unhealthy": "#890f02" + }, + "breakPoint": "50%", + "cacheTimeout": null, + "combine": { + "label": "Others", + "threshold": 0 + }, + "fontSize": "100%", + "format": "short", + "gridPos": { + "h": 6, + "w": 7, + "x": 0, + "y": 0 + }, + "id": 4, + "interval": null, + "legend": { + "percentage": false, + "show": true, + "sideWidth": null, + "sortDesc": true, + "values": true + }, + "legendType": "Right side", + "links": [], + "maxDataPoints": 3, + "nullPointMode": "connected", + "pieType": "pie", + "strokeWidth": 1, + 
"targets": [ + { + "expr": "sum(agones_gameservers_count{fleet_name=~\"$fleet\"}) by (type)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "title": "GameServers count per type", + "transparent": false, + "type": "grafana-piechart-panel", + "valueName": "current" + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, - "fill": 10, + "fill": 1, "gridPos": { - "h": 7, - "w": 12, - "x": 0, + "h": 6, + "w": 17, + "x": 7, "y": 0 }, - "id": 7, + "id": 2, "legend": { "alignAsTable": true, "avg": true, - "current": false, + "current": true, "hideEmpty": true, "hideZero": true, - "max": true, - "min": true, + "max": false, + "min": false, "rightSide": true, "show": true, "total": false, @@ -81,12 +132,11 @@ data: "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "$$hashKey": "object:11157", - "expr": "sum(agones_gameservers_count{fleet_name=~\"$fleet\"}) by (type)", + "expr": "sum(rate(agones_gameservers_total{fleet_name=~\"$fleet\"}[$interval])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", @@ -95,8 +145,9 @@ data: ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "GameServers count per type", + "title": "GameServers rate per type", "tooltip": { "shared": true, "sort": 0, @@ -112,7 +163,6 @@ data: }, "yaxes": [ { - "$$hashKey": "object:11185", "format": "short", "label": null, "logBase": 1, @@ -121,7 +171,6 @@ data: "show": true }, { - "$$hashKey": "object:11186", "format": "short", "label": null, "logBase": 1, @@ -140,15 +189,14 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, - "fill": 1, + "fill": 10, "gridPos": { "h": 7, "w": 12, - "x": 12, - "y": 0 + "x": 0, + "y": 6 }, - "id": 2, + "id": 7, "legend": { "alignAsTable": true, "avg": true, @@ -172,12 
+220,11 @@ data: "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "$$hashKey": "object:11320", - "expr": "sum(rate(agones_gameservers_total{fleet_name=~\"$fleet\"}[$interval])) by (type)", + "expr": "sum(agones_gameservers_count{fleet_name=~\"$fleet\"}) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", @@ -186,8 +233,9 @@ data: ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "GameServers rate per type", + "title": "GameServers count per type", "tooltip": { "shared": true, "sort": 0, @@ -203,7 +251,6 @@ data: }, "yaxes": [ { - "$$hashKey": "object:11389", "format": "short", "label": null, "logBase": 1, @@ -212,7 +259,6 @@ data: "show": true }, { - "$$hashKey": "object:11390", "format": "short", "label": null, "logBase": 1, @@ -226,59 +272,6 @@ data: "alignLevel": null } }, - { - "aliasColors": { - "Allocated": "#1f78c1", - "Creating": "#cca300", - "PortAllocation": "#e0f9d7", - "Ready": "#508642", - "Scheduled": "#f2c96d", - "Unhealthy": "#890f02" - }, - "breakPoint": "50%", - "cacheTimeout": null, - "combine": { - "label": "Others", - "threshold": 0 - }, - "datasource": null, - "fontSize": "100%", - "format": "short", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 - }, - "id": 4, - "interval": null, - "legend": { - "percentage": false, - "show": true, - "sortDesc": true, - "values": true - }, - "legendType": "On graph", - "links": [], - "maxDataPoints": 3, - "nullPointMode": "connected", - "pieType": "pie", - "strokeWidth": 1, - "targets": [ - { - "expr": "sum(agones_gameservers_count{fleet_name=~\"$fleet\"}) by (type)", - "format": "time_series", - "instant": true, - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "title": "GameServers count per type", - "transparent": false, - "type": "grafana-piechart-panel", - "valueName": "current" - }, { "aliasColors": { 
"allocated": "#511749", @@ -289,13 +282,12 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "fill": 1, "gridPos": { - "h": 8, + "h": 7, "w": 12, "x": 12, - "y": 7 + "y": 6 }, "id": 9, "legend": { @@ -323,7 +315,6 @@ data: "steppedLine": false, "targets": [ { - "$$hashKey": "object:11806", "expr": "sum(agones_fleets_replicas_count{name=~\"$fleet\"}) by (type)", "format": "time_series", "intervalFactor": 1, @@ -333,6 +324,7 @@ data: ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Fleet Replicas Count", "tooltip": { @@ -350,7 +342,6 @@ data: }, "yaxes": [ { - "$$hashKey": "object:11834", "format": "short", "label": null, "logBase": 1, @@ -359,7 +350,6 @@ data: "show": true }, { - "$$hashKey": "object:11835", "format": "short", "label": null, "logBase": 1, @@ -378,22 +368,19 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "fill": 1, "gridPos": { - "h": 8, - "w": 12, + "h": 7, + "w": 11, "x": 0, - "y": 15 + "y": 13 }, - "id": 6, + "id": 12, "legend": { "alignAsTable": true, "avg": false, "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, + "max": false, "min": false, "rightSide": true, "show": true, @@ -414,18 +401,53 @@ data: "steppedLine": false, "targets": [ { - "$$hashKey": "object:11266", - "expr": "agones_fleet_allocations_count{fleet_name=~\"$fleet\"}", + "expr": "histogram_quantile(1, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{fleet_name}}", + "legendFormat": "max", + "refId": "F" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "99th", "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": 
"90th", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "50th", + "refId": "C" + }, + { + "expr": "histogram_quantile(0, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "min", + "refId": "E" + }, + { + "expr": " agones_gameservers_node_count_sum /\n agones_gameservers_node_count_count", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "avg", + "refId": "D" } ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "GameServer allocations count by fleet", + "title": "GameServers per node", "tooltip": { "shared": true, "sort": 0, @@ -467,22 +489,19 @@ data: "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, "fill": 1, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 15 + "h": 7, + "w": 13, + "x": 11, + "y": 13 }, - "id": 10, + "id": 14, "legend": { "alignAsTable": true, "avg": true, "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, + "max": true, "min": false, "rightSide": true, "show": true, @@ -499,22 +518,29 @@ data: "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, - "stack": false, + "stack": true, "steppedLine": false, "targets": [ { - "$$hashKey": "object:11266", - "expr": "rate(agones_fleet_allocations_count{fleet_name=~\"$fleet\"}[$interval])", + "expr": "agones_nodes_count{empty=\"true\"}", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{fleet_name}}", + "legendFormat": "unused", "refId": "A" + }, + { + "expr": "agones_nodes_count{empty=\"false\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "used", + "refId": "B" } ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "GameServer allocations rate by fleet", + "title": "Node availability (stacked)", 
"tooltip": { "shared": true, "sort": 0, @@ -571,6 +597,7 @@ data: ] }, "datasource": "Prometheus", + "definition": "label_values(agones_gameservers_count, fleet_name)\t", "hide": 0, "includeAll": true, "label": null, @@ -580,6 +607,7 @@ data: "query": "label_values(agones_gameservers_count, fleet_name)\t", "refresh": 2, "regex": "", + "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], @@ -657,6 +685,7 @@ data: ], "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", "refresh": 2, + "skipUrlSync": false, "type": "interval" } ] diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 3d2c05f85a..f5d6a6f7f7 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -143,7 +143,7 @@ func main() { // Add metrics controller only if we configure one of metrics exporters if ctlConf.PrometheusMetrics || ctlConf.Stackdriver { - rs = append(rs, metrics.NewController(kubeClient, agonesClient, agonesInformerFactory)) + rs = append(rs, metrics.NewController(kubeClient, agonesClient, kubeInformerFactory, agonesInformerFactory)) } server.Handle("/", health) diff --git a/pkg/metrics/controller.go b/pkg/metrics/controller.go index 3f9c321e5b..ebb76fad7f 100644 --- a/pkg/metrics/controller.go +++ b/pkg/metrics/controller.go @@ -20,6 +20,8 @@ import ( "sync" "time" + v1 "k8s.io/client-go/listers/core/v1" + stablev1alpha1 "agones.dev/agones/pkg/apis/stable/v1alpha1" "agones.dev/agones/pkg/client/clientset/versioned" "agones.dev/agones/pkg/client/informers/externalversions" @@ -32,13 +34,14 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" ) var ( // MetricResyncPeriod is the interval to re-synchronize metrics based on indexed cache. 
- MetricResyncPeriod = time.Second * 1 + MetricResyncPeriod = time.Second * 15 ) func init() { @@ -50,10 +53,12 @@ type Controller struct { logger *logrus.Entry gameServerLister listerv1alpha1.GameServerLister faLister listerv1alpha1.FleetAllocationLister + nodeLister v1.NodeLister gameServerSynced cache.InformerSynced fleetSynced cache.InformerSynced fasSynced cache.InformerSynced faSynced cache.InformerSynced + nodeSynced cache.InformerSynced lock sync.Mutex gsCount GameServerCount faCount map[string]int64 @@ -63,6 +68,7 @@ type Controller struct { func NewController( kubeClient kubernetes.Interface, agonesClient versioned.Interface, + kubeInformerFactory informers.SharedInformerFactory, agonesInformerFactory externalversions.SharedInformerFactory) *Controller { gameServer := agonesInformerFactory.Stable().V1alpha1().GameServers() @@ -74,14 +80,18 @@ func NewController( fInformer := fleets.Informer() fas := agonesInformerFactory.Stable().V1alpha1().FleetAutoscalers() fasInformer := fas.Informer() + node := kubeInformerFactory.Core().V1().Nodes() + nodeInformer := node.Informer() c := &Controller{ gameServerLister: gameServer.Lister(), - gameServerSynced: gsInformer.HasSynced, + nodeLister: node.Lister(), faLister: fa.Lister(), + gameServerSynced: gsInformer.HasSynced, fleetSynced: fInformer.HasSynced, fasSynced: fasInformer.HasSynced, faSynced: faInformer.HasSynced, + nodeSynced: nodeInformer.HasSynced, gsCount: GameServerCount{}, faCount: map[string]int64{}, } @@ -300,6 +310,7 @@ func (c *Controller) collect() { defer c.lock.Unlock() c.collectGameServerCounts() c.collectFleetAllocationCounts() + c.collectNodeCounts() } // collects fleet allocations count by going through our informer cache @@ -312,6 +323,7 @@ func (c *Controller) collectFleetAllocationCounts() { fleetAllocations, err := c.faLister.List(labels.Everything()) if err != nil { c.logger.WithError(err).Warn("failed listing fleet allocations") + return } for _, fa := range fleetAllocations { @@ -331,9 
+343,41 @@ func (c *Controller) collectGameServerCounts() { gameservers, err := c.gameServerLister.List(labels.Everything()) if err != nil { c.logger.WithError(err).Warn("failed listing gameservers") + return } if err := c.gsCount.record(gameservers); err != nil { c.logger.WithError(err).Warn("error while recoding stats") } } + +// collectNodeCounts counts gameservers per node and empty/used nodes using the informer caches. +func (c *Controller) collectNodeCounts() { + gsPerNodes := map[string]int32{} + + gameservers, err := c.gameServerLister.List(labels.Everything()) + if err != nil { + c.logger.WithError(err).Warn("failed listing gameservers") + return + } + for _, gs := range gameservers { + if gs.Status.NodeName != "" { + gsPerNodes[gs.Status.NodeName]++ + } + } + + nodes, err := c.nodeLister.List(labels.Everything()) + if err != nil { + c.logger.WithError(err).Warn("failed listing nodes") + return + } + recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")}, + nodesCountStats.M(int64(len(nodes)-len(gsPerNodes)))) + recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")}, + nodesCountStats.M(int64(len(gsPerNodes)))) + + for _, node := range nodes { + stats.Record(context.Background(), gsPerNodesCountStats.M(int64(gsPerNodes[node.Name]))) + } + +} diff --git a/pkg/metrics/controller_metrics.go b/pkg/metrics/controller_metrics.go index d489348efe..b2de06320b 100644 --- a/pkg/metrics/controller_metrics.go +++ b/pkg/metrics/controller_metrics.go @@ -32,6 +32,8 @@ var ( fleetAllocationCountStats = stats.Int64("fleet_allocations/count", "The count of fleet allocations", "1") gameServerTotalStats = stats.Int64("gameservers/total", "The total of gameservers", "1") fleetAllocationTotalStats = stats.Int64("fleet_allocations/total", "The total of fleet allocations", "1") + nodesCountStats = stats.Int64("nodes/count", "The count of nodes in the cluster", "1") + gsPerNodesCountStats = stats.Int64("gameservers_node/count", "The count of 
gameservers per node in the cluster", "1") stateViews = []*view.View{ &view.View{ @@ -111,6 +113,19 @@ var ( Aggregation: view.Count(), TagKeys: []tag.Key{keyType, keyFleetName}, }, + &view.View{ + Name: "nodes_count", + Measure: nodesCountStats, + Description: "The count of nodes in the cluster", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyEmpty}, + }, + &view.View{ + Name: "gameservers_node_count", + Measure: gsPerNodesCountStats, + Description: "The count of gameservers per node in the cluster", + Aggregation: view.Distribution(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, 40, 50, 60, 70, 80, 90, 100, 110, 120), + }, } ) diff --git a/pkg/metrics/controller_test.go b/pkg/metrics/controller_test.go index c4ade4587a..b47cc1c22f 100644 --- a/pkg/metrics/controller_test.go +++ b/pkg/metrics/controller_test.go @@ -34,7 +34,7 @@ func TestControllerGameServerCount(t *testing.T) { c := newFakeController() defer c.close() - gs1 := gameServer("test-fleet", v1alpha1.GameServerStateCreating) + gs1 := gameServerWithFleetAndState("test-fleet", v1alpha1.GameServerStateCreating) c.gsWatch.Add(gs1) gs1 = gs1.DeepCopy() gs1.Status.State = v1alpha1.GameServerStateReady @@ -47,8 +47,8 @@ func TestControllerGameServerCount(t *testing.T) { gs1 = gs1.DeepCopy() gs1.Status.State = v1alpha1.GameServerStateShutdown c.gsWatch.Modify(gs1) - c.gsWatch.Add(gameServer("", v1alpha1.GameServerStatePortAllocation)) - c.gsWatch.Add(gameServer("", v1alpha1.GameServerStatePortAllocation)) + c.gsWatch.Add(gameServerWithFleetAndState("", v1alpha1.GameServerStatePortAllocation)) + c.gsWatch.Add(gameServerWithFleetAndState("", v1alpha1.GameServerStatePortAllocation)) c.sync() c.collect() @@ -109,7 +109,7 @@ func TestControllerFleetAllocationTotal(t *testing.T) { fa.Status.GameServer = nil c.faWatch.Add(fa) faUpdated := fa.DeepCopy() - faUpdated.Status.GameServer = gameServer("test", v1alpha1.GameServerStateAllocated) + faUpdated.Status.GameServer = 
gameServerWithFleetAndState("test", v1alpha1.GameServerStateAllocated) c.faWatch.Modify(faUpdated) // make sure we count only one event c.faWatch.Modify(faUpdated) @@ -119,7 +119,7 @@ func TestControllerFleetAllocationTotal(t *testing.T) { fa.Status.GameServer = nil c.faWatch.Add(fa) faUpdated := fa.DeepCopy() - faUpdated.Status.GameServer = gameServer("test2", v1alpha1.GameServerStateAllocated) + faUpdated.Status.GameServer = gameServerWithFleetAndState("test2", v1alpha1.GameServerStateAllocated) c.faWatch.Modify(faUpdated) } c.sync() @@ -139,7 +139,7 @@ func TestControllerGameServersTotal(t *testing.T) { c.run(t) // deleted gs should not be counted - gs := gameServer("deleted", v1alpha1.GameServerStateCreating) + gs := gameServerWithFleetAndState("deleted", v1alpha1.GameServerStateCreating) c.gsWatch.Add(gs) c.gsWatch.Delete(gs) @@ -223,3 +223,26 @@ func TestControllerFleetAutoScalerState(t *testing.T) { "agones_fleet_autoscalers_current_replicas_count", "agones_fleet_autoscalers_desired_replicas_count", "agones_fleet_autoscalers_limited")) } + +func TestControllerGameServersNodeState(t *testing.T) { + registry := prometheus.NewRegistry() + _, err := RegisterPrometheusExporter(registry) + assert.Nil(t, err) + + c := newFakeController() + defer c.close() + + c.nodeWatch.Add(nodeWithName("node1")) + c.nodeWatch.Add(nodeWithName("node2")) + c.nodeWatch.Add(nodeWithName("node3")) + c.gsWatch.Add(gameServerWithNode("node1")) + c.gsWatch.Add(gameServerWithNode("node2")) + c.gsWatch.Add(gameServerWithNode("node2")) + + c.sync() + c.collect() + report() + + assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(nodeCountExpected), "agones_nodes_count", "agones_gameservers_node_count")) + +} diff --git a/pkg/metrics/exporter.go b/pkg/metrics/exporter.go index 4456f67696..110eb53a4a 100644 --- a/pkg/metrics/exporter.go +++ b/pkg/metrics/exporter.go @@ -68,7 +68,7 @@ func RegisterStackdriverExporter(projectID string) (sd *stackdriver.Exporter, er // we are 
going to use func SetReportingPeriod(prometheus, stackdriver bool) { // if we're using only prometheus we can report faster as we're only exposing metrics in memory - reportingPeriod := 1 * time.Second + reportingPeriod := 15 * time.Second if stackdriver { // There is a limitation on Stackdriver that reporting should // be equal or more than 1 minute diff --git a/pkg/metrics/util.go b/pkg/metrics/util.go index 8a5837cb00..46c01baad3 100644 --- a/pkg/metrics/util.go +++ b/pkg/metrics/util.go @@ -31,6 +31,7 @@ var ( keyStatusCode = mustTagKey("status_code") keyVerb = mustTagKey("verb") keyEndpoint = mustTagKey("endpoint") + keyEmpty = mustTagKey("empty") ) func recordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) { diff --git a/pkg/metrics/util_test.go b/pkg/metrics/util_test.go index 4245ec28a1..87970add10 100644 --- a/pkg/metrics/util_test.go +++ b/pkg/metrics/util_test.go @@ -18,6 +18,8 @@ import ( "context" "testing" + v1 "k8s.io/api/core/v1" + "agones.dev/agones/pkg/apis/stable/v1alpha1" agtesting "agones.dev/agones/pkg/testing" "github.com/stretchr/testify/assert" @@ -33,19 +35,21 @@ import ( // newFakeController returns a controller, backed by the fake Clientset func newFakeController() *fakeController { m := agtesting.NewMocks() - c := NewController(m.KubeClient, m.AgonesClient, m.AgonesInformerFactory) + c := NewController(m.KubeClient, m.AgonesClient, m.KubeInformerFactory, m.AgonesInformerFactory) gsWatch := watch.NewFake() faWatch := watch.NewFake() fasWatch := watch.NewFake() fleetWatch := watch.NewFake() + nodeWatch := watch.NewFake() m.AgonesClient.AddWatchReactor("gameservers", k8stesting.DefaultWatchReactor(gsWatch, nil)) m.AgonesClient.AddWatchReactor("fleetallocations", k8stesting.DefaultWatchReactor(faWatch, nil)) m.AgonesClient.AddWatchReactor("fleetautoscalers", k8stesting.DefaultWatchReactor(fasWatch, nil)) m.AgonesClient.AddWatchReactor("fleets", k8stesting.DefaultWatchReactor(fleetWatch, nil)) + 
m.KubeClient.AddWatchReactor("nodes", k8stesting.DefaultWatchReactor(nodeWatch, nil)) stop, cancel := agtesting.StartInformers(m, c.gameServerSynced, c.faSynced, - c.fleetSynced, c.fasSynced) + c.fleetSynced, c.fasSynced, c.nodeSynced) return &fakeController{ Controller: c, @@ -54,6 +58,7 @@ func newFakeController() *fakeController { faWatch: faWatch, fasWatch: fasWatch, fleetWatch: fleetWatch, + nodeWatch: nodeWatch, cancel: cancel, stop: stop, } @@ -93,11 +98,27 @@ type fakeController struct { faWatch *watch.FakeWatcher fasWatch *watch.FakeWatcher fleetWatch *watch.FakeWatcher + nodeWatch *watch.FakeWatcher stop <-chan struct{} cancel context.CancelFunc } -func gameServer(fleetName string, state v1alpha1.GameServerState) *v1alpha1.GameServer { +func nodeWithName(name string) *v1.Node { + return &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + UID: uuid.NewUUID(), + }, + } +} + +func gameServerWithNode(nodeName string) *v1alpha1.GameServer { + gs := gameServerWithFleetAndState("fleet", v1alpha1.GameServerStateReady) + gs.Status.NodeName = nodeName + return gs +} + +func gameServerWithFleetAndState(fleetName string, state v1alpha1.GameServerState) *v1alpha1.GameServer { lbs := map[string]string{} if fleetName != "" { lbs[v1alpha1.FleetNameLabel] = fleetName @@ -118,7 +139,7 @@ func gameServer(fleetName string, state v1alpha1.GameServerState) *v1alpha1.Game func generateGsEvents(count int, state v1alpha1.GameServerState, fleetName string, fakew *watch.FakeWatcher) { for i := 0; i < count; i++ { - gs := gameServer(fleetName, v1alpha1.GameServerState("")) + gs := gameServerWithFleetAndState(fleetName, v1alpha1.GameServerState("")) fakew.Add(gs) gsUpdated := gs.DeepCopy() gsUpdated.Status.State = state @@ -139,7 +160,7 @@ func fleetAllocation(fleetName string) *v1alpha1.FleetAllocation { FleetName: fleetName, }, Status: v1alpha1.FleetAllocationStatus{ - GameServer: gameServer(fleetName, v1alpha1.GameServerStateAllocated), + GameServer: 
gameServerWithFleetAndState(fleetName, v1alpha1.GameServerStateAllocated), }, } } @@ -266,3 +287,40 @@ agones_fleet_autoscalers_limited{fleet_name="first-fleet",name="name-switch"} 0 agones_fleet_autoscalers_limited{fleet_name="second-fleet",name="name-switch"} 1 agones_fleet_autoscalers_limited{fleet_name="deleted-fleet",name="deleted"} 0 ` + +var nodeCountExpected = `# HELP agones_gameservers_node_count The count of gameservers per node in the cluster +# TYPE agones_gameservers_node_count histogram +agones_gameservers_node_count_bucket{le="1"} 3 +agones_gameservers_node_count_bucket{le="2"} 4 +agones_gameservers_node_count_bucket{le="3"} 5 +agones_gameservers_node_count_bucket{le="4"} 5 +agones_gameservers_node_count_bucket{le="5"} 5 +agones_gameservers_node_count_bucket{le="6"} 5 +agones_gameservers_node_count_bucket{le="7"} 5 +agones_gameservers_node_count_bucket{le="8"} 5 +agones_gameservers_node_count_bucket{le="9"} 5 +agones_gameservers_node_count_bucket{le="10"} 5 +agones_gameservers_node_count_bucket{le="11"} 5 +agones_gameservers_node_count_bucket{le="12"} 5 +agones_gameservers_node_count_bucket{le="13"} 5 +agones_gameservers_node_count_bucket{le="14"} 5 +agones_gameservers_node_count_bucket{le="15"} 5 +agones_gameservers_node_count_bucket{le="16"} 5 +agones_gameservers_node_count_bucket{le="32"} 5 +agones_gameservers_node_count_bucket{le="40"} 5 +agones_gameservers_node_count_bucket{le="50"} 5 +agones_gameservers_node_count_bucket{le="60"} 5 +agones_gameservers_node_count_bucket{le="70"} 5 +agones_gameservers_node_count_bucket{le="80"} 5 +agones_gameservers_node_count_bucket{le="90"} 5 +agones_gameservers_node_count_bucket{le="100"} 5 +agones_gameservers_node_count_bucket{le="110"} 5 +agones_gameservers_node_count_bucket{le="120"} 5 +agones_gameservers_node_count_bucket{le="+Inf"} 5 +agones_gameservers_node_count_sum 3 +agones_gameservers_node_count_count 5 +# HELP agones_nodes_count The count of nodes in the cluster +# TYPE agones_nodes_count gauge 
+agones_nodes_count{empty="false"} 2 +agones_nodes_count{empty="true"} 1 +` diff --git a/site/content/en/docs/Guides/metrics.md b/site/content/en/docs/Guides/metrics.md index fd9a0e462b..65aca7c820 100644 --- a/site/content/en/docs/Guides/metrics.md +++ b/site/content/en/docs/Guides/metrics.md @@ -54,19 +54,21 @@ Follow the [Stackdriver Installation steps](#stackdriver-installation) to see yo ## Metrics available -| Name | Description | Type | -|-------------------------------------------------|---------------------------------------------------------------------|---------| -| agones_gameservers_count | The number of gameservers per fleet and status | gauge | -| agones_fleet_allocations_count | The number of fleet allocations per fleet | gauge | -| agones_gameservers_total | The total of gameservers per fleet and status | counter | -| agones_fleet_allocations_total | The total of fleet allocations per fleet | counter | -| agones_fleets_replicas_count | The number of replicas per fleet (total, desired, ready, allocated) | gauge | -| agones_fleet_autoscalers_able_to_scale | The fleet autoscaler can access the fleet to scale | gauge | -| agones_fleet_autoscalers_buffer_limits | he limits of buffer based fleet autoscalers (min, max) | gauge | -| agones_fleet_autoscalers_buffer_size | The buffer size of fleet autoscalers (count or percentage) | gauge | -| agones_fleet_autoscalers_current_replicas_count | The current replicas count as seen by autoscalers | gauge | -| agones_fleet_autoscalers_desired_replicas_count | The desired replicas count as seen by autoscalers | gauge | -| agones_fleet_autoscalers_limited | The fleet autoscaler is capped (1) | gauge | +| Name | Description | Type | +|-------------------------------------------------|---------------------------------------------------------------------|-----------| +| agones_gameservers_count | The number of gameservers per fleet and status | gauge | +| agones_fleet_allocations_count | The number of fleet allocations 
per fleet | gauge | +| agones_gameservers_total | The total of gameservers per fleet and status | counter | +| agones_fleet_allocations_total | The total of fleet allocations per fleet | counter | +| agones_fleets_replicas_count | The number of replicas per fleet (total, desired, ready, allocated) | gauge | +| agones_fleet_autoscalers_able_to_scale | The fleet autoscaler can access the fleet to scale | gauge | +| agones_fleet_autoscalers_buffer_limits | The limits of buffer based fleet autoscalers (min, max) | gauge | +| agones_fleet_autoscalers_buffer_size | The buffer size of fleet autoscalers (count or percentage) | gauge | +| agones_fleet_autoscalers_current_replicas_count | The current replicas count as seen by autoscalers | gauge | +| agones_fleet_autoscalers_desired_replicas_count | The desired replicas count as seen by autoscalers | gauge | +| agones_fleet_autoscalers_limited | The fleet autoscaler is capped (1) | gauge | +| agones_gameservers_node_count | The distribution of gameservers per node | histogram | +| agones_nodes_count | The count of nodes empty and with gameservers | gauge | ## Dashboard @@ -78,11 +80,16 @@ We provide a set of useful [Grafana](https://grafana.com/) dashboards to monitor - {{< ghlink href="/build/grafana/dashboard-gameservers.yaml" branch="master" >}}Agones GameServers{{< /ghlink >}} displays your current game servers workload status (allocations , game servers statuses, fleets replicas) with optional fleet name filtering. +{{% feature publishVersion="0.8.0" %}} +- {{< ghlink href="/build/grafana/dashboard-allocations.yaml" branch="master" >}}Agones GameServer Allocations{{< /ghlink >}} displays Agones gameserver allocation rates and counts per fleet. +{{% /feature %}} + - {{< ghlink href="/build/grafana/dashboard-status.yaml" branch="master" >}}Agones Status{{< /ghlink >}} displays Agones controller health status.
- {{< ghlink href="/build/grafana/dashboard-controller-usage.yaml" branch="master" >}}Agones Controller Resource Usage{{< /ghlink >}} displays Agones Controller CPU and memory usage and also some Golang runtime metrics. {{% feature publishVersion="0.8.0" %}} + - {{< ghlink href="/build/grafana/dashboard-goclient-requests.yaml" branch="master" >}}Agones Controller go-client requests{{< /ghlink >}} displays Agones Controller Kubernetes API consumption. - {{< ghlink href="/build/grafana/dashboard-goclient-caches.yaml" branch="master" >}}Agones Controller go-client caches{{< /ghlink >}} displays Agones Controller Kubernetes Watches/Lists operations used.