From 9283e2f39c57bc79806adec76bb70f82375a149b Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Thu, 15 Sep 2022 15:01:33 -0700 Subject: [PATCH 01/22] Added GameServer control performance metrics; Updated Grafana dashboard --- .gitignore | 5 +- pkg/operator/api/v1alpha1/gameserver_types.go | 2 + .../controllers/gameserverbuild_controller.go | 70 +- pkg/operator/controllers/metrics.go | 32 + samples/grafana/dashboard.json | 1607 +++++++++++------ 5 files changed, 1138 insertions(+), 578 deletions(-) diff --git a/.gitignore b/.gitignore index b3b00e54..d2ee2c4f 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,7 @@ installfilesdev -.uptodate \ No newline at end of file +.uptodate + +# vscode settings +.vscode \ No newline at end of file diff --git a/pkg/operator/api/v1alpha1/gameserver_types.go b/pkg/operator/api/v1alpha1/gameserver_types.go index 554ff2ea..9504883d 100644 --- a/pkg/operator/api/v1alpha1/gameserver_types.go +++ b/pkg/operator/api/v1alpha1/gameserver_types.go @@ -80,6 +80,8 @@ type GameServerStatus struct { Health GameServerHealth `json:"health,omitempty"` // State defines the state of the game server (Initializing, StandingBy, Active etc.) 
State GameServerState `json:"state,omitempty"` + // PrevState is the previously known, manually set state of the game server + PrevState GameServerState `json:"prevstate,omitempty"` // PublicIP is the PublicIP of the game server PublicIP string `json:"publicIP,omitempty"` // Ports is a concatenated list of the ports this game server listens to diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 80ab5dd9..a31f21ee 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -23,6 +23,7 @@ import ( "runtime" "sort" "sync" + "time" mpsv1alpha1 "github.com/playfab/thundernetes/pkg/operator/api/v1alpha1" corev1 "k8s.io/api/core/v1" @@ -140,6 +141,16 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // calculate counts by state so we can update .status accordingly var activeCount, standingByCount, crashesCount, initializingCount, pendingCount int + // Gather sum of time taken to reach standingby phase and server count to produce the recent average gameserver initialization time + var timeToStandBySum float64 + var recentStandingByCount int + timeToStandBySum = 0 + + // Gather current sum of estimated time taken to clean up crashed or pending deletion gameservers + var timeToDeleteBySum float64 + var pendingCleanUpCount int + timeToStandBySum = 0 + for i := 0; i < len(gameServers.Items); i++ { gs := gameServers.Items[i] @@ -149,6 +160,10 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ initializingCount++ } else if gs.Status.State == mpsv1alpha1.GameServerStateStandingBy && gs.Status.Health == mpsv1alpha1.GameServerHealthy { standingByCount++ + if gs.Status.PrevState != gs.Status.State { + timeToStandBySum += float64(gs.Status.ReachedStandingByOn.Sub(gs.CreationTimestamp.Time).Milliseconds()) + recentStandingByCount++ + } } else if gs.Status.State == mpsv1alpha1.GameServerStateActive && 
gs.Status.Health == mpsv1alpha1.GameServerHealthy { activeCount++ } else if gs.Status.State == mpsv1alpha1.GameServerStateGameCompleted && gs.Status.Health == mpsv1alpha1.GameServerHealthy { @@ -156,9 +171,13 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.Delete(ctx, &gs); err != nil { return ctrl.Result{}, err } + GameServersSessionEndedCounter.WithLabelValues(gsb.Name).Inc() r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Exited", "GameServer %s session completed", gs.Name) + + pendingCleanUpCount++ + timeToDeleteBySum += math.Abs(float64(time.Until(gs.DeletionTimestamp.Time).Milliseconds())) } else if gs.Status.State == mpsv1alpha1.GameServerStateCrashed { // game server process exited with code != 0 (crashed) crashesCount++ @@ -168,6 +187,9 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ GameServersCrashedCounter.WithLabelValues(gsb.Name).Inc() r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Unhealthy", "GameServer %s was deleted because it became unhealthy, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) + + pendingCleanUpCount++ + timeToDeleteBySum += math.Abs(float64(time.Until(gs.DeletionTimestamp.Time).Milliseconds())) } else if gs.Status.Health == mpsv1alpha1.GameServerUnhealthy { // all cases where the game server was marked as Unhealthy crashesCount++ @@ -177,25 +199,37 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ GameServersUnhealthyCounter.WithLabelValues(gsb.Name).Inc() r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Crashed", "GameServer %s was deleted because it crashed, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) + + pendingCleanUpCount++ + timeToDeleteBySum += 
math.Abs(float64(time.Until(gs.DeletionTimestamp.Time).Milliseconds())) + } + if gs.Status.State != gs.Status.PrevState { + gs.Status.PrevState = gs.Status.State } } + if recentStandingByCount > 0 { + GameServersCreatedDuration.WithLabelValues(gsb.Name).Set(timeToStandBySum / float64(recentStandingByCount)) + } + + if pendingCleanUpCount > 0 { + GameServersCleanUpDuration.WithLabelValues(gsb.Name).Set(timeToDeleteBySum / float64(pendingCleanUpCount)) + } + // calculate the total amount of servers not in the active state nonActiveGameServersCount := standingByCount + initializingCount + pendingCount // user has decreased standingBy numbers if nonActiveGameServersCount > gsb.Spec.StandingBy { - totalNumberOfGameServersToDelete := int(math.Min(float64(nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete)) - err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete) - if err != nil { - return ctrl.Result{}, err + var totalNumberOfGameServersToDelete int + + // we also need to check if we are above the max + // this can happen if the user modifies the spec.Max during the GameServerBuild's lifetime + if nonActiveGameServersCount+activeCount > gsb.Spec.Max { + totalNumberOfGameServersToDelete += int(math.Min(float64(nonActiveGameServersCount+activeCount-gsb.Spec.Max), maxNumberOfGameServersToDelete)) } - } - // we need to check if we are above the max - // this can happen if the user modifies the spec.Max during the GameServerBuild's lifetime - if nonActiveGameServersCount+activeCount > gsb.Spec.Max { - totalNumberOfGameServersToDelete := int(math.Min(float64(nonActiveGameServersCount+activeCount-gsb.Spec.Max), maxNumberOfGameServersToDelete)) + totalNumberOfGameServersToDelete = int(math.Min(float64(totalNumberOfGameServersToDelete+nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete)) err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete) if 
err != nil { return ctrl.Result{}, err @@ -207,13 +241,16 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // we attempt to create the missing number of game servers, but we don't want to create more than the max // an error channel for the go routines to write errors errCh := make(chan error, maxNumberOfGameServersToAdd) + + // Time how long it takes to trigger new standby gameservers + standByReconcileStartTime := time.Now() // a waitgroup for async create calls var wg sync.WaitGroup for i := 0; i < gsb.Spec.StandingBy-nonActiveGameServersCount && i+nonActiveGameServersCount+activeCount < gsb.Spec.Max && i < maxNumberOfGameServersToAdd; i++ { wg.Add(1) - go func() { + go func(standByStartTime time.Time) { defer wg.Done() newgs, err := NewGameServerForGameServerBuild(&gsb, r.PortRegistry) if err != nil { @@ -224,12 +261,15 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ errCh <- err return } + newgs.Status.PrevState = mpsv1alpha1.GameServerStateInitializing r.expectations.addGameServerToUnderCreationMap(gsb.Name, newgs.Name) GameServersCreatedCounter.WithLabelValues(gsb.Name).Inc() r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Creating", "Creating GameServer %s", newgs.Name) - }() + GameServersStandByReconcileDuration.WithLabelValues(gsb.Name).Set(float64(time.Since(standByStartTime).Milliseconds())) + }(standByReconcileStartTime) } wg.Wait() + if len(errCh) > 0 { return ctrl.Result{}, <-errCh } @@ -325,6 +365,8 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte // a waitgroup for async deletion calls var wg sync.WaitGroup deletionCalls := 0 + deletionStartTime := time.Now() + // we sort the GameServers by state so that we can delete the ones that are empty state or Initializing before we delete the StandingBy ones (if needed) // this is to make sure we don't fall below the desired number of StandingBy during scaling down sort.Sort(ByState(gameServers.Items)) 
@@ -334,7 +376,7 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte if gs.Status.State == "" || gs.Status.State == mpsv1alpha1.GameServerStateInitializing || gs.Status.State == mpsv1alpha1.GameServerStateStandingBy { deletionCalls++ wg.Add(1) - go func() { + go func(deletionStartTime time.Time) { defer wg.Done() if err := r.deleteGameServer(ctx, &gs); err != nil { if apierrors.IsConflict(err) { // this GameServer has been updated, skip it @@ -346,7 +388,9 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte GameServersDeletedCounter.WithLabelValues(gsb.Name).Inc() r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) r.Recorder.Eventf(gsb, corev1.EventTypeNormal, "GameServer deleted", "GameServer %s deleted", gs.Name) - }() + duration := time.Since(deletionStartTime).Milliseconds() + GameServersEndedDuration.WithLabelValues(gsb.Name).Set(float64(duration)) + }(deletionStartTime) } } wg.Wait() diff --git a/pkg/operator/controllers/metrics.go b/pkg/operator/controllers/metrics.go index 13f14aa7..fa100d3e 100644 --- a/pkg/operator/controllers/metrics.go +++ b/pkg/operator/controllers/metrics.go @@ -24,6 +24,22 @@ var ( }, []string{"BuildName"}, ) + GameServersCreatedDuration = registry.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "thundernetes", + Name: "gameservers_create_duration", + Help: "Average time it took to create a the newest set of GameServers", + }, + []string{"BuildName"}, + ) + GameServersStandByReconcileDuration = registry.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "thundernetes", + Name: "gameservers_reconcile_standby_duration", + Help: "Time it took to begin initialization for all new GameServers", + }, + []string{"BuildName"}, + ) GameServersSessionEndedCounter = registry.NewCounterVec( prometheus.CounterOpts{ Namespace: "thundernetes", @@ -32,6 +48,22 @@ var ( }, []string{"BuildName"}, ) + GameServersEndedDuration = registry.NewGaugeVec( + prometheus.GaugeOpts{ + 
Namespace: "thundernetes", + Name: "gameservers_end_duration", + Help: "Time it took to delete a set of non-active GameServers", + }, + []string{"BuildName"}, + ) + GameServersCleanUpDuration = registry.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "thundernetes", + Name: "gameservers_clean_up_duration", + Help: "Average time it took to clean up all completed or unhealthy GameServers", + }, + []string{"BuildName"}, + ) GameServersCrashedCounter = registry.NewCounterVec( prometheus.CounterOpts{ Namespace: "thundernetes", diff --git a/samples/grafana/dashboard.json b/samples/grafana/dashboard.json index 255b46c4..ec0b0523 100644 --- a/samples/grafana/dashboard.json +++ b/samples/grafana/dashboard.json @@ -3,7 +3,10 @@ "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", @@ -21,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 26, + "id": 27, "links": [], "liveNow": false, "panels": [ @@ -75,9 +78,13 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.4.5", + "pluginVersion": "9.1.0", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "exemplar": true, "expr": "sum(thundernetes_gameservers_current_state_per_build{state=\"active\"})", "interval": "", @@ -138,9 +145,13 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.4.5", + "pluginVersion": "9.1.0", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "exemplar": true, "expr": "sum(thundernetes_gameservers_current_state_per_build{state=\"standingby\"})", "interval": "", @@ -201,7 +212,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.4.5", + "pluginVersion": "9.1.0", "targets": [ { "datasource": { @@ -268,7 +279,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.4.5", + "pluginVersion": "9.1.0", "targets": [ { "datasource": { @@ 
-286,7 +297,11 @@ "type": "stat" }, { - "collapsed": false, + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, @@ -294,484 +309,933 @@ "y": 8 }, "id": 20, - "panels": [], - "title": "Status", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineStyle": { - "fill": "solid" - }, - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ + "overrides": [ { - "color": "green", - "value": 
null + "matcher": { + "id": "byName", + "options": "gameserverbuild-sample-openarena-StandBy" + }, + "properties": [ + { + "id": "custom.fillBelowTo", + "value": "gameserverbuild-sample-openarena-Active" + }, + { + "id": "custom.fillOpacity", + "value": 6 + } + ] } ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "gameserverbuild-sample-openarena-StandBy" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "properties": [ - { - "id": "custom.fillBelowTo", - "value": "gameserverbuild-sample-openarena-Active" + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - { - "id": "custom.fillOpacity", - "value": 6 - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" + "exemplar": true, + "expr": "sum(thundernetes_gameservers_current_state_per_build) by (state)", + "hide": false, + "interval": "", + "legendFormat": "{{state}}", + "refId": "B" + } + ], + "title": "Game Server History", + "type": "timeseries" }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ { - "exemplar": true, - "expr": "sum(thundernetes_gameservers_current_state_per_build) by (state)", - "hide": false, - "interval": "", - "legendFormat": "{{state}}", - "refId": "B" - } - ], - "title": "Game Server History", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, 
- "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "id": 18, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" + "exemplar": true, + "expr": "sum(thundernetes_allocations_total{}) by (BuildName)", + "instant": false, + "interval": "", + 
"legendFormat": "{{ BuildName }}", + "refId": "A" + } + ], + "title": "Total allocations time series", + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "exemplar": true, - "expr": "sum(thundernetes_allocations_total{}) by (BuildName)", - "instant": false, - "interval": "", - "legendFormat": "{{ BuildName }}", - "refId": "A" - } - ], - "title": "Total allocations time series", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - 
"scaleDistribution": { - "type": "linear" + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "sum(thundernetes_gameservers_current_state_per_build{state=\"active\"}) by (BuildName)", + "instant": false, + "interval": "", + "legendFormat": "{{ BuildName }}", + "refId": "A" + } + ], + "title": "Active servers time series", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - { - "color": "red", - 
"value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 23, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" + "exemplar": true, + "expr": "sum(thundernetes_gameservers_current_state_per_build{state=\"standingby\"}) by (BuildName)", + "instant": false, + "interval": "", + "legendFormat": "{{ BuildName }}", + "refId": "A" + } + ], + "title": "Standby servers time series", + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "exemplar": true, - "expr": "sum(thundernetes_gameservers_current_state_per_build{state=\"active\"}) by (BuildName)", - "instant": false, - "interval": "", - "legendFormat": "{{ BuildName }}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 6, + "y": 25 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "sum(thundernetes_connected_players) by (BuildName)", + "instant": false, + "interval": "", + "legendFormat": "{{ BuildName}}", + "refId": "A" + } + ], + "title": "Connected players time series", + "type": "timeseries" + } + ], + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "refId": "A" } ], - "title": "Active servers time series", - "type": "timeseries" + "title": "Status", + "type": "row" }, { + "collapsed": true, "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "type": "datasource", + "uid": "grafana" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 9, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + 
"value": 80 + } + ] + }, + "unit": "percentunit" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "multi", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - { - "color": "red", - "value": 80 - } - ] - } + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"gameserverbuild.+\"}[5m])) by (pod) /\nsum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{pod=~\"gameserverbuild.+\"}) by (pod)", + "hide": false, + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Game Server % of CPU Utilization", + "type": "timeseries" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 25, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, 
+ "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{pod=~\"gameserverbuild.+\"}) by (pod) /\nsum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"default\"}) by (pod)", + "hide": false, + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Game Server % of Memory Utilization", + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "exemplar": true, - "expr": "sum(thundernetes_gameservers_current_state_per_build{state=\"standingby\"}) by (BuildName)", - "instant": false, - "interval": "", - "legendFormat": "{{ BuildName }}", - "refId": "A" - } - ], - "title": "Standby servers time series", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 14, + "interval": "20", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"gameserverbuild.+\"}[5m])) by (pod) * 100", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Game Server CPU Total Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 16, + "interval": "20", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 6, - "y": 25 - }, - "id": 24, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single", - "sort": "none" + "exemplar": true, + "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{pod=~\"gameserverbuild.+\"}) by (pod)", + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Game Server Total Memory Utilization", + "type": "timeseries" } - }, + ], "targets": [ { "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" + "type": "datasource", + "uid": "grafana" }, - "exemplar": true, - "expr": "sum(thundernetes_connected_players) by (BuildName)", - "instant": false, - "interval": "", - "legendFormat": "{{ BuildName}}", "refId": "A" } ], - "title": "Connected players time series", - "type": "timeseries" + "title": "Resources", + "type": "row" }, { "collapsed": false, + "datasource": { + "type": 
"datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 10 }, - "id": 9, + "id": 29, "panels": [], - "title": "Resources", + "targets": [ + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "refId": "A" + } + ], + "title": "Thundernetes Controller", "type": "row" }, { @@ -782,22 +1246,24 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", + "fillOpacity": 20, + "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", - "lineWidth": 1, + "lineInterpolation": "smooth", + "lineWidth": 3, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -813,8 +1279,6 @@ } }, "mappings": [], - "max": 1, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -828,7 +1292,7 @@ } ] }, - "unit": "percentunit" + "unit": "percent" }, "overrides": [] }, @@ -836,20 +1300,24 @@ "h": 8, "w": 12, "x": 0, - "y": 34 + "y": 11 }, - "id": 4, + "id": 27, + "interval": "1m", + "links": [], "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "multi", + "mode": "single", "sort": "none" } }, + "pluginVersion": "8.4.3", "targets": [ { "datasource": { @@ -857,14 +1325,16 @@ "uid": "P1809F7CD0C75ACF3" }, "exemplar": true, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"gameserverbuild.+\"}[5m])) by (pod) /\nsum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{pod=~\"gameserverbuild.+\"}) by (pod)", - "hide": false, + "expr": "rate(process_cpu_seconds_total{job=\"thundernetes-controller-manager-metrics-service\", namespace=\"thundernetes-system\", pod=~\"thundernetes-controller-manager.+\"}[5m]) 
* 100", + "format": "time_series", "interval": "", - "legendFormat": "{{pod}}", - "refId": "A" + "intervalFactor": 2, + "legendFormat": "Pod: {{pod}} | Container: {{container}}", + "refId": "A", + "step": 10 } ], - "title": "Game Server % of CPU Utilization", + "title": "Controller CPU Usage", "type": "timeseries" }, { @@ -875,22 +1345,25 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "fixedColor": "green", + "mode": "thresholds" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", + "fillOpacity": 20, + "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", - "lineWidth": 1, + "lineInterpolation": "smooth", + "lineWidth": 3, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -906,22 +1379,16 @@ } }, "mappings": [], - "max": 1, - "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, - "unit": "percentunit" + "unit": "bytes" }, "overrides": [] }, @@ -929,31 +1396,41 @@ "h": 8, "w": 12, "x": 12, - "y": 34 + "y": 11 }, - "id": 5, + "id": 31, + "interval": "1m", + "links": [], "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { - "mode": "multi", + "mode": "single", "sort": "none" } }, + "pluginVersion": "8.4.3", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, "exemplar": true, - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{pod=~\"gameserverbuild.+\"}) by (pod) /\nsum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=\"default\"}) by (pod)", - "hide": false, + "expr": 
"process_resident_memory_bytes{job=\"thundernetes-controller-manager-metrics-service\", namespace=\"thundernetes-system\", pod=~\"thundernetes-controller-manager.+\"}", + "format": "time_series", "interval": "", - "legendFormat": "{{pod}}", - "refId": "A" + "intervalFactor": 2, + "legendFormat": "Pod: {{pod}} | Container: {{container}}", + "refId": "A", + "step": 10 } ], - "title": "Game Server % of Memory Utilization", + "title": "Controller Memory Usage", "type": "timeseries" }, { @@ -961,25 +1438,28 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, + "description": "Total number of reconciliation per controller", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", + "fillOpacity": 20, + "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", - "lineWidth": 1, + "lineInterpolation": "smooth", + "lineWidth": 3, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -1001,14 +1481,10 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, - "unit": "percent" + "unit": "cpm" }, "overrides": [] }, @@ -1016,15 +1492,15 @@ "h": 8, "w": 12, "x": 0, - "y": 42 + "y": 19 }, - "id": 14, - "interval": "20", + "id": 35, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1037,14 +1513,16 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, + "editorMode": "code", "exemplar": true, - "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"gameserverbuild.+\"}[5m])) by (pod) * 100", + "expr": "sum(rate(controller_runtime_reconcile_total{job=\"thundernetes-controller-manager-metrics-service\", 
namespace=\"thundernetes-system\"}[5m])) by (instance, pod)", "interval": "", - "legendFormat": "{{pod}}", + "legendFormat": "{{instance}} {{pod}}", + "range": true, "refId": "A" } ], - "title": "Game Server CPU Total Utilization", + "title": "Reconciliation Total Count Per Controller", "type": "timeseries" }, { @@ -1052,25 +1530,28 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, + "description": "Total number of reconciliation errors per controller", "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", + "fillOpacity": 20, + "gradientMode": "scheme", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", - "lineWidth": 1, + "lineInterpolation": "smooth", + "lineWidth": 3, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -1092,14 +1573,10 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, - "unit": "bytes" + "unit": "cpm" }, "overrides": [] }, @@ -1107,15 +1584,15 @@ "h": 8, "w": 12, "x": 12, - "y": 42 + "y": 19 }, - "id": 16, - "interval": "20", + "id": 33, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1124,41 +1601,37 @@ }, "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "code", "exemplar": true, - "expr": "sum(node_namespace_pod_container:container_memory_working_set_bytes{pod=~\"gameserverbuild.+\"}) by (pod)", + "expr": "sum(rate(controller_runtime_reconcile_errors_total{job=\"thundernetes-controller-manager-metrics-service\", namespace=\"thundernetes-system\"}[5m])) by (instance, pod)", "interval": "", - "legendFormat": "{{pod}}", + "legendFormat": 
"{{instance}} {{pod}}", + "range": true, "refId": "A" } ], - "title": "Game Server Total Memory Utilization", + "title": "Reconciliation Error Count Per Controller", "type": "timeseries" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 50 - }, - "id": 29, - "panels": [], - "title": "Thundernetes Controller", - "type": "row" - }, { "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, + "description": "This panel consists of the average initialization time among all of the new GameServers from a batch of of GameServers (consisting of 1 or more) needing to be created in the reconciliation loop. ", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time (milliseconds)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -1198,8 +1671,7 @@ "value": 80 } ] - }, - "unit": "percent" + } }, "overrides": [] }, @@ -1207,40 +1679,36 @@ "h": 8, "w": 12, "x": 0, - "y": 51 + "y": 27 }, - "id": 27, - "interval": "1m", - "links": [], + "id": 37, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, - "pluginVersion": "8.4.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "exemplar": true, - "expr": "rate(process_cpu_seconds_total{job=\"thundernetes-controller-manager-metrics-service\", namespace=\"thundernetes-system\", pod=~\"thundernetes-controller-manager.+\"}[5m]) * 100", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Pod: {{pod}} | Container: {{container}}", - "refId": "A", - "step": 10 + "editorMode": "builder", + "expr": "thundernetes_gameservers_create_duration", + "hide": false, + "legendFormat": "{{BuildName}}-{{instance}}", + "range": true, + "refId": "A" 
} ], - "title": "Controller CPU Usage", + "title": "Average Time Until GameServer Reaches StandBy", "type": "timeseries" }, { @@ -1248,14 +1716,16 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, + "description": "The time it has taken to begin the initialization of every needed GameServer in the latest reconciliation loop.", "fieldConfig": { "defaults": { "color": { - "fixedColor": "green", - "mode": "thresholds" + "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time (milliseconds)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -1289,10 +1759,13 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] - }, - "unit": "bytes" + } }, "overrides": [] }, @@ -1300,40 +1773,36 @@ "h": 8, "w": 12, "x": 12, - "y": 51 + "y": 27 }, - "id": 31, - "interval": "1m", - "links": [], + "id": 38, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, - "pluginVersion": "8.4.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "exemplar": true, - "expr": "process_resident_memory_bytes{job=\"thundernetes-controller-manager-metrics-service\", namespace=\"thundernetes-system\", pod=~\"thundernetes-controller-manager.+\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Pod: {{pod}} | Container: {{container}}", - "refId": "A", - "step": 10 + "editorMode": "builder", + "expr": "thundernetes_gameservers_reconcile_standby_duration", + "hide": false, + "legendFormat": "{{BuildName}}-{{instance}}", + "range": true, + "refId": "A" } ], - "title": "Controller Memory Usage", + "title": "GameServer StandBy Reconciliation Time", "type": "timeseries" }, { @@ -1341,14 +1810,16 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "description": "Total number 
of reconciliation per controller", + "description": "During a reconciliation loop, all GameServers marked as non-healthy or completed will have the time difference between now and their estimated time until deletion summed up and averaged.", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time (milliseconds)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -1382,10 +1853,13 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] - }, - "unit": "cpm" + } }, "overrides": [] }, @@ -1393,14 +1867,15 @@ "h": 8, "w": 12, "x": 0, - "y": 59 + "y": 35 }, - "id": 35, + "id": 39, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1413,16 +1888,15 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "editorMode": "code", - "exemplar": true, - "expr": "sum(rate(controller_runtime_reconcile_total{job=\"thundernetes-controller-manager-metrics-service\", namespace=\"thundernetes-system\"}[5m])) by (instance, pod)", - "interval": "", - "legendFormat": "{{instance}} {{pod}}", + "editorMode": "builder", + "expr": "thundernetes_gameservers_clean_up_duration", + "hide": false, + "legendFormat": "{{BuildName}}-{{instance}}", "range": true, "refId": "A" } ], - "title": "Reconciliation Total Count Per Controller", + "title": "Average Estimated Time Until Hanging GameServer Clean Up", "type": "timeseries" }, { @@ -1430,19 +1904,21 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "description": "Total number of reconciliation errors per controller", + "description": "The time it has taken to begin the scaling down/deletion of every inactive GameServer given a decrease in the number of allowed StandBy servers or a breach past the maximum allowed.", "fieldConfig": { 
"defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" }, "custom": { - "axisLabel": "", + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time (milliseconds)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, - "gradientMode": "scheme", + "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, @@ -1471,10 +1947,13 @@ { "color": "green", "value": null + }, + { + "color": "red", + "value": 80 } ] - }, - "unit": "cpm" + } }, "overrides": [] }, @@ -1482,14 +1961,15 @@ "h": 8, "w": 12, "x": 12, - "y": 59 + "y": 35 }, - "id": 33, + "id": 40, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom" + "placement": "bottom", + "showLegend": true }, "tooltip": { "mode": "single", @@ -1502,21 +1982,20 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "editorMode": "code", - "exemplar": true, - "expr": "sum(rate(controller_runtime_reconcile_errors_total{job=\"thundernetes-controller-manager-metrics-service\", namespace=\"thundernetes-system\"}[5m])) by (instance, pod)", - "interval": "", - "legendFormat": "{{instance}} {{pod}}", + "editorMode": "builder", + "expr": "thundernetes_gameservers_end_duration", + "hide": false, + "legendFormat": "{{BuildName}}-{{instance}}", "range": true, "refId": "A" } ], - "title": "Reconciliation Error Count Per Controller", + "title": "GameServer Scale Down Time", "type": "timeseries" } ], - "refresh": "10s", - "schemaVersion": 35, + "refresh": "5s", + "schemaVersion": 37, "style": "dark", "tags": [], "templating": { @@ -1530,6 +2009,6 @@ "timezone": "", "title": "Thundernetes GameServer Demo", "uid": "T9KjuZOnz", - "version": 3, + "version": 4, "weekStart": "" -} +} \ No newline at end of file From f432e82f1450328ce0e7ad5cfab26c2c90116ff9 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Thu, 15 Sep 2022 15:47:17 -0700 Subject: [PATCH 02/22] Update monitoring documentation; Refactor non-active handling --- 
docs/howtos/monitoring.md | 4 ++++ .../controllers/gameserverbuild_controller.go | 20 ++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/docs/howtos/monitoring.md b/docs/howtos/monitoring.md index c0f2484a..3ace35ee 100644 --- a/docs/howtos/monitoring.md +++ b/docs/howtos/monitoring.md @@ -86,9 +86,13 @@ There is a custom Grafana dashboard example that visualizes some of this data in | connected_players | Gauge | nodeagent | | gameservers_current_state_per_build | Gauge | controller-manager | | gameservers_created_total | Counter | controller-manager | +| gameservers_create_duration | Gauge | controller-manager | +| gameservers_reconcile_standby_duration | Gauge | controller-manager | | gameservers_sessionended_total | Counter | controller-manager | | gameservers_crashed_total | Counter | controller-manager | | gameservers_deleted_total | Counter | controller-manager | +| gameservers_end_duration | Gauge | controller-manager | +| gameservers_clean_up_duration | Gauge | controller-manager | | allocations_total | Counter | controller-manager | ## More pictures diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index a31f21ee..397c122c 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -219,17 +219,19 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // calculate the total amount of servers not in the active state nonActiveGameServersCount := standingByCount + initializingCount + pendingCount + // Evaluate desired number of servers against actual + var totalNumberOfGameServersToDelete int + // user has decreased standingBy numbers if nonActiveGameServersCount > gsb.Spec.StandingBy { - var totalNumberOfGameServersToDelete int - - // we also need to check if we are above the max - // this can happen if the user modifies the spec.Max during the 
GameServerBuild's lifetime - if nonActiveGameServersCount+activeCount > gsb.Spec.Max { - totalNumberOfGameServersToDelete += int(math.Min(float64(nonActiveGameServersCount+activeCount-gsb.Spec.Max), maxNumberOfGameServersToDelete)) - } - - totalNumberOfGameServersToDelete = int(math.Min(float64(totalNumberOfGameServersToDelete+nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete)) + totalNumberOfGameServersToDelete += int(math.Min(float64(nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete)) + } + // we also need to check if we are above the max + // this can happen if the user modifies the spec.Max during the GameServerBuild's lifetime + if nonActiveGameServersCount+activeCount > gsb.Spec.Max { + totalNumberOfGameServersToDelete = int(math.Min(float64(totalNumberOfGameServersToDelete+(nonActiveGameServersCount+activeCount-gsb.Spec.Max)), maxNumberOfGameServersToDelete)) + } + if totalNumberOfGameServersToDelete > 0 { err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete) if err != nil { return ctrl.Result{}, err From 41e68da067126e7d26cd6d3abbe83f2ecbd7341e Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Fri, 16 Sep 2022 17:05:44 -0700 Subject: [PATCH 03/22] Update yaml --- .../config/crd/bases/mps.playfab.com_gameservers.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml b/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml index 68e420b9..dd811498 100644 --- a/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml +++ b/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml @@ -7246,6 +7246,15 @@ spec: description: Ports is a concatenated list of the ports this game server listens to type: string + prevstate: + description: The Previously known manually set state + enum: + - Initializing + - Active + - StandingBy + - Crashed + - GameCompleted + type: string publicIP: 
description: PublicIP is the PublicIP of the game server type: string From e9cd4fb27aab195c69fed14b6a61d21d9879eaa1 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Fri, 16 Sep 2022 17:27:04 -0700 Subject: [PATCH 04/22] Fix capitalization --- pkg/operator/api/v1alpha1/gameserver_types.go | 2 +- pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml | 2 +- pkg/operator/controllers/gameserverbuild_controller.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/operator/api/v1alpha1/gameserver_types.go b/pkg/operator/api/v1alpha1/gameserver_types.go index 9504883d..52030bef 100644 --- a/pkg/operator/api/v1alpha1/gameserver_types.go +++ b/pkg/operator/api/v1alpha1/gameserver_types.go @@ -81,7 +81,7 @@ type GameServerStatus struct { // State defines the state of the game server (Initializing, StandingBy, Active etc.) State GameServerState `json:"state,omitempty"` // The Previously known manually set state - PrevState GameServerState `json:"prevstate,omitempty"` + PrevState GameServerState `json:"prevState,omitempty"` // PublicIP is the PublicIP of the game server PublicIP string `json:"publicIP,omitempty"` // Ports is a concatenated list of the ports this game server listens to diff --git a/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml b/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml index dd811498..c45fd432 100644 --- a/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml +++ b/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml @@ -7246,7 +7246,7 @@ spec: description: Ports is a concatenated list of the ports this game server listens to type: string - prevstate: + prevState: description: The Previously known manually set state enum: - Initializing diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 397c122c..f7c2afad 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ 
b/pkg/operator/controllers/gameserverbuild_controller.go @@ -160,7 +160,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ initializingCount++ } else if gs.Status.State == mpsv1alpha1.GameServerStateStandingBy && gs.Status.Health == mpsv1alpha1.GameServerHealthy { standingByCount++ - if gs.Status.PrevState != gs.Status.State { + if gs.Status.State != gs.Status.PrevState { timeToStandBySum += float64(gs.Status.ReachedStandingByOn.Sub(gs.CreationTimestamp.Time).Milliseconds()) recentStandingByCount++ } From 293efc18ab9c1d6a6a86aead8f7a56976040d2f2 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Fri, 16 Sep 2022 17:36:38 -0700 Subject: [PATCH 05/22] Revert extra changes triggering installfile alret --- .../config/crd/bases/mps.playfab.com_gameservers.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml b/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml index c45fd432..68e420b9 100644 --- a/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml +++ b/pkg/operator/config/crd/bases/mps.playfab.com_gameservers.yaml @@ -7246,15 +7246,6 @@ spec: description: Ports is a concatenated list of the ports this game server listens to type: string - prevState: - description: The Previously known manually set state - enum: - - Initializing - - Active - - StandingBy - - Crashed - - GameCompleted - type: string publicIP: description: PublicIP is the PublicIP of the game server type: string From dc28b1317c03d50b426520f68d618d20be19f51e Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sat, 17 Sep 2022 19:00:31 -0700 Subject: [PATCH 06/22] Handle dereferencing; Add util funciton; Remove repeated lines --- pkg/operator/controllers/controller_utils.go | 13 +++++++++++++ pkg/operator/controllers/controller_utils_test.go | 13 +++++++++++++ .../controllers/gameserverbuild_controller.go | 12 +++++------- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git 
a/pkg/operator/controllers/controller_utils.go b/pkg/operator/controllers/controller_utils.go index a9a0964a..f74a40c8 100644 --- a/pkg/operator/controllers/controller_utils.go +++ b/pkg/operator/controllers/controller_utils.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "fmt" + "math" "math/rand" "strconv" "strings" @@ -71,6 +72,18 @@ func randString(n int) string { return string(b) } +// Determine whether to use an existing saved time variables or the current time for state duration +func getStateDuration(endTime *metav1.Time, startTime *metav1.Time) float64 { + var stateDuration float64 + // If the end time state is missing, use the current time + if endTime == nil { + stateDuration = math.Abs(float64(time.Since(startTime.Time).Milliseconds())) + } else { + stateDuration = math.Abs(float64(endTime.Time.Sub(startTime.Time).Milliseconds())) + } + return stateDuration +} + // GetNodeDetails returns the Public IP of the node and the node age in days // if the Node does not have a Public IP, method returns the internal one func GetNodeDetails(ctx context.Context, r client.Reader, nodeName string) (string, string, int, error) { diff --git a/pkg/operator/controllers/controller_utils_test.go b/pkg/operator/controllers/controller_utils_test.go index 21bf1dc0..4cb35146 100644 --- a/pkg/operator/controllers/controller_utils_test.go +++ b/pkg/operator/controllers/controller_utils_test.go @@ -2,6 +2,7 @@ package controllers import ( "fmt" + "time" mpsv1alpha1 "github.com/playfab/thundernetes/pkg/operator/api/v1alpha1" corev1 "k8s.io/api/core/v1" @@ -190,5 +191,17 @@ var _ = Describe("Utilities tests", func() { node.Labels[LabelGameServerNode] = "nottrue" Expect(isNodeGameServerNode(node)).To(BeFalse()) }) + It("should return a positive time duration", func() { + var startTime *metav1.Time + startTime.Time = time.Now() + + var endTime *metav1.Time + endTime.Time = time.Now().Add(3 * time.Second) + + Expect(getStateDuration(endTime, 
startTime)).To(BeAssignableToTypeOf(float64(0))) + Expect(getStateDuration(nil, startTime)).To(BeAssignableToTypeOf(float64(0))) + Expect(getStateDuration(startTime, endTime)).To(BeNumerically(">=", 0)) + Expect(getStateDuration(endTime, nil)).Error() + }) }) }) diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index f7c2afad..dd47613a 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -144,12 +144,10 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Gather sum of time taken to reach standingby phase and server count to produce the recent average gameserver initialization time var timeToStandBySum float64 var recentStandingByCount int - timeToStandBySum = 0 // Gather current sum of estimated time taken to clean up crashed or pending deletion gameservers var timeToDeleteBySum float64 var pendingCleanUpCount int - timeToStandBySum = 0 for i := 0; i < len(gameServers.Items); i++ { gs := gameServers.Items[i] @@ -161,7 +159,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ } else if gs.Status.State == mpsv1alpha1.GameServerStateStandingBy && gs.Status.Health == mpsv1alpha1.GameServerHealthy { standingByCount++ if gs.Status.State != gs.Status.PrevState { - timeToStandBySum += float64(gs.Status.ReachedStandingByOn.Sub(gs.CreationTimestamp.Time).Milliseconds()) + timeToStandBySum += getStateDuration(gs.Status.ReachedStandingByOn, &gs.CreationTimestamp) recentStandingByCount++ } } else if gs.Status.State == mpsv1alpha1.GameServerStateActive && gs.Status.Health == mpsv1alpha1.GameServerHealthy { @@ -177,7 +175,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Exited", "GameServer %s session completed", gs.Name) pendingCleanUpCount++ - timeToDeleteBySum += 
math.Abs(float64(time.Until(gs.DeletionTimestamp.Time).Milliseconds())) + timeToDeleteBySum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) } else if gs.Status.State == mpsv1alpha1.GameServerStateCrashed { // game server process exited with code != 0 (crashed) crashesCount++ @@ -189,7 +187,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Unhealthy", "GameServer %s was deleted because it became unhealthy, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) pendingCleanUpCount++ - timeToDeleteBySum += math.Abs(float64(time.Until(gs.DeletionTimestamp.Time).Milliseconds())) + timeToDeleteBySum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) } else if gs.Status.Health == mpsv1alpha1.GameServerUnhealthy { // all cases where the game server was marked as Unhealthy crashesCount++ @@ -201,7 +199,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Crashed", "GameServer %s was deleted because it crashed, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) pendingCleanUpCount++ - timeToDeleteBySum += math.Abs(float64(time.Until(gs.DeletionTimestamp.Time).Milliseconds())) + timeToDeleteBySum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) } if gs.Status.State != gs.Status.PrevState { gs.Status.PrevState = gs.Status.State @@ -220,7 +218,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ nonActiveGameServersCount := standingByCount + initializingCount + pendingCount // Evaluate desired number of servers against actual - var totalNumberOfGameServersToDelete int + var totalNumberOfGameServersToDelete int = 0 // user has decreased standingBy numbers if nonActiveGameServersCount > gsb.Spec.StandingBy { From c2a122e87ec92ba01c934c9d24e4424f59679a35 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 18 
Sep 2022 17:12:51 -0700 Subject: [PATCH 07/22] Add pointers --- pkg/operator/controllers/controller_utils_test.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pkg/operator/controllers/controller_utils_test.go b/pkg/operator/controllers/controller_utils_test.go index 4cb35146..72fc0484 100644 --- a/pkg/operator/controllers/controller_utils_test.go +++ b/pkg/operator/controllers/controller_utils_test.go @@ -192,16 +192,15 @@ var _ = Describe("Utilities tests", func() { Expect(isNodeGameServerNode(node)).To(BeFalse()) }) It("should return a positive time duration", func() { - var startTime *metav1.Time + var startTime metav1.Time startTime.Time = time.Now() - var endTime *metav1.Time - endTime.Time = time.Now().Add(3 * time.Second) + var endTime metav1.Time + endTime.Time = time.Now().Add(20 * time.Second) - Expect(getStateDuration(endTime, startTime)).To(BeAssignableToTypeOf(float64(0))) - Expect(getStateDuration(nil, startTime)).To(BeAssignableToTypeOf(float64(0))) - Expect(getStateDuration(startTime, endTime)).To(BeNumerically(">=", 0)) - Expect(getStateDuration(endTime, nil)).Error() + Expect(getStateDuration(&endTime, &startTime)).To(BeAssignableToTypeOf(float64(0))) + Expect(getStateDuration(nil, &startTime)).To(BeAssignableToTypeOf(float64(0))) + Expect(getStateDuration(&endTime, &startTime)).To(BeNumerically(">=", float64(0))) }) }) }) From 38c7592c2b0cb7d2dbbe47c44af8df264289bb51 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 18 Sep 2022 17:32:27 -0700 Subject: [PATCH 08/22] Decreasing time diff --- pkg/operator/controllers/controller_utils_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/controllers/controller_utils_test.go b/pkg/operator/controllers/controller_utils_test.go index 72fc0484..b75b1222 100644 --- a/pkg/operator/controllers/controller_utils_test.go +++ b/pkg/operator/controllers/controller_utils_test.go @@ -196,7 +196,7 @@ var _ = Describe("Utilities tests", func() { 
startTime.Time = time.Now() var endTime metav1.Time - endTime.Time = time.Now().Add(20 * time.Second) + endTime.Time = time.Now().Add(5 * time.Second) Expect(getStateDuration(&endTime, &startTime)).To(BeAssignableToTypeOf(float64(0))) Expect(getStateDuration(nil, &startTime)).To(BeAssignableToTypeOf(float64(0))) From ecebe2bf4dee98bbbb5e10766912f0fa99735f8e Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Mon, 19 Sep 2022 09:43:56 -0700 Subject: [PATCH 09/22] PR Updates; Rename deleteSum; Fix GS state update; --- pkg/operator/controllers/controller_utils.go | 9 +++------ .../controllers/gameserverbuild_controller.go | 17 +++++++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pkg/operator/controllers/controller_utils.go b/pkg/operator/controllers/controller_utils.go index f74a40c8..547598e8 100644 --- a/pkg/operator/controllers/controller_utils.go +++ b/pkg/operator/controllers/controller_utils.go @@ -72,16 +72,13 @@ func randString(n int) string { return string(b) } -// Determine whether to use an existing saved time variables or the current time for state duration +// getStateDuration determine whether to use an existing saved time variables or the current time for state duration func getStateDuration(endTime *metav1.Time, startTime *metav1.Time) float64 { - var stateDuration float64 // If the end time state is missing, use the current time if endTime == nil { - stateDuration = math.Abs(float64(time.Since(startTime.Time).Milliseconds())) - } else { - stateDuration = math.Abs(float64(endTime.Time.Sub(startTime.Time).Milliseconds())) + return math.Abs(float64(time.Since(startTime.Time).Milliseconds())) } - return stateDuration + return math.Abs(float64(endTime.Time.Sub(startTime.Time).Milliseconds())) } // GetNodeDetails returns the Public IP of the node and the node age in days diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index dd47613a..9f48c9d9 100644 --- 
a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -146,7 +146,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ var recentStandingByCount int // Gather current sum of estimated time taken to clean up crashed or pending deletion gameservers - var timeToDeleteBySum float64 + var timeToDeleteSum float64 var pendingCleanUpCount int for i := 0; i < len(gameServers.Items); i++ { @@ -175,7 +175,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Exited", "GameServer %s session completed", gs.Name) pendingCleanUpCount++ - timeToDeleteBySum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) + timeToDeleteSum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) } else if gs.Status.State == mpsv1alpha1.GameServerStateCrashed { // game server process exited with code != 0 (crashed) crashesCount++ @@ -187,7 +187,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Unhealthy", "GameServer %s was deleted because it became unhealthy, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) pendingCleanUpCount++ - timeToDeleteBySum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) + timeToDeleteSum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) } else if gs.Status.Health == mpsv1alpha1.GameServerUnhealthy { // all cases where the game server was marked as Unhealthy crashesCount++ @@ -199,10 +199,15 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Crashed", "GameServer %s was deleted because it crashed, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) pendingCleanUpCount++ - timeToDeleteBySum += getStateDuration(gs.DeletionTimestamp, 
&gs.CreationTimestamp) + timeToDeleteSum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) } if gs.Status.State != gs.Status.PrevState { + patch := client.MergeFrom(gs.DeepCopy()) gs.Status.PrevState = gs.Status.State + // updating GameServer's previous state + if err := r.Status().Patch(ctx, &gs, patch); err != nil { + return ctrl.Result{}, err + } } } @@ -211,7 +216,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ } if pendingCleanUpCount > 0 { - GameServersCleanUpDuration.WithLabelValues(gsb.Name).Set(timeToDeleteBySum / float64(pendingCleanUpCount)) + GameServersCleanUpDuration.WithLabelValues(gsb.Name).Set(timeToDeleteSum / float64(pendingCleanUpCount)) } // calculate the total amount of servers not in the active state @@ -257,11 +262,11 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ errCh <- err return } + newgs.Status.PrevState = mpsv1alpha1.GameServerStateInitializing if err := r.Create(ctx, newgs); err != nil { errCh <- err return } - newgs.Status.PrevState = mpsv1alpha1.GameServerStateInitializing r.expectations.addGameServerToUnderCreationMap(gsb.Name, newgs.Name) GameServersCreatedCounter.WithLabelValues(gsb.Name).Inc() r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Creating", "Creating GameServer %s", newgs.Name) From e41d915e30b3202f4cef5d11c24953314266bb61 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Tue, 20 Sep 2022 20:09:07 -0700 Subject: [PATCH 10/22] Add patching exception --- pkg/operator/controllers/gameserverbuild_controller.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 9f48c9d9..87c5eef9 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -201,12 +201,16 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req 
ctrl.Requ pendingCleanUpCount++ timeToDeleteSum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) } + if gs.Status.State != gs.Status.PrevState { patch := client.MergeFrom(gs.DeepCopy()) gs.Status.PrevState = gs.Status.State + // updating GameServer's previous state if err := r.Status().Patch(ctx, &gs, patch); err != nil { - return ctrl.Result{}, err + if !apierrors.IsNotFound(err) { + return ctrl.Result{}, err + } } } } @@ -262,7 +266,6 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ errCh <- err return } - newgs.Status.PrevState = mpsv1alpha1.GameServerStateInitializing if err := r.Create(ctx, newgs); err != nil { errCh <- err return From 0be080c2f417c5e1ddaa52e445cece2e86f9a522 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 11:23:03 -0700 Subject: [PATCH 11/22] Change metric emission to nodeagent --- cmd/nodeagent/nodeagentmanager.go | 5 + cmd/nodeagent/types.go | 21 +- cmd/nodeagent/utilities.go | 12 + pkg/operator/controllers/controller_utils.go | 10 - .../controllers/controller_utils_test.go | 12 - .../controllers/gameserverbuild_controller.go | 591 ++++++------------ pkg/operator/controllers/metrics.go | 24 - 7 files changed, 208 insertions(+), 467 deletions(-) diff --git a/cmd/nodeagent/nodeagentmanager.go b/cmd/nodeagent/nodeagentmanager.go index a09c5bea..27871497 100644 --- a/cmd/nodeagent/nodeagentmanager.go +++ b/cmd/nodeagent/nodeagentmanager.go @@ -220,11 +220,15 @@ func (n *NodeAgentManager) gameServerCreatedOrUpdated(obj *unstructured.Unstruct // or that the NodeAgent crashed and we're having a new instance // in any case, we're adding the details to the map logger.Infof("GameServer %s/%s does not exist in cache, we're creating it", gameServerNamespace, gameServerName) + // save actual gameserver creation time + creationTimeStamp := obj.GetCreationTimestamp() + gsdi = &GameServerInfo{ GameServerNamespace: gameServerNamespace, Mutex: &sync.RWMutex{}, GsUid: obj.GetUID(), 
CreationTime: n.nowFunc().UnixMilli(), + CreationTimeStamp: &creationTimeStamp, BuildName: gameServerBuildName, MarkedUnhealthy: false, // we're not adding details about health/state since the NodeAgent might have crashed @@ -454,6 +458,7 @@ func (n *NodeAgentManager) updateHealthAndStateIfNeeded(ctx context.Context, hb status.ReachedInitializingOn = &now } else if hb.CurrentGameState == GameStateStandingBy { status.ReachedStandingByOn = &now + GameServerCreateDuration.WithLabelValues(gsd.BuildName).Set(getStateDuration(status.ReachedStandingByOn, gsd.CreationTimeStamp)) } } diff --git a/cmd/nodeagent/types.go b/cmd/nodeagent/types.go index 74935bb6..b3781d8e 100644 --- a/cmd/nodeagent/types.go +++ b/cmd/nodeagent/types.go @@ -5,6 +5,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" ) @@ -58,6 +59,15 @@ var ( Name: "connected_players", Help: "Number of connected players per GameServer", }, []string{"namespace", "ServerName", "BuildName"}) + + GameServerCreateDuration = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "thundernetes", + Name: "gameserver_create_duration", + Help: "Time taken to create a GameServers", + }, + []string{"BuildName"}, + ) ) // HeartbeatRequest contains data for the heartbeat request coming from the GSDK running alongside GameServer @@ -101,9 +111,10 @@ type GameServerInfo struct { GameServerNamespace string ConnectedPlayersCount int Mutex *sync.RWMutex - GsUid types.UID // UID of the GameServer object - CreationTime int64 // time when this GameServerInfo was created in the nodeagent - LastHeartbeatTime int64 // time since the nodeagent received a heartbeat from this GameServer - MarkedUnhealthy bool // if the GameServer was marked unhealthy by a heartbeat condition, used to avoid repeating the patch - BuildName string // the name of the 
GameServerBuild that this GameServer belongs to + GsUid types.UID // UID of the GameServer object + CreationTime int64 // time when this GameServerInfo was created in the nodeagent + CreationTimeStamp *metav1.Time // time when the GameServer was created + LastHeartbeatTime int64 // time since the nodeagent received a heartbeat from this GameServer + MarkedUnhealthy bool // if the GameServer was marked unhealthy by a heartbeat condition, used to avoid repeating the patch + BuildName string // the name of the GameServerBuild that this GameServer belongs to } diff --git a/cmd/nodeagent/utilities.go b/cmd/nodeagent/utilities.go index e66231fc..6dc8cb6f 100644 --- a/cmd/nodeagent/utilities.go +++ b/cmd/nodeagent/utilities.go @@ -3,12 +3,15 @@ package main import ( "errors" "fmt" + "math" "net/http" "os" "strconv" "strings" + "time" log "github.com/sirupsen/logrus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/client-go/dynamic" "k8s.io/client-go/rest" @@ -104,6 +107,15 @@ func getLogger(gameServerName, gameServerNamespace string) *log.Entry { return log.WithFields(log.Fields{"GameServerName": gameServerName, "GameServerNamespace": gameServerNamespace}) } +// getStateDuration determine whether to use an existing saved time variables or the current time for state duration +func getStateDuration(endTime *metav1.Time, startTime *metav1.Time) float64 { + // If the end time state is missing, use the current time + if endTime == nil { + return math.Abs(float64(time.Since(startTime.Time).Milliseconds())) + } + return math.Abs(float64(endTime.Time.Sub(startTime.Time).Milliseconds())) +} + // sanitize removes new line characters from the string // https://codeql.github.com/codeql-query-help/go/go-log-injection/ func sanitize(s string) string { diff --git a/pkg/operator/controllers/controller_utils.go b/pkg/operator/controllers/controller_utils.go index 547598e8..a9a0964a 100644 --- 
a/pkg/operator/controllers/controller_utils.go +++ b/pkg/operator/controllers/controller_utils.go @@ -4,7 +4,6 @@ import ( "bytes" "context" "fmt" - "math" "math/rand" "strconv" "strings" @@ -72,15 +71,6 @@ func randString(n int) string { return string(b) } -// getStateDuration determine whether to use an existing saved time variables or the current time for state duration -func getStateDuration(endTime *metav1.Time, startTime *metav1.Time) float64 { - // If the end time state is missing, use the current time - if endTime == nil { - return math.Abs(float64(time.Since(startTime.Time).Milliseconds())) - } - return math.Abs(float64(endTime.Time.Sub(startTime.Time).Milliseconds())) -} - // GetNodeDetails returns the Public IP of the node and the node age in days // if the Node does not have a Public IP, method returns the internal one func GetNodeDetails(ctx context.Context, r client.Reader, nodeName string) (string, string, int, error) { diff --git a/pkg/operator/controllers/controller_utils_test.go b/pkg/operator/controllers/controller_utils_test.go index b75b1222..21bf1dc0 100644 --- a/pkg/operator/controllers/controller_utils_test.go +++ b/pkg/operator/controllers/controller_utils_test.go @@ -2,7 +2,6 @@ package controllers import ( "fmt" - "time" mpsv1alpha1 "github.com/playfab/thundernetes/pkg/operator/api/v1alpha1" corev1 "k8s.io/api/core/v1" @@ -191,16 +190,5 @@ var _ = Describe("Utilities tests", func() { node.Labels[LabelGameServerNode] = "nottrue" Expect(isNodeGameServerNode(node)).To(BeFalse()) }) - It("should return a positive time duration", func() { - var startTime metav1.Time - startTime.Time = time.Now() - - var endTime metav1.Time - endTime.Time = time.Now().Add(5 * time.Second) - - Expect(getStateDuration(&endTime, &startTime)).To(BeAssignableToTypeOf(float64(0))) - Expect(getStateDuration(nil, &startTime)).To(BeAssignableToTypeOf(float64(0))) - Expect(getStateDuration(&endTime, &startTime)).To(BeNumerically(">=", float64(0))) - }) }) }) diff --git 
a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 87c5eef9..21bf1dc0 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -1,435 +1,194 @@ -/* -Copyright 2021. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - package controllers import ( - "context" "fmt" - "math" - "runtime" - "sort" - "sync" - "time" mpsv1alpha1 "github.com/playfab/thundernetes/pkg/operator/api/v1alpha1" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - k8sruntime "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/tools/record" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/manager" -) - -// a map to hold the number of crashes per Build -// concurrent since the reconcile loop can be called multiple times for different GameServerBuilds -// key is namespace/name of the GameServerBuild -// value is the number of crashes -var crashesPerBuild = sync.Map{} -const ( - // maximum number of GameServers to create per reconcile loop - // we have this in place since each create call is synchronous and we want to minimize the time for each reconcile loop - maxNumberOfGameServersToAdd = 20 - // maximum number of GameServers to delete per 
reconcile loop - maxNumberOfGameServersToDelete = 20 + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" ) -// Simple async map implementation using a mutex -// used to manage the expected GameServer creations and deletions -type MutexMap struct { - data map[string]interface{} - mu sync.Mutex -} - -// GameServerBuildReconciler reconciles a GameServerBuild object -type GameServerBuildReconciler struct { - client.Client - Scheme *k8sruntime.Scheme - PortRegistry *PortRegistry - Recorder record.EventRecorder - expectations *GameServerExpectations -} - -// NewGameServerBuildReconciler returns a pointer to a new GameServerBuildReconciler -func NewGameServerBuildReconciler(mgr manager.Manager, portRegistry *PortRegistry) *GameServerBuildReconciler { - cl := mgr.GetClient() - return &GameServerBuildReconciler{ - Client: cl, - Scheme: mgr.GetScheme(), - PortRegistry: portRegistry, - Recorder: mgr.GetEventRecorderFor("GameServerBuild"), - expectations: NewGameServerExpectations(cl), - } -} - -//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameserverbuilds,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameserverbuilds/status,verbs=get;update;patch -//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameserverbuilds/finalizers,verbs=update -//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameservers,verbs=get;list;watch -//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameservers/status,verbs=get -//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch - -// Reconcile is part of the main kubernetes reconciliation loop which aims to -// move the current state of the cluster closer to the desired state. 
-// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile -func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - log := log.FromContext(ctx) - - var gsb mpsv1alpha1.GameServerBuild - if err := r.Get(ctx, req.NamespacedName, &gsb); err != nil { - if apierrors.IsNotFound(err) { - log.Info("Unable to fetch GameServerBuild - it is being deleted") - // GameServerBuild is being deleted so clear its entry from the crashesPerBuild map - // no-op if the entry is not present - crashesPerBuild.Delete(getKeyForCrashesPerBuildMap(&gsb)) - return ctrl.Result{}, nil - } - log.Error(err, "unable to fetch gameServerBuild") - return ctrl.Result{}, err - } - - // if GameServerBuild is unhealthy and current crashes equal or more than CrashesToMarkUnhealthy, do nothing more - if gsb.Status.Health == mpsv1alpha1.BuildUnhealthy && - gsb.Spec.CrashesToMarkUnhealthy != nil && - gsb.Status.CrashesCount >= *gsb.Spec.CrashesToMarkUnhealthy { - log.Info("GameServerBuild is Unhealthy, do nothing") - r.Recorder.Event(&gsb, corev1.EventTypeNormal, "Unhealthy Build", "GameServerBuild is Unhealthy, stopping reconciliation") - return ctrl.Result{}, nil - } - - deletionsCompleted, err := r.expectations.gameServersUnderDeletionWereDeleted(ctx, &gsb) - if err != nil { - return ctrl.Result{}, err - } - if !deletionsCompleted { - return ctrl.Result{}, nil - } - - creationsCompleted, err := r.expectations.gameServersUnderCreationWereCreated(ctx, &gsb) - if err != nil { - return ctrl.Result{}, err - } - if !creationsCompleted { - return ctrl.Result{}, nil - } - - // get the gameServers that are owned by this GameServerBuild - var gameServers mpsv1alpha1.GameServerList - if err := r.List(ctx, &gameServers, client.InNamespace(req.Namespace), client.MatchingFields{ownerKey: req.Name}); err != nil { - // there has been an error - return ctrl.Result{}, err - } - - // calculate 
counts by state so we can update .status accordingly - var activeCount, standingByCount, crashesCount, initializingCount, pendingCount int - // Gather sum of time taken to reach standingby phase and server count to produce the recent average gameserver initialization time - var timeToStandBySum float64 - var recentStandingByCount int - - // Gather current sum of estimated time taken to clean up crashed or pending deletion gameservers - var timeToDeleteSum float64 - var pendingCleanUpCount int - - for i := 0; i < len(gameServers.Items); i++ { - gs := gameServers.Items[i] - - if gs.Status.State == "" && gs.Status.Health != mpsv1alpha1.GameServerUnhealthy { // under normal circumstances, Health will also be equal to "" - pendingCount++ - } else if gs.Status.State == mpsv1alpha1.GameServerStateInitializing && gs.Status.Health == mpsv1alpha1.GameServerHealthy { - initializingCount++ - } else if gs.Status.State == mpsv1alpha1.GameServerStateStandingBy && gs.Status.Health == mpsv1alpha1.GameServerHealthy { - standingByCount++ - if gs.Status.State != gs.Status.PrevState { - timeToStandBySum += getStateDuration(gs.Status.ReachedStandingByOn, &gs.CreationTimestamp) - recentStandingByCount++ +var _ = Describe("Utilities tests", func() { + Context("Testing Utilities", func() { + It("should allocate hostPorts when creating game servers", func() { + pod := &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "nginx", + Ports: []corev1.ContainerPort{ + { + Name: "http", + ContainerPort: 80, + HostPort: 123, + }, + { + Name: "https", + ContainerPort: 443, + HostPort: 456, + }, + }, + }, + }, + }, } - } else if gs.Status.State == mpsv1alpha1.GameServerStateActive && gs.Status.Health == mpsv1alpha1.GameServerHealthy { - activeCount++ - } else if gs.Status.State == mpsv1alpha1.GameServerStateGameCompleted && gs.Status.Health == mpsv1alpha1.GameServerHealthy { - // game server process exited with code 0 - if err := r.Delete(ctx, &gs); err != nil { - 
return ctrl.Result{}, err + s := getContainerHostPortTuples(pod) + Expect(s).To(Equal("80:123,443:456")) + }) + It("should find if string is contained in the string slice", func() { + Expect(containsString([]string{"foo"}, "foo")).To(BeTrue()) + Expect(containsString([]string{"foo"}, "bar")).To(BeFalse()) + }) + It("should find if containerName/portName tuple is contained in the PortToExpose slice", func() { + p := []int32{ + 5, 10, 15, } - - GameServersSessionEndedCounter.WithLabelValues(gsb.Name).Inc() - r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) - r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Exited", "GameServer %s session completed", gs.Name) - - pendingCleanUpCount++ - timeToDeleteSum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) - } else if gs.Status.State == mpsv1alpha1.GameServerStateCrashed { - // game server process exited with code != 0 (crashed) - crashesCount++ - if err := r.Delete(ctx, &gs); err != nil { - return ctrl.Result{}, err + Expect(sliceContainsPortToExpose(p, 5)).To(BeTrue()) + Expect(sliceContainsPortToExpose(p, 10)).To(BeTrue()) + Expect(sliceContainsPortToExpose(p, 16)).To(BeFalse()) + }) + It("should return env variables for GameServer", func() { + gs := &mpsv1alpha1.GameServer{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-gs", + Namespace: "test-ns", + }, + Spec: mpsv1alpha1.GameServerSpec{ + TitleID: "test-title", + BuildID: "test-build", + }, } - GameServersCrashedCounter.WithLabelValues(gsb.Name).Inc() - r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) - r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Unhealthy", "GameServer %s was deleted because it became unhealthy, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) - - pendingCleanUpCount++ - timeToDeleteSum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) - } else if gs.Status.Health == mpsv1alpha1.GameServerUnhealthy { - // all cases where the game server was marked as Unhealthy - 
crashesCount++ - if err := r.Delete(ctx, &gs); err != nil { - return ctrl.Result{}, err + s := getGameServerEnvVariables(gs, false) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_NAME", Value: "test-gs"})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_NAMESPACE", Value: "test-ns"})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_BUILD_ID", Value: "test-build"})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_TITLE_ID", Value: "test-title"})).To(BeTrue()) + }) + It("should return env variables for InitContainer", func() { + gs := &mpsv1alpha1.GameServer{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-GameServer", + }, + Spec: mpsv1alpha1.GameServerSpec{ + TitleID: "test-title", + BuildID: "test-build", + PortsToExpose: []int32{ + 80, 443, + }, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "container1", + Ports: []corev1.ContainerPort{ + { + Name: "port1", + ContainerPort: 80, + HostPort: 123, + }, + { + Name: "port2", + ContainerPort: 443, + HostPort: 456, + }, + { + Name: "port3", + ContainerPort: 8080, + // this is not on GameServer.PortsToExpose so there will be no HostPost + }, + }, + }, + }, + }, + }, + }, } - GameServersUnhealthyCounter.WithLabelValues(gsb.Name).Inc() - r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) - r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Crashed", "GameServer %s was deleted because it crashed, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) - - pendingCleanUpCount++ - timeToDeleteSum += getStateDuration(gs.DeletionTimestamp, &gs.CreationTimestamp) - } - if gs.Status.State != gs.Status.PrevState { - patch := client.MergeFrom(gs.DeepCopy()) - gs.Status.PrevState = gs.Status.State - - // updating GameServer's previous state - if err := r.Status().Patch(ctx, &gs, patch); err != nil { - if !apierrors.IsNotFound(err) { - return ctrl.Result{}, err - } + s := 
getInitContainerEnvVariables(gs, false) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "HEARTBEAT_ENDPOINT_PORT", Value: fmt.Sprintf("%d", DaemonSetPort)})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "GSDK_CONFIG_FILE", Value: GsdkConfigFile})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_SHARED_CONTENT_FOLDER", Value: GameSharedContentDirectory})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "CERTIFICATE_FOLDER", Value: CertificatesDirectory})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_SERVER_LOG_DIRECTORY", Value: LogDirectory})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_NAME", Value: gs.Name})).To(BeTrue()) + Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_PORTS", Value: "port1,80,123?port2,443,456"})).To(BeTrue()) + }) + It("should attach data volume", func() { + container := &corev1.Container{} + attachDataVolumeOnContainer(container, false) + Expect(container.VolumeMounts[len(container.VolumeMounts)-1]).To(BeEquivalentTo(corev1.VolumeMount{ + Name: DataVolumeName, + MountPath: DataVolumeMountPath, + })) + }) + It("should create data volume", func() { + pod := &corev1.Pod{} + createDataVolumeOnPod(pod) + Expect(pod.Spec.Volumes[len(pod.Spec.Volumes)-1]).To(BeEquivalentTo(corev1.Volume{ + Name: DataVolumeName, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + })) + }) + It("should attach init container", func() { + gs := &mpsv1alpha1.GameServer{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-gs", + Namespace: "test-ns", + }, + Spec: mpsv1alpha1.GameServerSpec{ + TitleID: "test-title", + BuildID: "test-build", + }, } - } - } - - if recentStandingByCount > 0 { - GameServersCreatedDuration.WithLabelValues(gsb.Name).Set(timeToStandBySum / float64(recentStandingByCount)) - } - - if pendingCleanUpCount > 0 { - GameServersCleanUpDuration.WithLabelValues(gsb.Name).Set(timeToDeleteSum / float64(pendingCleanUpCount)) - } - - 
// calculate the total amount of servers not in the active state - nonActiveGameServersCount := standingByCount + initializingCount + pendingCount - - // Evaluate desired number of servers against actual - var totalNumberOfGameServersToDelete int = 0 - - // user has decreased standingBy numbers - if nonActiveGameServersCount > gsb.Spec.StandingBy { - totalNumberOfGameServersToDelete += int(math.Min(float64(nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete)) - } - // we also need to check if we are above the max - // this can happen if the user modifies the spec.Max during the GameServerBuild's lifetime - if nonActiveGameServersCount+activeCount > gsb.Spec.Max { - totalNumberOfGameServersToDelete = int(math.Min(float64(totalNumberOfGameServersToDelete+(nonActiveGameServersCount+activeCount-gsb.Spec.Max)), maxNumberOfGameServersToDelete)) - } - if totalNumberOfGameServersToDelete > 0 { - err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete) - if err != nil { - return ctrl.Result{}, err - } - } - - // we are in need of standingBy servers, so we're creating them here - // we're also limiting the number of game servers that are created to avoid issues like this https://github.com/kubernetes-sigs/controller-runtime/issues/1782 - // we attempt to create the missing number of game servers, but we don't want to create more than the max - // an error channel for the go routines to write errors - errCh := make(chan error, maxNumberOfGameServersToAdd) - - // Time how long it takes to trigger new standby gameservers - standByReconcileStartTime := time.Now() - // a waitgroup for async create calls - var wg sync.WaitGroup - for i := 0; i < gsb.Spec.StandingBy-nonActiveGameServersCount && - i+nonActiveGameServersCount+activeCount < gsb.Spec.Max && - i < maxNumberOfGameServersToAdd; i++ { - wg.Add(1) - go func(standByStartTime time.Time) { - defer wg.Done() - newgs, err := NewGameServerForGameServerBuild(&gsb, 
r.PortRegistry) - if err != nil { - errCh <- err - return + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + }, } - if err := r.Create(ctx, newgs); err != nil { - errCh <- err - return + var testInitContainerImage string = "testInitContainerImage" + attachInitContainer(gs, pod, testInitContainerImage, false) + Expect(pod.Spec.InitContainers[len(pod.Spec.InitContainers)-1]).To(BeEquivalentTo(corev1.Container{ + Name: InitContainerName, + ImagePullPolicy: corev1.PullIfNotPresent, + Image: testInitContainerImage, + Env: getInitContainerEnvVariables(gs, false), + VolumeMounts: []corev1.VolumeMount{ + { + Name: DataVolumeName, + MountPath: DataVolumeMountPath, + }, + }, + })) + }) + It("shoud modify restart policy", func() { + pod := &corev1.Pod{ + Spec: corev1.PodSpec{}, } - r.expectations.addGameServerToUnderCreationMap(gsb.Name, newgs.Name) - GameServersCreatedCounter.WithLabelValues(gsb.Name).Inc() - r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Creating", "Creating GameServer %s", newgs.Name) - GameServersStandByReconcileDuration.WithLabelValues(gsb.Name).Set(float64(time.Since(standByStartTime).Milliseconds())) - }(standByReconcileStartTime) - } - wg.Wait() - - if len(errCh) > 0 { - return ctrl.Result{}, <-errCh - } - - return r.updateStatus(ctx, &gsb, pendingCount, initializingCount, standingByCount, activeCount, crashesCount) -} - -// updateStatus patches the GameServerBuild's status only if the status of at least one of its GameServers has changed -func (r *GameServerBuildReconciler) updateStatus(ctx context.Context, gsb *mpsv1alpha1.GameServerBuild, pendingCount, initializingCount, standingByCount, activeCount, crashesCount int) (ctrl.Result, error) { - // patch GameServerBuild status only if one of the fields has changed - if gsb.Status.CurrentPending != pendingCount || - gsb.Status.CurrentInitializing != initializingCount || - gsb.Status.CurrentActive != activeCount || - gsb.Status.CurrentStandingBy != standingByCount || - 
crashesCount > 0 { - - patch := client.MergeFrom(gsb.DeepCopy()) - - gsb.Status.CurrentPending = pendingCount - gsb.Status.CurrentInitializing = initializingCount - gsb.Status.CurrentActive = activeCount - gsb.Status.CurrentStandingBy = standingByCount - - existingCrashes := r.getExistingCrashes(gsb, crashesCount) - - // update the crashesCount status with the new value of total crashes - gsb.Status.CrashesCount = existingCrashes + crashesCount - gsb.Status.CurrentStandingByReadyDesired = fmt.Sprintf("%d/%d", standingByCount, gsb.Spec.StandingBy) - - // GameServerBuild can only be set as Unhealthy if CrashesToMarkUnhealthy has been explicitly been set by the user - if gsb.Spec.CrashesToMarkUnhealthy != nil && gsb.Status.CrashesCount >= *gsb.Spec.CrashesToMarkUnhealthy { - gsb.Status.Health = mpsv1alpha1.BuildUnhealthy - } else { - gsb.Status.Health = mpsv1alpha1.BuildHealthy - } - - if err := r.Status().Patch(ctx, gsb, patch); err != nil { - return ctrl.Result{}, err - } - } - - CurrentGameServerGauge.WithLabelValues(gsb.Name, PendingServerStatus).Set(float64(pendingCount)) - CurrentGameServerGauge.WithLabelValues(gsb.Name, InitializingServerStatus).Set(float64(initializingCount)) - CurrentGameServerGauge.WithLabelValues(gsb.Name, StandingByServerStatus).Set(float64(standingByCount)) - CurrentGameServerGauge.WithLabelValues(gsb.Name, ActiveServerStatus).Set(float64(activeCount)) - - return ctrl.Result{}, nil -} - -// SetupWithManager sets up the controller with the Manager. -func (r *GameServerBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { - if err := mgr.GetFieldIndexer().IndexField(context.Background(), &mpsv1alpha1.GameServer{}, ownerKey, func(rawObj client.Object) []string { - // grab the GameServer object, extract the owner... - gs := rawObj.(*mpsv1alpha1.GameServer) - owner := metav1.GetControllerOf(gs) - if owner == nil { - return nil - } - // ...make sure it's a GameServerBuild... 
- if owner.APIVersion != apiGVStr || owner.Kind != "GameServerBuild" { - return nil - } - - // ...and if so, return it - return []string{owner.Name} - }); err != nil { - return err - } - - return ctrl.NewControllerManagedBy(mgr). - For(&mpsv1alpha1.GameServerBuild{}). - Owns(&mpsv1alpha1.GameServer{}). - WithOptions(controller.Options{ - MaxConcurrentReconciles: runtime.NumCPU(), - }). - Complete(r) -} - -// getKeyForCrashesPerBuildMap returns the key for the map of crashes per build -// key is namespace/name -func getKeyForCrashesPerBuildMap(gsb *mpsv1alpha1.GameServerBuild) string { - return fmt.Sprintf("%s/%s", gsb.Namespace, gsb.Name) -} - -// deleteNonActiveGameServers loops through all the GameServers CRs and deletes non-Active ones -// after it sorts all of them by state -func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Context, - gsb *mpsv1alpha1.GameServerBuild, - gameServers *mpsv1alpha1.GameServerList, - totalNumberOfGameServersToDelete int) error { - // an error channel for the go routines to write errors - errCh := make(chan error, totalNumberOfGameServersToDelete) - // a waitgroup for async deletion calls - var wg sync.WaitGroup - deletionCalls := 0 - deletionStartTime := time.Now() - - // we sort the GameServers by state so that we can delete the ones that are empty state or Initializing before we delete the StandingBy ones (if needed) - // this is to make sure we don't fall below the desired number of StandingBy during scaling down - sort.Sort(ByState(gameServers.Items)) - for i := 0; i < len(gameServers.Items) && deletionCalls < totalNumberOfGameServersToDelete; i++ { - gs := gameServers.Items[i] - // we're deleting only initializing/pending/standingBy servers, never touching active - if gs.Status.State == "" || gs.Status.State == mpsv1alpha1.GameServerStateInitializing || gs.Status.State == mpsv1alpha1.GameServerStateStandingBy { - deletionCalls++ - wg.Add(1) - go func(deletionStartTime time.Time) { - defer wg.Done() - if 
err := r.deleteGameServer(ctx, &gs); err != nil { - if apierrors.IsConflict(err) { // this GameServer has been updated, skip it - return - } - errCh <- err - return - } - GameServersDeletedCounter.WithLabelValues(gsb.Name).Inc() - r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) - r.Recorder.Eventf(gsb, corev1.EventTypeNormal, "GameServer deleted", "GameServer %s deleted", gs.Name) - duration := time.Since(deletionStartTime).Milliseconds() - GameServersEndedDuration.WithLabelValues(gsb.Name).Set(float64(duration)) - }(deletionStartTime) - } - } - wg.Wait() - if len(errCh) > 0 { - return <-errCh - } - return nil -} - -// deleteGameServer deletes the provided GameServer -func (r *GameServerBuildReconciler) deleteGameServer(ctx context.Context, gs *mpsv1alpha1.GameServer) error { - // we're requesting the GameServer to be deleted to have the same ResourceVersion - // since it might have been updated (e.g. allocated) and the cache hasn't been updated yet - return r.Client.Delete(ctx, gs, &client.DeleteOptions{ - Preconditions: &metav1.Preconditions{ - ResourceVersion: &gs.ResourceVersion, - }}) -} - -// getTotalCrashes returns the total number of crashes for this GameServerBuild -func (r *GameServerBuildReconciler) getExistingCrashes(gsb *mpsv1alpha1.GameServerBuild, newCrashesCount int) int { - // try and get existing crashesCount from the map - // if it doesn't exist, create it with initial value the number of crashes we detected on this reconcile loop - key := getKeyForCrashesPerBuildMap(gsb) - val, ok := crashesPerBuild.LoadOrStore(key, newCrashesCount) - // if we have existing crashes, get the value - var existingCrashes int = 0 - if ok { - existingCrashes = val.(int) - // and store the new one - crashesPerBuild.Store(key, newCrashesCount+existingCrashes) - } - return existingCrashes -} + setPodRestartPolicyToNever(pod) + Expect(pod.Spec.RestartPolicy).To(Equal(corev1.RestartPolicyNever)) + }) + It("should generate a random name with prefix", 
func() { + prefix := "panathinaikos" + s := generateName(prefix) + Expect(s).To(HavePrefix(prefix)) + Expect(len(s)).To(BeNumerically(">", len(prefix))) + }) + It("should check if a Node is a GameServer Node", func() { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + LabelGameServerNode: "true", + }, + }, + } + Expect(isNodeGameServerNode(node)).To(BeTrue()) + node.Labels[LabelGameServerNode] = "nottrue" + Expect(isNodeGameServerNode(node)).To(BeFalse()) + }) + }) +}) diff --git a/pkg/operator/controllers/metrics.go b/pkg/operator/controllers/metrics.go index fa100d3e..25b65401 100644 --- a/pkg/operator/controllers/metrics.go +++ b/pkg/operator/controllers/metrics.go @@ -24,14 +24,6 @@ var ( }, []string{"BuildName"}, ) - GameServersCreatedDuration = registry.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: "thundernetes", - Name: "gameservers_create_duration", - Help: "Average time it took to create a the newest set of GameServers", - }, - []string{"BuildName"}, - ) GameServersStandByReconcileDuration = registry.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "thundernetes", @@ -48,22 +40,6 @@ var ( }, []string{"BuildName"}, ) - GameServersEndedDuration = registry.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: "thundernetes", - Name: "gameservers_end_duration", - Help: "Time it took to delete a set of non-active GameServers", - }, - []string{"BuildName"}, - ) - GameServersCleanUpDuration = registry.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: "thundernetes", - Name: "gameservers_clean_up_duration", - Help: "Average time it took to clean up all completed or unhealthy GameServers", - }, - []string{"BuildName"}, - ) GameServersCrashedCounter = registry.NewCounterVec( prometheus.CounterOpts{ Namespace: "thundernetes", From 20be9c03a573d1282785e937cfd34d4894b7eca4 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 11:29:30 -0700 Subject: [PATCH 12/22] Update dashboard --- samples/grafana/dashboard.json | 228 
+++------------------------------ 1 file changed, 16 insertions(+), 212 deletions(-) diff --git a/samples/grafana/dashboard.json b/samples/grafana/dashboard.json index ec0b0523..f0d2f33c 100644 --- a/samples/grafana/dashboard.json +++ b/samples/grafana/dashboard.json @@ -359,8 +359,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -465,8 +464,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -558,8 +556,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -651,8 +648,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -744,8 +740,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -866,8 +861,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -962,8 +956,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1056,8 +1049,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1150,8 +1142,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1622,7 +1613,7 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "description": "This panel consists of the average initialization time among all of the new GameServers from a batch of of GameServers (consisting of 1 or more) needing to be created in the reconciliation loop. 
", + "description": "This panel consists of the 5-minute average time among all of the new GameServers to reach StandBy.", "fieldConfig": { "defaults": { "color": { @@ -1682,6 +1673,7 @@ "y": 27 }, "id": 37, + "interval": "0.01", "options": { "legend": { "calcs": [], @@ -1701,7 +1693,7 @@ "uid": "P1809F7CD0C75ACF3" }, "editorMode": "builder", - "expr": "thundernetes_gameservers_create_duration", + "expr": "avg_over_time(thundernetes_gameservers_create_duration[5m])", "hide": false, "legendFormat": "{{BuildName}}-{{instance}}", "range": true, @@ -1804,197 +1796,9 @@ ], "title": "GameServer StandBy Reconciliation Time", "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "description": "During a reconciliation loop, all GameServers marked as non-healthy or completed will have the time difference between now and their estimated time until deletion summed up and averaged.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Time (milliseconds)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "opacity", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 3, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 35 - }, - "id": 39, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": 
"single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "editorMode": "builder", - "expr": "thundernetes_gameservers_clean_up_duration", - "hide": false, - "legendFormat": "{{BuildName}}-{{instance}}", - "range": true, - "refId": "A" - } - ], - "title": "Average Estimated Time Until Hanging GameServer Clean Up", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "description": "The time it has taken to begin the scaling down/deletion of every inactive GameServer given a decrease in the number of allowed StandBy servers or a breach past the maximum allowed.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Time (milliseconds)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "opacity", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 3, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 35 - }, - "id": 40, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "editorMode": "builder", - "expr": "thundernetes_gameservers_end_duration", - "hide": false, - 
"legendFormat": "{{BuildName}}-{{instance}}", - "range": true, - "refId": "A" - } - ], - "title": "GameServer Scale Down Time", - "type": "timeseries" } ], - "refresh": "5s", + "refresh": false, "schemaVersion": 37, "style": "dark", "tags": [], @@ -2002,13 +1806,13 @@ "list": [] }, "time": { - "from": "now-1h", - "to": "now" + "from": "2022-09-25T15:57:24.300Z", + "to": "2022-09-25T17:57:24.300Z" }, "timepicker": {}, "timezone": "", "title": "Thundernetes GameServer Demo", "uid": "T9KjuZOnz", - "version": 4, + "version": 3, "weekStart": "" } \ No newline at end of file From e0fc97863012733ad147959341129a564e6092ab Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 11:46:15 -0700 Subject: [PATCH 13/22] Revert test --- .../controllers/gameserverbuild_controller.go | 553 ++++++++++++------ 1 file changed, 376 insertions(+), 177 deletions(-) diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 21bf1dc0..f3d5c069 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -1,194 +1,393 @@ +/* +Copyright 2021. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package controllers import ( + "context" "fmt" + "math" + "runtime" + "sort" + "sync" + "time" mpsv1alpha1 "github.com/playfab/thundernetes/pkg/operator/api/v1alpha1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +// a map to hold the number of crashes per Build +// concurrent since the reconcile loop can be called multiple times for different GameServerBuilds +// key is namespace/name of the GameServerBuild +// value is the number of crashes +var crashesPerBuild = sync.Map{} - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" +const ( + // maximum number of GameServers to create per reconcile loop + // we have this in place since each create call is synchronous and we want to minimize the time for each reconcile loop + maxNumberOfGameServersToAdd = 20 + // maximum number of GameServers to delete per reconcile loop + maxNumberOfGameServersToDelete = 20 ) -var _ = Describe("Utilities tests", func() { - Context("Testing Utilities", func() { - It("should allocate hostPorts when creating game servers", func() { - pod := &corev1.Pod{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "nginx", - Ports: []corev1.ContainerPort{ - { - Name: "http", - ContainerPort: 80, - HostPort: 123, - }, - { - Name: "https", - ContainerPort: 443, - HostPort: 456, - }, - }, - }, - }, - }, - } - s := getContainerHostPortTuples(pod) - Expect(s).To(Equal("80:123,443:456")) - }) - It("should find if string is contained in the string slice", func() { - Expect(containsString([]string{"foo"}, "foo")).To(BeTrue()) - Expect(containsString([]string{"foo"}, "bar")).To(BeFalse()) - }) 
- It("should find if containerName/portName tuple is contained in the PortToExpose slice", func() { - p := []int32{ - 5, 10, 15, - } - Expect(sliceContainsPortToExpose(p, 5)).To(BeTrue()) - Expect(sliceContainsPortToExpose(p, 10)).To(BeTrue()) - Expect(sliceContainsPortToExpose(p, 16)).To(BeFalse()) - }) - It("should return env variables for GameServer", func() { - gs := &mpsv1alpha1.GameServer{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-gs", - Namespace: "test-ns", - }, - Spec: mpsv1alpha1.GameServerSpec{ - TitleID: "test-title", - BuildID: "test-build", - }, - } - s := getGameServerEnvVariables(gs, false) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_NAME", Value: "test-gs"})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_NAMESPACE", Value: "test-ns"})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_BUILD_ID", Value: "test-build"})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_TITLE_ID", Value: "test-title"})).To(BeTrue()) - }) - It("should return env variables for InitContainer", func() { - gs := &mpsv1alpha1.GameServer{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-GameServer", - }, - Spec: mpsv1alpha1.GameServerSpec{ - TitleID: "test-title", - BuildID: "test-build", - PortsToExpose: []int32{ - 80, 443, - }, - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "container1", - Ports: []corev1.ContainerPort{ - { - Name: "port1", - ContainerPort: 80, - HostPort: 123, - }, - { - Name: "port2", - ContainerPort: 443, - HostPort: 456, - }, - { - Name: "port3", - ContainerPort: 8080, - // this is not on GameServer.PortsToExpose so there will be no HostPost - }, - }, - }, - }, - }, - }, - }, +// Simple async map implementation using a mutex +// used to manage the expected GameServer creations and deletions +type MutexMap struct { + data map[string]interface{} + mu sync.Mutex +} + +// GameServerBuildReconciler reconciles a GameServerBuild 
object +type GameServerBuildReconciler struct { + client.Client + Scheme *k8sruntime.Scheme + PortRegistry *PortRegistry + Recorder record.EventRecorder + expectations *GameServerExpectations +} + +// NewGameServerBuildReconciler returns a pointer to a new GameServerBuildReconciler +func NewGameServerBuildReconciler(mgr manager.Manager, portRegistry *PortRegistry) *GameServerBuildReconciler { + cl := mgr.GetClient() + return &GameServerBuildReconciler{ + Client: cl, + Scheme: mgr.GetScheme(), + PortRegistry: portRegistry, + Recorder: mgr.GetEventRecorderFor("GameServerBuild"), + expectations: NewGameServerExpectations(cl), + } +} + +//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameserverbuilds,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameserverbuilds/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameserverbuilds/finalizers,verbs=update +//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameservers,verbs=get;list;watch +//+kubebuilder:rbac:groups=mps.playfab.com,resources=gameservers/status,verbs=get +//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. 
+// For more details, check Reconcile and its Result here: +// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile +func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := log.FromContext(ctx) + + var gsb mpsv1alpha1.GameServerBuild + if err := r.Get(ctx, req.NamespacedName, &gsb); err != nil { + if apierrors.IsNotFound(err) { + log.Info("Unable to fetch GameServerBuild - it is being deleted") + // GameServerBuild is being deleted so clear its entry from the crashesPerBuild map + // no-op if the entry is not present + crashesPerBuild.Delete(getKeyForCrashesPerBuildMap(&gsb)) + return ctrl.Result{}, nil + } + log.Error(err, "unable to fetch gameServerBuild") + return ctrl.Result{}, err + } + + // if GameServerBuild is unhealthy and current crashes equal or more than CrashesToMarkUnhealthy, do nothing more + if gsb.Status.Health == mpsv1alpha1.BuildUnhealthy && + gsb.Spec.CrashesToMarkUnhealthy != nil && + gsb.Status.CrashesCount >= *gsb.Spec.CrashesToMarkUnhealthy { + log.Info("GameServerBuild is Unhealthy, do nothing") + r.Recorder.Event(&gsb, corev1.EventTypeNormal, "Unhealthy Build", "GameServerBuild is Unhealthy, stopping reconciliation") + return ctrl.Result{}, nil + } + + deletionsCompleted, err := r.expectations.gameServersUnderDeletionWereDeleted(ctx, &gsb) + if err != nil { + return ctrl.Result{}, err + } + if !deletionsCompleted { + return ctrl.Result{}, nil + } + + creationsCompleted, err := r.expectations.gameServersUnderCreationWereCreated(ctx, &gsb) + if err != nil { + return ctrl.Result{}, err + } + if !creationsCompleted { + return ctrl.Result{}, nil + } + + // get the gameServers that are owned by this GameServerBuild + var gameServers mpsv1alpha1.GameServerList + if err := r.List(ctx, &gameServers, client.InNamespace(req.Namespace), client.MatchingFields{ownerKey: req.Name}); err != nil { + // there has been an error + return ctrl.Result{}, err + } + + // calculate 
counts by state so we can update .status accordingly + var activeCount, standingByCount, crashesCount, initializingCount, pendingCount int + + for i := 0; i < len(gameServers.Items); i++ { + gs := gameServers.Items[i] + + if gs.Status.State == "" && gs.Status.Health != mpsv1alpha1.GameServerUnhealthy { // under normal circumstances, Health will also be equal to "" + pendingCount++ + } else if gs.Status.State == mpsv1alpha1.GameServerStateInitializing && gs.Status.Health == mpsv1alpha1.GameServerHealthy { + initializingCount++ + } else if gs.Status.State == mpsv1alpha1.GameServerStateStandingBy && gs.Status.Health == mpsv1alpha1.GameServerHealthy { + standingByCount++ + } else if gs.Status.State == mpsv1alpha1.GameServerStateActive && gs.Status.Health == mpsv1alpha1.GameServerHealthy { + activeCount++ + } else if gs.Status.State == mpsv1alpha1.GameServerStateGameCompleted && gs.Status.Health == mpsv1alpha1.GameServerHealthy { + // game server process exited with code 0 + if err := r.Delete(ctx, &gs); err != nil { + return ctrl.Result{}, err } - s := getInitContainerEnvVariables(gs, false) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "HEARTBEAT_ENDPOINT_PORT", Value: fmt.Sprintf("%d", DaemonSetPort)})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "GSDK_CONFIG_FILE", Value: GsdkConfigFile})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_SHARED_CONTENT_FOLDER", Value: GameSharedContentDirectory})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "CERTIFICATE_FOLDER", Value: CertificatesDirectory})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_SERVER_LOG_DIRECTORY", Value: LogDirectory})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_NAME", Value: gs.Name})).To(BeTrue()) - Expect(testVerifyEnv(s, corev1.EnvVar{Name: "PF_GAMESERVER_PORTS", Value: "port1,80,123?port2,443,456"})).To(BeTrue()) - }) - It("should attach data volume", func() { - container := &corev1.Container{} - 
attachDataVolumeOnContainer(container, false) - Expect(container.VolumeMounts[len(container.VolumeMounts)-1]).To(BeEquivalentTo(corev1.VolumeMount{ - Name: DataVolumeName, - MountPath: DataVolumeMountPath, - })) - }) - It("should create data volume", func() { - pod := &corev1.Pod{} - createDataVolumeOnPod(pod) - Expect(pod.Spec.Volumes[len(pod.Spec.Volumes)-1]).To(BeEquivalentTo(corev1.Volume{ - Name: DataVolumeName, - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - })) - }) - It("should attach init container", func() { - gs := &mpsv1alpha1.GameServer{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-gs", - Namespace: "test-ns", - }, - Spec: mpsv1alpha1.GameServerSpec{ - TitleID: "test-title", - BuildID: "test-build", - }, + GameServersSessionEndedCounter.WithLabelValues(gsb.Name).Inc() + r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) + r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Exited", "GameServer %s session completed", gs.Name) + } else if gs.Status.State == mpsv1alpha1.GameServerStateCrashed { + // game server process exited with code != 0 (crashed) + crashesCount++ + if err := r.Delete(ctx, &gs); err != nil { + return ctrl.Result{}, err } - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - }, + GameServersCrashedCounter.WithLabelValues(gsb.Name).Inc() + r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) + r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Unhealthy", "GameServer %s was deleted because it became unhealthy, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) + } else if gs.Status.Health == mpsv1alpha1.GameServerUnhealthy { + // all cases where the game server was marked as Unhealthy + crashesCount++ + if err := r.Delete(ctx, &gs); err != nil { + return ctrl.Result{}, err } - var testInitContainerImage string = "testInitContainerImage" - attachInitContainer(gs, pod, testInitContainerImage, false) - 
Expect(pod.Spec.InitContainers[len(pod.Spec.InitContainers)-1]).To(BeEquivalentTo(corev1.Container{ - Name: InitContainerName, - ImagePullPolicy: corev1.PullIfNotPresent, - Image: testInitContainerImage, - Env: getInitContainerEnvVariables(gs, false), - VolumeMounts: []corev1.VolumeMount{ - { - Name: DataVolumeName, - MountPath: DataVolumeMountPath, - }, - }, - })) - }) - It("shoud modify restart policy", func() { - pod := &corev1.Pod{ - Spec: corev1.PodSpec{}, + GameServersUnhealthyCounter.WithLabelValues(gsb.Name).Inc() + r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) + r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Crashed", "GameServer %s was deleted because it crashed, state: %s, health: %s", gs.Name, gs.Status.State, gs.Status.Health) + } + } + + // calculate the total amount of servers not in the active state + nonActiveGameServersCount := standingByCount + initializingCount + pendingCount + + // Evaluate desired number of servers against actual + var totalNumberOfGameServersToDelete int = 0 + + // user has decreased standingBy numbers + if nonActiveGameServersCount > gsb.Spec.StandingBy { + totalNumberOfGameServersToDelete += int(math.Min(float64(nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete)) + } + // we also need to check if we are above the max + // this can happen if the user modifies the spec.Max during the GameServerBuild's lifetime + if nonActiveGameServersCount+activeCount > gsb.Spec.Max { + totalNumberOfGameServersToDelete = int(math.Min(float64(totalNumberOfGameServersToDelete+(nonActiveGameServersCount+activeCount-gsb.Spec.Max)), maxNumberOfGameServersToDelete)) + } + if totalNumberOfGameServersToDelete > 0 { + err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete) + if err != nil { + return ctrl.Result{}, err + } + } + + // we are in need of standingBy servers, so we're creating them here + // we're also limiting the number of game servers that are 
created to avoid issues like this https://github.com/kubernetes-sigs/controller-runtime/issues/1782 + // we attempt to create the missing number of game servers, but we don't want to create more than the max + // an error channel for the go routines to write errors + errCh := make(chan error, maxNumberOfGameServersToAdd) + + // Time how long it takes to trigger new standby gameservers + standByReconcileStartTime := time.Now() + // a waitgroup for async create calls + var wg sync.WaitGroup + for i := 0; i < gsb.Spec.StandingBy-nonActiveGameServersCount && + i+nonActiveGameServersCount+activeCount < gsb.Spec.Max && + i < maxNumberOfGameServersToAdd; i++ { + wg.Add(1) + go func(standByStartTime time.Time) { + defer wg.Done() + newgs, err := NewGameServerForGameServerBuild(&gsb, r.PortRegistry) + if err != nil { + errCh <- err + return } - setPodRestartPolicyToNever(pod) - Expect(pod.Spec.RestartPolicy).To(Equal(corev1.RestartPolicyNever)) - }) - It("should generate a random name with prefix", func() { - prefix := "panathinaikos" - s := generateName(prefix) - Expect(s).To(HavePrefix(prefix)) - Expect(len(s)).To(BeNumerically(">", len(prefix))) - }) - It("should check if a Node is a GameServer Node", func() { - node := &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - LabelGameServerNode: "true", - }, - }, + if err := r.Create(ctx, newgs); err != nil { + errCh <- err + return } - Expect(isNodeGameServerNode(node)).To(BeTrue()) - node.Labels[LabelGameServerNode] = "nottrue" - Expect(isNodeGameServerNode(node)).To(BeFalse()) - }) - }) -}) + r.expectations.addGameServerToUnderCreationMap(gsb.Name, newgs.Name) + GameServersCreatedCounter.WithLabelValues(gsb.Name).Inc() + r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Creating", "Creating GameServer %s", newgs.Name) + GameServersStandByReconcileDuration.WithLabelValues(gsb.Name).Set(float64(time.Since(standByStartTime).Milliseconds())) + }(standByReconcileStartTime) + } + wg.Wait() + + if 
len(errCh) > 0 { + return ctrl.Result{}, <-errCh + } + + return r.updateStatus(ctx, &gsb, pendingCount, initializingCount, standingByCount, activeCount, crashesCount) +} + +// updateStatus patches the GameServerBuild's status only if the status of at least one of its GameServers has changed +func (r *GameServerBuildReconciler) updateStatus(ctx context.Context, gsb *mpsv1alpha1.GameServerBuild, pendingCount, initializingCount, standingByCount, activeCount, crashesCount int) (ctrl.Result, error) { + // patch GameServerBuild status only if one of the fields has changed + if gsb.Status.CurrentPending != pendingCount || + gsb.Status.CurrentInitializing != initializingCount || + gsb.Status.CurrentActive != activeCount || + gsb.Status.CurrentStandingBy != standingByCount || + crashesCount > 0 { + + patch := client.MergeFrom(gsb.DeepCopy()) + + gsb.Status.CurrentPending = pendingCount + gsb.Status.CurrentInitializing = initializingCount + gsb.Status.CurrentActive = activeCount + gsb.Status.CurrentStandingBy = standingByCount + + existingCrashes := r.getExistingCrashes(gsb, crashesCount) + + // update the crashesCount status with the new value of total crashes + gsb.Status.CrashesCount = existingCrashes + crashesCount + gsb.Status.CurrentStandingByReadyDesired = fmt.Sprintf("%d/%d", standingByCount, gsb.Spec.StandingBy) + + // GameServerBuild can only be set as Unhealthy if CrashesToMarkUnhealthy has been explicitly been set by the user + if gsb.Spec.CrashesToMarkUnhealthy != nil && gsb.Status.CrashesCount >= *gsb.Spec.CrashesToMarkUnhealthy { + gsb.Status.Health = mpsv1alpha1.BuildUnhealthy + } else { + gsb.Status.Health = mpsv1alpha1.BuildHealthy + } + + if err := r.Status().Patch(ctx, gsb, patch); err != nil { + return ctrl.Result{}, err + } + } + + CurrentGameServerGauge.WithLabelValues(gsb.Name, PendingServerStatus).Set(float64(pendingCount)) + CurrentGameServerGauge.WithLabelValues(gsb.Name, InitializingServerStatus).Set(float64(initializingCount)) + 
CurrentGameServerGauge.WithLabelValues(gsb.Name, StandingByServerStatus).Set(float64(standingByCount)) + CurrentGameServerGauge.WithLabelValues(gsb.Name, ActiveServerStatus).Set(float64(activeCount)) + + return ctrl.Result{}, nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *GameServerBuildReconciler) SetupWithManager(mgr ctrl.Manager) error { + if err := mgr.GetFieldIndexer().IndexField(context.Background(), &mpsv1alpha1.GameServer{}, ownerKey, func(rawObj client.Object) []string { + // grab the GameServer object, extract the owner... + gs := rawObj.(*mpsv1alpha1.GameServer) + owner := metav1.GetControllerOf(gs) + if owner == nil { + return nil + } + // ...make sure it's a GameServerBuild... + if owner.APIVersion != apiGVStr || owner.Kind != "GameServerBuild" { + return nil + } + + // ...and if so, return it + return []string{owner.Name} + }); err != nil { + return err + } + + return ctrl.NewControllerManagedBy(mgr). + For(&mpsv1alpha1.GameServerBuild{}). + Owns(&mpsv1alpha1.GameServer{}). + WithOptions(controller.Options{ + MaxConcurrentReconciles: runtime.NumCPU(), + }). 
+ Complete(r) +} + +// getKeyForCrashesPerBuildMap returns the key for the map of crashes per build +// key is namespace/name +func getKeyForCrashesPerBuildMap(gsb *mpsv1alpha1.GameServerBuild) string { + return fmt.Sprintf("%s/%s", gsb.Namespace, gsb.Name) +} + +// deleteNonActiveGameServers loops through all the GameServers CRs and deletes non-Active ones +// after it sorts all of them by state +func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Context, + gsb *mpsv1alpha1.GameServerBuild, + gameServers *mpsv1alpha1.GameServerList, + totalNumberOfGameServersToDelete int) error { + // an error channel for the go routines to write errors + errCh := make(chan error, totalNumberOfGameServersToDelete) + // a waitgroup for async deletion calls + var wg sync.WaitGroup + deletionCalls := 0 + deletionStartTime := time.Now() + + // we sort the GameServers by state so that we can delete the ones that are empty state or Initializing before we delete the StandingBy ones (if needed) + // this is to make sure we don't fall below the desired number of StandingBy during scaling down + sort.Sort(ByState(gameServers.Items)) + for i := 0; i < len(gameServers.Items) && deletionCalls < totalNumberOfGameServersToDelete; i++ { + gs := gameServers.Items[i] + // we're deleting only initializing/pending/standingBy servers, never touching active + if gs.Status.State == "" || gs.Status.State == mpsv1alpha1.GameServerStateInitializing || gs.Status.State == mpsv1alpha1.GameServerStateStandingBy { + deletionCalls++ + wg.Add(1) + go func(deletionStartTime time.Time) { + defer wg.Done() + if err := r.deleteGameServer(ctx, &gs); err != nil { + if apierrors.IsConflict(err) { // this GameServer has been updated, skip it + return + } + errCh <- err + return + } + GameServersDeletedCounter.WithLabelValues(gsb.Name).Inc() + r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) + r.Recorder.Eventf(gsb, corev1.EventTypeNormal, "GameServer deleted", "GameServer %s 
deleted", gs.Name) + }(deletionStartTime) + } + } + wg.Wait() + if len(errCh) > 0 { + return <-errCh + } + return nil +} + +// deleteGameServer deletes the provided GameServer +func (r *GameServerBuildReconciler) deleteGameServer(ctx context.Context, gs *mpsv1alpha1.GameServer) error { + // we're requesting the GameServer to be deleted to have the same ResourceVersion + // since it might have been updated (e.g. allocated) and the cache hasn't been updated yet + return r.Client.Delete(ctx, gs, &client.DeleteOptions{ + Preconditions: &metav1.Preconditions{ + ResourceVersion: &gs.ResourceVersion, + }}) +} + +// getTotalCrashes returns the total number of crashes for this GameServerBuild +func (r *GameServerBuildReconciler) getExistingCrashes(gsb *mpsv1alpha1.GameServerBuild, newCrashesCount int) int { + // try and get existing crashesCount from the map + // if it doesn't exist, create it with initial value the number of crashes we detected on this reconcile loop + key := getKeyForCrashesPerBuildMap(gsb) + val, ok := crashesPerBuild.LoadOrStore(key, newCrashesCount) + // if we have existing crashes, get the value + var existingCrashes int = 0 + if ok { + existingCrashes = val.(int) + // and store the new one + crashesPerBuild.Store(key, newCrashesCount+existingCrashes) + } + return existingCrashes +} From a67fbb926df45cfd329038ff90567ace276bef97 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 11:48:11 -0700 Subject: [PATCH 14/22] Cleanup deletes --- pkg/operator/controllers/gameserverbuild_controller.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index f3d5c069..794946fc 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -333,7 +333,6 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte // a waitgroup for 
async deletion calls var wg sync.WaitGroup deletionCalls := 0 - deletionStartTime := time.Now() // we sort the GameServers by state so that we can delete the ones that are empty state or Initializing before we delete the StandingBy ones (if needed) // this is to make sure we don't fall below the desired number of StandingBy during scaling down @@ -344,7 +343,7 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte if gs.Status.State == "" || gs.Status.State == mpsv1alpha1.GameServerStateInitializing || gs.Status.State == mpsv1alpha1.GameServerStateStandingBy { deletionCalls++ wg.Add(1) - go func(deletionStartTime time.Time) { + go func() { defer wg.Done() if err := r.deleteGameServer(ctx, &gs); err != nil { if apierrors.IsConflict(err) { // this GameServer has been updated, skip it @@ -356,7 +355,7 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte GameServersDeletedCounter.WithLabelValues(gsb.Name).Inc() r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) r.Recorder.Eventf(gsb, corev1.EventTypeNormal, "GameServer deleted", "GameServer %s deleted", gs.Name) - }(deletionStartTime) + }() } } wg.Wait() From 0cd6e86b4097a08a3da3438f15b37e0bf8ac1035 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 11:49:24 -0700 Subject: [PATCH 15/22] Minor tweaks --- pkg/operator/controllers/gameserverbuild_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 794946fc..72d115f2 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -196,7 +196,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // we also need to check if we are above the max // this can happen if the user modifies the spec.Max during the GameServerBuild's lifetime if 
nonActiveGameServersCount+activeCount > gsb.Spec.Max { - totalNumberOfGameServersToDelete = int(math.Min(float64(totalNumberOfGameServersToDelete+(nonActiveGameServersCount+activeCount-gsb.Spec.Max)), maxNumberOfGameServersToDelete)) + totalNumberOfGameServersToDelete += int(math.Min(float64(totalNumberOfGameServersToDelete+(nonActiveGameServersCount+activeCount-gsb.Spec.Max)), maxNumberOfGameServersToDelete)) } if totalNumberOfGameServersToDelete > 0 { err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete) From c4217f8e87f8e3025d6e8e16b82d93a28a6237c5 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 12:02:39 -0700 Subject: [PATCH 16/22] Conditional --- cmd/nodeagent/nodeagentmanager.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmd/nodeagent/nodeagentmanager.go b/cmd/nodeagent/nodeagentmanager.go index 27871497..078b471e 100644 --- a/cmd/nodeagent/nodeagentmanager.go +++ b/cmd/nodeagent/nodeagentmanager.go @@ -458,7 +458,10 @@ func (n *NodeAgentManager) updateHealthAndStateIfNeeded(ctx context.Context, hb status.ReachedInitializingOn = &now } else if hb.CurrentGameState == GameStateStandingBy { status.ReachedStandingByOn = &now - GameServerCreateDuration.WithLabelValues(gsd.BuildName).Set(getStateDuration(status.ReachedStandingByOn, gsd.CreationTimeStamp)) + // emit duration if creationTimeStamp was able to be saved + if gsd.CreationTimeStamp != nil { + GameServerCreateDuration.WithLabelValues(gsd.BuildName).Set(getStateDuration(status.ReachedStandingByOn, gsd.CreationTimeStamp)) + } } } From 77b1c88273866b0dd32105bfa838934dac8329a6 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 21:32:15 -0700 Subject: [PATCH 17/22] PR Suggested changes --- cmd/nodeagent/nodeagentmanager.go | 11 +- cmd/nodeagent/types.go | 27 +++-- cmd/nodeagent/utilities.go | 12 --- docs/howtos/monitoring.md | 5 +- pkg/operator/api/v1alpha1/gameserver_types.go | 2 - samples/grafana/dashboard.json | 
101 +++++++++++++++++- 6 files changed, 121 insertions(+), 37 deletions(-) diff --git a/cmd/nodeagent/nodeagentmanager.go b/cmd/nodeagent/nodeagentmanager.go index 078b471e..351520fc 100644 --- a/cmd/nodeagent/nodeagentmanager.go +++ b/cmd/nodeagent/nodeagentmanager.go @@ -220,15 +220,12 @@ func (n *NodeAgentManager) gameServerCreatedOrUpdated(obj *unstructured.Unstruct // or that the NodeAgent crashed and we're having a new instance // in any case, we're adding the details to the map logger.Infof("GameServer %s/%s does not exist in cache, we're creating it", gameServerNamespace, gameServerName) - // save actual gameserver creation time - creationTimeStamp := obj.GetCreationTimestamp() gsdi = &GameServerInfo{ GameServerNamespace: gameServerNamespace, Mutex: &sync.RWMutex{}, GsUid: obj.GetUID(), CreationTime: n.nowFunc().UnixMilli(), - CreationTimeStamp: &creationTimeStamp, BuildName: gameServerBuildName, MarkedUnhealthy: false, // we're not adding details about health/state since the NodeAgent might have crashed @@ -456,12 +453,12 @@ func (n *NodeAgentManager) updateHealthAndStateIfNeeded(ctx context.Context, hb now := metav1.Time{Time: n.nowFunc()} if hb.CurrentGameState == GameStateInitializing { status.ReachedInitializingOn = &now + timeDif := time.Now().UnixMilli() - gsd.CreationTime + GameServerReachedInitializingDuration.WithLabelValues(gsd.BuildName).Set(float64(timeDif)) } else if hb.CurrentGameState == GameStateStandingBy { status.ReachedStandingByOn = &now - // emit duration if creationTimeStamp was able to be saved - if gsd.CreationTimeStamp != nil { - GameServerCreateDuration.WithLabelValues(gsd.BuildName).Set(getStateDuration(status.ReachedStandingByOn, gsd.CreationTimeStamp)) - } + timeDif := time.Now().UnixMilli() - gsd.CreationTime + GameServerReachedStandingByDuration.WithLabelValues(gsd.BuildName).Set(float64(timeDif)) } } diff --git a/cmd/nodeagent/types.go b/cmd/nodeagent/types.go index b3781d8e..d09592c4 100644 --- a/cmd/nodeagent/types.go +++ 
b/cmd/nodeagent/types.go @@ -5,7 +5,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" ) @@ -60,11 +59,20 @@ var ( Help: "Number of connected players per GameServer", }, []string{"namespace", "ServerName", "BuildName"}) - GameServerCreateDuration = promauto.NewGaugeVec( + GameServerReachedStandingByDuration = promauto.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "thundernetes", - Name: "gameserver_create_duration", - Help: "Time taken to create a GameServers", + Name: "gameserver_standing_by_duration", + Help: "Time taken for a GameServer to reach StandingBy", + }, + []string{"BuildName"}, + ) + + GameServerReachedInitializingDuration = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "thundernetes", + Name: "gameserver_initialization_duration", + Help: "Time taken for a GameServer to reach initialization", }, []string{"BuildName"}, ) @@ -111,10 +119,9 @@ type GameServerInfo struct { GameServerNamespace string ConnectedPlayersCount int Mutex *sync.RWMutex - GsUid types.UID // UID of the GameServer object - CreationTime int64 // time when this GameServerInfo was created in the nodeagent - CreationTimeStamp *metav1.Time // time when the GameServer was created - LastHeartbeatTime int64 // time since the nodeagent received a heartbeat from this GameServer - MarkedUnhealthy bool // if the GameServer was marked unhealthy by a heartbeat condition, used to avoid repeating the patch - BuildName string // the name of the GameServerBuild that this GameServer belongs to + GsUid types.UID // UID of the GameServer object + CreationTime int64 // time when this GameServerInfo was created in the nodeagent + LastHeartbeatTime int64 // time since the nodeagent received a heartbeat from this GameServer + MarkedUnhealthy bool // if the GameServer was marked unhealthy by a heartbeat 
condition, used to avoid repeating the patch + BuildName string // the name of the GameServerBuild that this GameServer belongs to } diff --git a/cmd/nodeagent/utilities.go b/cmd/nodeagent/utilities.go index 6dc8cb6f..e66231fc 100644 --- a/cmd/nodeagent/utilities.go +++ b/cmd/nodeagent/utilities.go @@ -3,15 +3,12 @@ package main import ( "errors" "fmt" - "math" "net/http" "os" "strconv" "strings" - "time" log "github.com/sirupsen/logrus" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/client-go/dynamic" "k8s.io/client-go/rest" @@ -107,15 +104,6 @@ func getLogger(gameServerName, gameServerNamespace string) *log.Entry { return log.WithFields(log.Fields{"GameServerName": gameServerName, "GameServerNamespace": gameServerNamespace}) } -// getStateDuration determine whether to use an existing saved time variables or the current time for state duration -func getStateDuration(endTime *metav1.Time, startTime *metav1.Time) float64 { - // If the end time state is missing, use the current time - if endTime == nil { - return math.Abs(float64(time.Since(startTime.Time).Milliseconds())) - } - return math.Abs(float64(endTime.Time.Sub(startTime.Time).Milliseconds())) -} - // sanitize removes new line characters from the string // https://codeql.github.com/codeql-query-help/go/go-log-injection/ func sanitize(s string) string { diff --git a/docs/howtos/monitoring.md b/docs/howtos/monitoring.md index e540e45a..e8478569 100644 --- a/docs/howtos/monitoring.md +++ b/docs/howtos/monitoring.md @@ -84,15 +84,14 @@ There is a custom Grafana dashboard example that visualizes some of this data in | --- | --- | --- | | gameserver_states | Gauge | nodeagent | | connected_players | Gauge | nodeagent | +| gameserver_initialization_duration | Gauge | nodeagent | +| gameserver_standing_by_duration | Gauge | nodeagent | | gameservers_current_state_per_build | Gauge | controller-manager | | gameservers_created_total | Counter | 
controller-manager | -| gameservers_create_duration | Gauge | controller-manager | | gameservers_reconcile_standby_duration | Gauge | controller-manager | | gameservers_sessionended_total | Counter | controller-manager | | gameservers_crashed_total | Counter | controller-manager | | gameservers_deleted_total | Counter | controller-manager | -| gameservers_end_duration | Gauge | controller-manager | -| gameservers_clean_up_duration | Gauge | controller-manager | | allocations_total | Counter | controller-manager | ## More pictures diff --git a/pkg/operator/api/v1alpha1/gameserver_types.go b/pkg/operator/api/v1alpha1/gameserver_types.go index 52030bef..554ff2ea 100644 --- a/pkg/operator/api/v1alpha1/gameserver_types.go +++ b/pkg/operator/api/v1alpha1/gameserver_types.go @@ -80,8 +80,6 @@ type GameServerStatus struct { Health GameServerHealth `json:"health,omitempty"` // State defines the state of the game server (Initializing, StandingBy, Active etc.) State GameServerState `json:"state,omitempty"` - // The Previously known manually set state - PrevState GameServerState `json:"prevState,omitempty"` // PublicIP is the PublicIP of the game server PublicIP string `json:"publicIP,omitempty"` // Ports is a concatenated list of the ports this game server listens to diff --git a/samples/grafana/dashboard.json b/samples/grafana/dashboard.json index f0d2f33c..e29940a3 100644 --- a/samples/grafana/dashboard.json +++ b/samples/grafana/dashboard.json @@ -1693,7 +1693,7 @@ "uid": "P1809F7CD0C75ACF3" }, "editorMode": "builder", - "expr": "avg_over_time(thundernetes_gameservers_create_duration[5m])", + "expr": "avg_over_time(thundernetes_gameserver_standing_by_duration[5m])", "hide": false, "legendFormat": "{{BuildName}}-{{instance}}", "range": true, @@ -1708,7 +1708,7 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "description": "The time it has taken to begin the initialization of every needed GameServer in the latest reconciliation loop.", + "description": "This panel 
consists of the 5-minute average time among all of the new GameServers to reach Initialization.", "fieldConfig": { "defaults": { "color": { @@ -1767,6 +1767,101 @@ "x": 12, "y": 27 }, + "id": 39, + "interval": "0.01", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "editorMode": "builder", + "expr": "avg_over_time(thundernetes_gameserver_initialization_duration[5m])", + "hide": false, + "legendFormat": "{{BuildName}}-{{instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Average Time Until GameServer Reaches Initialization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P1809F7CD0C75ACF3" + }, + "description": "The time it has taken to begin the initialization of every needed GameServer in the latest reconciliation loop.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time (milliseconds)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, "id": 38, "options": { "legend": { @@ -1813,6 +1908,6 @@ "timezone": "", 
"title": "Thundernetes GameServer Demo", "uid": "T9KjuZOnz", - "version": 3, + "version": 4, "weekStart": "" } \ No newline at end of file From feab86c9674a37cf533df1730d1378a764f03f8e Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 22:10:50 -0700 Subject: [PATCH 18/22] Remove spacing added to gameserverbuild --- pkg/operator/controllers/gameserverbuild_controller.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 72d115f2..e3d9ff52 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -141,7 +141,6 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // calculate counts by state so we can update .status accordingly var activeCount, standingByCount, crashesCount, initializingCount, pendingCount int - for i := 0; i < len(gameServers.Items); i++ { gs := gameServers.Items[i] @@ -158,7 +157,6 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.Delete(ctx, &gs); err != nil { return ctrl.Result{}, err } - GameServersSessionEndedCounter.WithLabelValues(gsb.Name).Inc() r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name) r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Exited", "GameServer %s session completed", gs.Name) @@ -188,7 +186,6 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Evaluate desired number of servers against actual var totalNumberOfGameServersToDelete int = 0 - // user has decreased standingBy numbers if nonActiveGameServersCount > gsb.Spec.StandingBy { totalNumberOfGameServersToDelete += int(math.Min(float64(nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete)) @@ -210,7 +207,6 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // we attempt to create the 
missing number of game servers, but we don't want to create more than the max // an error channel for the go routines to write errors errCh := make(chan error, maxNumberOfGameServersToAdd) - // Time how long it takes to trigger new standby gameservers standByReconcileStartTime := time.Now() // a waitgroup for async create calls @@ -237,7 +233,6 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ }(standByReconcileStartTime) } wg.Wait() - if len(errCh) > 0 { return ctrl.Result{}, <-errCh } @@ -333,7 +328,6 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte // a waitgroup for async deletion calls var wg sync.WaitGroup deletionCalls := 0 - // we sort the GameServers by state so that we can delete the ones that are empty state or Initializing before we delete the StandingBy ones (if needed) // this is to make sure we don't fall below the desired number of StandingBy during scaling down sort.Sort(ByState(gameServers.Items)) From d0dbb613a71e22da8956ba2890f14eeda2236071 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Sun, 25 Sep 2022 22:13:04 -0700 Subject: [PATCH 19/22] Remove empty line in nodeagent --- cmd/nodeagent/nodeagentmanager.go | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/nodeagent/nodeagentmanager.go b/cmd/nodeagent/nodeagentmanager.go index 351520fc..d5922566 100644 --- a/cmd/nodeagent/nodeagentmanager.go +++ b/cmd/nodeagent/nodeagentmanager.go @@ -220,7 +220,6 @@ func (n *NodeAgentManager) gameServerCreatedOrUpdated(obj *unstructured.Unstruct // or that the NodeAgent crashed and we're having a new instance // in any case, we're adding the details to the map logger.Infof("GameServer %s/%s does not exist in cache, we're creating it", gameServerNamespace, gameServerName) - gsdi = &GameServerInfo{ GameServerNamespace: gameServerNamespace, Mutex: &sync.RWMutex{}, From b3e07d9d602d10d086cde6a839ef0b8886618afe Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Mon, 26 Sep 2022 06:59:59 -0700 
Subject: [PATCH 20/22] Renaming --- pkg/operator/controllers/gameserverbuild_controller.go | 8 ++++---- pkg/operator/controllers/metrics.go | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index e3d9ff52..665de672 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -208,14 +208,14 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // an error channel for the go routines to write errors errCh := make(chan error, maxNumberOfGameServersToAdd) // Time how long it takes to trigger new standby gameservers - standByReconcileStartTime := time.Now() + serverBatchCreateStartTime := time.Now() // a waitgroup for async create calls var wg sync.WaitGroup for i := 0; i < gsb.Spec.StandingBy-nonActiveGameServersCount && i+nonActiveGameServersCount+activeCount < gsb.Spec.Max && i < maxNumberOfGameServersToAdd; i++ { wg.Add(1) - go func(standByStartTime time.Time) { + go func(serverBatchCreateStartTime time.Time) { defer wg.Done() newgs, err := NewGameServerForGameServerBuild(&gsb, r.PortRegistry) if err != nil { @@ -229,8 +229,8 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.expectations.addGameServerToUnderCreationMap(gsb.Name, newgs.Name) GameServersCreatedCounter.WithLabelValues(gsb.Name).Inc() r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Creating", "Creating GameServer %s", newgs.Name) - GameServersStandByReconcileDuration.WithLabelValues(gsb.Name).Set(float64(time.Since(standByStartTime).Milliseconds())) - }(standByReconcileStartTime) + GameServerBatchCreationDuration.WithLabelValues(gsb.Name).Set(float64(time.Since(serverBatchCreateStartTime).Milliseconds())) + }(serverBatchCreateStartTime) } wg.Wait() if len(errCh) > 0 { diff --git a/pkg/operator/controllers/metrics.go 
b/pkg/operator/controllers/metrics.go index 25b65401..1eea549d 100644 --- a/pkg/operator/controllers/metrics.go +++ b/pkg/operator/controllers/metrics.go @@ -24,11 +24,11 @@ var ( }, []string{"BuildName"}, ) - GameServersStandByReconcileDuration = registry.NewGaugeVec( + GameServerBatchCreationDuration = registry.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "thundernetes", - Name: "gameservers_reconcile_standby_duration", - Help: "Time it took to begin initialization for all new GameServers", + Name: "gameservers_batch_creation_duration", + Help: "Time it took the controller to create a requested number of GameServer objects", }, []string{"BuildName"}, ) From fd2fbaec8ca1bdc1a140f89cf7be07ff772b6442 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Mon, 26 Sep 2022 07:19:58 -0700 Subject: [PATCH 21/22] Update dashboard --- samples/grafana/dashboard.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/grafana/dashboard.json b/samples/grafana/dashboard.json index e29940a3..c0cdac65 100644 --- a/samples/grafana/dashboard.json +++ b/samples/grafana/dashboard.json @@ -1803,7 +1803,7 @@ "type": "prometheus", "uid": "P1809F7CD0C75ACF3" }, - "description": "The time it has taken to begin the initialization of every needed GameServer in the latest reconciliation loop.", + "description": "The time it has taken to create a batch of GameServer objects.", "fieldConfig": { "defaults": { "color": { @@ -1882,14 +1882,14 @@ "uid": "P1809F7CD0C75ACF3" }, "editorMode": "builder", - "expr": "thundernetes_gameservers_reconcile_standby_duration", + "expr": "thundernetes_gameservers_batch_creation_duration", "hide": false, "legendFormat": "{{BuildName}}-{{instance}}", "range": true, "refId": "A" } ], - "title": "GameServer StandBy Reconciliation Time", + "title": "GameServer Object Batch Creation Time", "type": "timeseries" } ], @@ -1908,6 +1908,6 @@ "timezone": "", "title": "Thundernetes GameServer Demo", "uid": "T9KjuZOnz", - "version": 4, + "version": 
5, "weekStart": "" } \ No newline at end of file From 9318c35b3809415ad5d4a9ee541821754f2033e6 Mon Sep 17 00:00:00 2001 From: dsmith111 Date: Mon, 26 Sep 2022 10:26:40 -0700 Subject: [PATCH 22/22] Remove metric --- docs/howtos/monitoring.md | 1 - .../controllers/gameserverbuild_controller.go | 8 +- pkg/operator/controllers/metrics.go | 8 -- samples/grafana/dashboard.json | 103 +----------------- 4 files changed, 7 insertions(+), 113 deletions(-) diff --git a/docs/howtos/monitoring.md b/docs/howtos/monitoring.md index e8478569..e7a62d04 100644 --- a/docs/howtos/monitoring.md +++ b/docs/howtos/monitoring.md @@ -88,7 +88,6 @@ There is a custom Grafana dashboard example that visualizes some of this data in | gameserver_standing_by_duration | Gauge | nodeagent | | gameservers_current_state_per_build | Gauge | controller-manager | | gameservers_created_total | Counter | controller-manager | -| gameservers_reconcile_standby_duration | Gauge | controller-manager | | gameservers_sessionended_total | Counter | controller-manager | | gameservers_crashed_total | Counter | controller-manager | | gameservers_deleted_total | Counter | controller-manager | diff --git a/pkg/operator/controllers/gameserverbuild_controller.go b/pkg/operator/controllers/gameserverbuild_controller.go index 665de672..f976d330 100644 --- a/pkg/operator/controllers/gameserverbuild_controller.go +++ b/pkg/operator/controllers/gameserverbuild_controller.go @@ -23,7 +23,6 @@ import ( "runtime" "sort" "sync" - "time" mpsv1alpha1 "github.com/playfab/thundernetes/pkg/operator/api/v1alpha1" corev1 "k8s.io/api/core/v1" @@ -207,15 +206,13 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ // we attempt to create the missing number of game servers, but we don't want to create more than the max // an error channel for the go routines to write errors errCh := make(chan error, maxNumberOfGameServersToAdd) - // Time how long it takes to trigger new standby gameservers - 
serverBatchCreateStartTime := time.Now() // a waitgroup for async create calls var wg sync.WaitGroup for i := 0; i < gsb.Spec.StandingBy-nonActiveGameServersCount && i+nonActiveGameServersCount+activeCount < gsb.Spec.Max && i < maxNumberOfGameServersToAdd; i++ { wg.Add(1) - go func(serverBatchCreateStartTime time.Time) { + go func() { defer wg.Done() newgs, err := NewGameServerForGameServerBuild(&gsb, r.PortRegistry) if err != nil { @@ -229,8 +226,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ r.expectations.addGameServerToUnderCreationMap(gsb.Name, newgs.Name) GameServersCreatedCounter.WithLabelValues(gsb.Name).Inc() r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Creating", "Creating GameServer %s", newgs.Name) - GameServerBatchCreationDuration.WithLabelValues(gsb.Name).Set(float64(time.Since(serverBatchCreateStartTime).Milliseconds())) - }(serverBatchCreateStartTime) + }() } wg.Wait() if len(errCh) > 0 { diff --git a/pkg/operator/controllers/metrics.go b/pkg/operator/controllers/metrics.go index 1eea549d..13f14aa7 100644 --- a/pkg/operator/controllers/metrics.go +++ b/pkg/operator/controllers/metrics.go @@ -24,14 +24,6 @@ var ( }, []string{"BuildName"}, ) - GameServerBatchCreationDuration = registry.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: "thundernetes", - Name: "gameservers_batch_creation_duration", - Help: "Time it took the controller to create a requested number of GameServer objects", - }, - []string{"BuildName"}, - ) GameServersSessionEndedCounter = registry.NewCounterVec( prometheus.CounterOpts{ Namespace: "thundernetes", diff --git a/samples/grafana/dashboard.json b/samples/grafana/dashboard.json index c0cdac65..7dec2d81 100644 --- a/samples/grafana/dashboard.json +++ b/samples/grafana/dashboard.json @@ -1673,7 +1673,8 @@ "y": 27 }, "id": 37, - "interval": "0.01", + "interval": "10", + "maxDataPoints": 596, "options": { "legend": { "calcs": [], @@ -1692,8 +1693,8 @@ "type": "prometheus", "uid": 
"P1809F7CD0C75ACF3" }, - "editorMode": "builder", - "expr": "avg_over_time(thundernetes_gameserver_standing_by_duration[5m])", + "editorMode": "code", + "expr": "", "hide": false, "legendFormat": "{{BuildName}}-{{instance}}", "range": true, @@ -1797,100 +1798,6 @@ ], "title": "Average Time Until GameServer Reaches Initialization", "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "description": "The time it has taken to create a batch of GameServer objects.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Time (milliseconds)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "opacity", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 3, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 35 - }, - "id": 38, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "P1809F7CD0C75ACF3" - }, - "editorMode": "builder", - "expr": "thundernetes_gameservers_batch_creation_duration", - "hide": false, - "legendFormat": "{{BuildName}}-{{instance}}", - "range": true, - "refId": "A" - } - ], - "title": "GameServer Object Batch Creation Time", - "type": "timeseries" } ], 
"refresh": false, @@ -1908,6 +1815,6 @@ "timezone": "", "title": "Thundernetes GameServer Demo", "uid": "T9KjuZOnz", - "version": 5, + "version": 2, "weekStart": "" } \ No newline at end of file