Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add controller performance metrics #391

Merged
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9283e2f
Added GameServer control performance metrics;
dsmith111 Sep 15, 2022
466f2af
Merge branch 'main' of https://github.com/dsmith111/thundernetes into…
dsmith111 Sep 15, 2022
71fd931
Merge branch 'main' into smithdavi/add-controller-performance-metrics
dsmith111 Sep 15, 2022
f432e82
Update monitoring documentation;
dsmith111 Sep 15, 2022
baa71f4
Merge branch 'smithdavi/add-controller-performance-metrics' of https:…
dsmith111 Sep 15, 2022
41e68da
Update yaml
dsmith111 Sep 17, 2022
e9cd4fb
Fix capitalization
dsmith111 Sep 17, 2022
293efc1
Revert extra changes triggering installfile alert
dsmith111 Sep 17, 2022
dc28b13
Handle dereferencing;
dsmith111 Sep 18, 2022
c2a122e
Add pointers
dsmith111 Sep 19, 2022
38c7592
Decreasing time diff
dsmith111 Sep 19, 2022
dbd8b71
Merge branch 'main' into smithdavi/add-controller-performance-metrics
dsmith111 Sep 19, 2022
ecebe2b
PR Updates;
dsmith111 Sep 19, 2022
d8f0d88
Merge branch 'smithdavi/add-controller-performance-metrics' of github…
dsmith111 Sep 19, 2022
e41d915
Add patching exception
dsmith111 Sep 21, 2022
b7ed55e
Merge branch 'main' into smithdavi/add-controller-performance-metrics
dsmith111 Sep 21, 2022
0be080c
Change metric emission to nodeagent
dsmith111 Sep 25, 2022
20be9c0
Update dashboard
dsmith111 Sep 25, 2022
6e30235
Merge branch 'main' of github.com:dsmith111/thundernetes into smithda…
dsmith111 Sep 25, 2022
e0fc978
Revert test
dsmith111 Sep 25, 2022
a67fbb9
Cleanup deletes
dsmith111 Sep 25, 2022
0cd6e86
Minor tweaks
dsmith111 Sep 25, 2022
c4217f8
Conditional
dsmith111 Sep 25, 2022
77b1c88
PR Suggested changes
dsmith111 Sep 26, 2022
feab86c
Remove spacing added to gameserverbuild
dsmith111 Sep 26, 2022
d0dbb61
Remove empty line in nodeagent
dsmith111 Sep 26, 2022
b3e07d9
Renaming
dsmith111 Sep 26, 2022
fd2fbae
Update dashboard
dsmith111 Sep 26, 2022
9318c35
Remove metric
dsmith111 Sep 26, 2022
1861a24
Merge branch 'main' into smithdavi/add-controller-performance-metrics
dgkanatsios Sep 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,8 @@ installfilesdev

.uptodate

# vscode settings
.vscode

# allocator compiled plugin
kubectl-gameserver
5 changes: 5 additions & 0 deletions cmd/nodeagent/nodeagentmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ func (n *NodeAgentManager) gameServerCreatedOrUpdated(obj *unstructured.Unstruct
// or that the NodeAgent crashed and we're having a new instance
// in any case, we're adding the details to the map
logger.Infof("GameServer %s/%s does not exist in cache, we're creating it", gameServerNamespace, gameServerName)

gsdi = &GameServerInfo{
GameServerNamespace: gameServerNamespace,
Mutex: &sync.RWMutex{},
Expand Down Expand Up @@ -452,8 +453,12 @@ func (n *NodeAgentManager) updateHealthAndStateIfNeeded(ctx context.Context, hb
now := metav1.Time{Time: n.nowFunc()}
if hb.CurrentGameState == GameStateInitializing {
status.ReachedInitializingOn = &now
timeDif := time.Now().UnixMilli() - gsd.CreationTime
GameServerReachedInitializingDuration.WithLabelValues(gsd.BuildName).Set(float64(timeDif))
} else if hb.CurrentGameState == GameStateStandingBy {
status.ReachedStandingByOn = &now
timeDif := time.Now().UnixMilli() - gsd.CreationTime
GameServerReachedStandingByDuration.WithLabelValues(gsd.BuildName).Set(float64(timeDif))
}
}

Expand Down
18 changes: 18 additions & 0 deletions cmd/nodeagent/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,24 @@ var (
Name: "connected_players",
Help: "Number of connected players per GameServer",
}, []string{"namespace", "ServerName", "BuildName"})

// GameServerReachedStandingByDuration is a gauge vector in the "thundernetes"
// namespace, labelled by BuildName. In updateHealthAndStateIfNeeded, on the
// StandingBy branch, it is set to time.Now().UnixMilli() minus the GameServer's
// recorded CreationTime.
// NOTE(review): CreationTime is presumably Unix milliseconds as well, which
// would make the value a duration in ms — confirm, and consider stating the
// unit in Help. Being a gauge keyed only by build, each Set overwrites the
// previous GameServer's value for that build.
GameServerReachedStandingByDuration = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "thundernetes",
Name: "gameserver_standing_by_duration",
Help: "Time taken for a GameServer to reach StandingBy",
},
[]string{"BuildName"},
)

// GameServerReachedInitializingDuration is a gauge vector in the "thundernetes"
// namespace, labelled by BuildName. In updateHealthAndStateIfNeeded, on the
// Initializing branch, it is set to time.Now().UnixMilli() minus the
// GameServer's recorded CreationTime.
// NOTE(review): CreationTime is presumably Unix milliseconds as well, which
// would make the value a duration in ms — confirm, and consider stating the
// unit in Help. Being a gauge keyed only by build, each Set overwrites the
// previous GameServer's value for that build.
GameServerReachedInitializingDuration = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "thundernetes",
Name: "gameserver_initialization_duration",
Help: "Time taken for a GameServer to reach initialization",
},
[]string{"BuildName"},
)
)

// HeartbeatRequest contains data for the heartbeat request coming from the GSDK running alongside GameServer
Expand Down
3 changes: 3 additions & 0 deletions docs/howtos/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,11 @@ There is a custom Grafana dashboard example that visualizes some of this data in
| --- | --- | --- |
| gameserver_states | Gauge | nodeagent |
| connected_players | Gauge | nodeagent |
| gameserver_initialization_duration | Gauge | nodeagent |
| gameserver_standing_by_duration | Gauge | nodeagent |
| gameservers_current_state_per_build | Gauge | controller-manager |
| gameservers_created_total | Counter | controller-manager |
| gameservers_reconcile_standby_duration | Gauge | controller-manager |
| gameservers_sessionended_total | Counter | controller-manager |
| gameservers_crashed_total | Counter | controller-manager |
| gameservers_deleted_total | Counter | controller-manager |
Expand Down
29 changes: 19 additions & 10 deletions pkg/operator/controllers/gameserverbuild_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"runtime"
"sort"
"sync"
"time"

mpsv1alpha1 "github.com/playfab/thundernetes/pkg/operator/api/v1alpha1"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -140,6 +141,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ

// calculate counts by state so we can update .status accordingly
var activeCount, standingByCount, crashesCount, initializingCount, pendingCount int
dsmith111 marked this conversation as resolved.
Show resolved Hide resolved

for i := 0; i < len(gameServers.Items); i++ {
gs := gameServers.Items[i]

Expand All @@ -156,6 +158,7 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ
if err := r.Delete(ctx, &gs); err != nil {
return ctrl.Result{}, err
}
dsmith111 marked this conversation as resolved.
Show resolved Hide resolved

GameServersSessionEndedCounter.WithLabelValues(gsb.Name).Inc()
r.expectations.addGameServerToUnderDeletionMap(gsb.Name, gs.Name)
r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Exited", "GameServer %s session completed", gs.Name)
Expand Down Expand Up @@ -183,19 +186,19 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ
// calculate the total amount of servers not in the active state
nonActiveGameServersCount := standingByCount + initializingCount + pendingCount

// Evaluate desired number of servers against actual
var totalNumberOfGameServersToDelete int = 0

// user has decreased standingBy numbers
if nonActiveGameServersCount > gsb.Spec.StandingBy {
totalNumberOfGameServersToDelete := int(math.Min(float64(nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete))
err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete)
if err != nil {
return ctrl.Result{}, err
}
totalNumberOfGameServersToDelete += int(math.Min(float64(nonActiveGameServersCount-gsb.Spec.StandingBy), maxNumberOfGameServersToDelete))
}

// we need to check if we are above the max
// we also need to check if we are above the max
// this can happen if the user modifies the spec.Max during the GameServerBuild's lifetime
if nonActiveGameServersCount+activeCount > gsb.Spec.Max {
totalNumberOfGameServersToDelete := int(math.Min(float64(nonActiveGameServersCount+activeCount-gsb.Spec.Max), maxNumberOfGameServersToDelete))
totalNumberOfGameServersToDelete += int(math.Min(float64(totalNumberOfGameServersToDelete+(nonActiveGameServersCount+activeCount-gsb.Spec.Max)), maxNumberOfGameServersToDelete))
}
if totalNumberOfGameServersToDelete > 0 {
err := r.deleteNonActiveGameServers(ctx, &gsb, &gameServers, totalNumberOfGameServersToDelete)
if err != nil {
return ctrl.Result{}, err
Expand All @@ -207,13 +210,16 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ
// we attempt to create the missing number of game servers, but we don't want to create more than the max
// an error channel for the go routines to write errors
errCh := make(chan error, maxNumberOfGameServersToAdd)

// Time how long it takes to trigger new standby gameservers
standByReconcileStartTime := time.Now()
dsmith111 marked this conversation as resolved.
Show resolved Hide resolved
// a waitgroup for async create calls
var wg sync.WaitGroup
for i := 0; i < gsb.Spec.StandingBy-nonActiveGameServersCount &&
i+nonActiveGameServersCount+activeCount < gsb.Spec.Max &&
i < maxNumberOfGameServersToAdd; i++ {
wg.Add(1)
go func() {
go func(standByStartTime time.Time) {
defer wg.Done()
newgs, err := NewGameServerForGameServerBuild(&gsb, r.PortRegistry)
if err != nil {
Expand All @@ -227,9 +233,11 @@ func (r *GameServerBuildReconciler) Reconcile(ctx context.Context, req ctrl.Requ
r.expectations.addGameServerToUnderCreationMap(gsb.Name, newgs.Name)
GameServersCreatedCounter.WithLabelValues(gsb.Name).Inc()
r.Recorder.Eventf(&gsb, corev1.EventTypeNormal, "Creating", "Creating GameServer %s", newgs.Name)
}()
GameServersStandByReconcileDuration.WithLabelValues(gsb.Name).Set(float64(time.Since(standByStartTime).Milliseconds()))
}(standByReconcileStartTime)
}
wg.Wait()

if len(errCh) > 0 {
return ctrl.Result{}, <-errCh
}
Expand Down Expand Up @@ -325,6 +333,7 @@ func (r *GameServerBuildReconciler) deleteNonActiveGameServers(ctx context.Conte
// a waitgroup for async deletion calls
var wg sync.WaitGroup
deletionCalls := 0

// we sort the GameServers by state so that we can delete the ones that are empty state or Initializing before we delete the StandingBy ones (if needed)
// this is to make sure we don't fall below the desired number of StandingBy during scaling down
sort.Sort(ByState(gameServers.Items))
Expand Down
8 changes: 8 additions & 0 deletions pkg/operator/controllers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ var (
},
[]string{"BuildName"},
)
GameServersStandByReconcileDuration = registry.NewGaugeVec(
dsmith111 marked this conversation as resolved.
Show resolved Hide resolved
prometheus.GaugeOpts{
Namespace: "thundernetes",
Name: "gameservers_reconcile_standby_duration",
Help: "Time it took to begin initialization for all new GameServers",
},
[]string{"BuildName"},
)
GameServersSessionEndedCounter = registry.NewCounterVec(
prometheus.CounterOpts{
Namespace: "thundernetes",
Expand Down
Loading