Skip to content

Commit

Permalink
Fix bug: Disabled health checking not implemented
Browse files Browse the repository at this point in the history
This commit fixes actually two bugs:

1) While we documented that you could disabled health checking we never
actually implemented it! This fixes that. Thankfully, the work that has been
done to retrieve `GameServer` details through the SDK makes this relatively
easy.

2) SDK Server sidecar never had the necessary RBAC permissions to send events
to the `GameServer` CRD. This is now fixed as well.
  • Loading branch information
markmandel committed Aug 14, 2018
1 parent b92ad1d commit c0b5c72
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 118 deletions.
53 changes: 14 additions & 39 deletions cmd/sdk-server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"net"
"net/http"
"strings"
"time"

"agones.dev/agones/pkg"
"agones.dev/agones/pkg/client/clientset/versioned"
Expand All @@ -46,12 +45,8 @@ const (
podNamespaceEnv = "POD_NAMESPACE"

// Flags (that can also be env vars)
localFlag = "local"
addressFlag = "address"
healthDisabledFlag = "health-disabled"
healthTimeoutFlag = "health-timeout"
healthInitialDelayFlag = "health-initial-delay"
healthFailureThresholdFlag = "health-failure-threshold"
localFlag = "local"
addressFlag = "address"
)

var (
Expand Down Expand Up @@ -107,14 +102,18 @@ func main() {
}

var s *gameservers.SDKServer
s, err = gameservers.NewSDKServer(viper.GetString(gameServerNameEnv), viper.GetString(podNamespaceEnv),
ctlConf.HealthDisabled, ctlConf.HealthTimeout, ctlConf.HealthFailureThreshold,
ctlConf.HealthInitialDelay, kubeClient, agonesClient)
s, err = gameservers.NewSDKServer(viper.GetString(gameServerNameEnv),
viper.GetString(podNamespaceEnv), kubeClient, agonesClient)
if err != nil {
logger.WithError(err).Fatalf("Could not start sidecar")
}

go s.Run(ctx.Done())
go func() {
err := s.Run(ctx.Done())
if err != nil {
logger.WithError(err).Fatalf("Could not run sidecar")
}
}()
sdk.RegisterSDKServer(grpcServer, s)
}

Expand Down Expand Up @@ -159,49 +158,25 @@ func runGateway(ctx context.Context, grpcEndpoint string, mux *gwruntime.ServeMu
func parseEnvFlags() config {
viper.SetDefault(localFlag, false)
viper.SetDefault(addressFlag, "localhost")
viper.SetDefault(healthDisabledFlag, false)
viper.SetDefault(healthTimeoutFlag, 5)
viper.SetDefault(healthInitialDelayFlag, 5)
viper.SetDefault(healthFailureThresholdFlag, 3)
pflag.Bool(localFlag, viper.GetBool(localFlag),
"Set this, or LOCAL env, to 'true' to run this binary in local development mode. Defaults to 'false'")
pflag.String(addressFlag, viper.GetString(addressFlag), "The Address to bind the server grpcPort to. Defaults to 'localhost")
pflag.Bool(healthDisabledFlag, viper.GetBool(healthDisabledFlag),
"Set this, or HEALTH_ENABLED env, to 'true' to enable health checking on the GameServer. Defaults to 'true'")
pflag.Int64(healthTimeoutFlag, viper.GetInt64(healthTimeoutFlag),
"Set this or HEALTH_TIMEOUT env to the number of seconds that the health check times out at. Defaults to 5")
pflag.Int64(healthInitialDelayFlag, viper.GetInt64(healthInitialDelayFlag),
"Set this or HEALTH_INITIAL_DELAY env to the number of seconds that the health will wait before starting. Defaults to 5")
pflag.Int64(healthFailureThresholdFlag, viper.GetInt64(healthFailureThresholdFlag),
"Set this or HEALTH_FAILURE_THRESHOLD env to the number of times the health check needs to fail to be deemed unhealthy. Defaults to 3")
pflag.Parse()

viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
runtime.Must(viper.BindEnv(localFlag))
runtime.Must(viper.BindEnv(gameServerNameEnv))
runtime.Must(viper.BindEnv(podNamespaceEnv))
runtime.Must(viper.BindEnv(healthDisabledFlag))
runtime.Must(viper.BindEnv(healthTimeoutFlag))
runtime.Must(viper.BindEnv(healthInitialDelayFlag))
runtime.Must(viper.BindEnv(healthFailureThresholdFlag))
runtime.Must(viper.BindPFlags(pflag.CommandLine))

return config{
IsLocal: viper.GetBool(localFlag),
Address: viper.GetString(addressFlag),
HealthDisabled: viper.GetBool(healthDisabledFlag),
HealthTimeout: time.Duration(viper.GetInt64(healthTimeoutFlag)) * time.Second,
HealthInitialDelay: time.Duration(viper.GetInt64(healthInitialDelayFlag)) * time.Second,
HealthFailureThreshold: viper.GetInt64(healthFailureThresholdFlag),
IsLocal: viper.GetBool(localFlag),
Address: viper.GetString(addressFlag),
}
}

// config is all the configuration for this program
type config struct {
Address string
IsLocal bool
HealthDisabled bool
HealthTimeout time.Duration
HealthInitialDelay time.Duration
HealthFailureThreshold int64
Address string
IsLocal bool
}
3 changes: 3 additions & 0 deletions install/helm/agones/templates/serviceaccounts/sdk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ metadata:
release: {{ .Release.Name }}
heritage: {{ .Release.Service }}
rules:
- apiGroups: [""]
resources: ["events"]
verbs: ["create"]
- apiGroups: ["stable.agones.dev"]
resources: ["gameservers"]
verbs: ["list", "update", "watch"]
Expand Down
3 changes: 3 additions & 0 deletions install/yaml/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ metadata:
release: agones-manual
heritage: Tiller
rules:
- apiGroups: [""]
resources: ["events"]
verbs: ["create"]
- apiGroups: ["stable.agones.dev"]
resources: ["gameservers"]
verbs: ["list", "update", "watch"]
Expand Down
90 changes: 49 additions & 41 deletions pkg/gameservers/sdkserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,33 +50,30 @@ var _ sdk.SDKServer = &SDKServer{}
// SDKServer is a gRPC server, that is meant to be a sidecar
// for a GameServer that will update the game server status on SDK requests
type SDKServer struct {
logger *logrus.Entry
gameServerName string
namespace string
informerFactory externalversions.SharedInformerFactory
gameServerGetter typedv1alpha1.GameServersGetter
gameServerLister v1alpha1.GameServerLister
gameServerSynced cache.InformerSynced
server *http.Server
clock clock.Clock
healthDisabled bool
healthTimeout time.Duration
healthFailureThreshold int64
healthMutex sync.RWMutex
healthLastUpdated time.Time
healthFailureCount int64
workerqueue *workerqueue.WorkerQueue
streamMutex sync.RWMutex
connectedStreams []sdk.SDK_WatchGameServerServer
stop <-chan struct{}
recorder record.EventRecorder
logger *logrus.Entry
gameServerName string
namespace string
informerFactory externalversions.SharedInformerFactory
gameServerGetter typedv1alpha1.GameServersGetter
gameServerLister v1alpha1.GameServerLister
gameServerSynced cache.InformerSynced
server *http.Server
clock clock.Clock
health stablev1alpha1.Health
healthTimeout time.Duration
healthMutex sync.RWMutex
healthLastUpdated time.Time
healthFailureCount int32
workerqueue *workerqueue.WorkerQueue
streamMutex sync.RWMutex
connectedStreams []sdk.SDK_WatchGameServerServer
stop <-chan struct{}
recorder record.EventRecorder
}

// NewSDKServer creates a SDKServer that sets up an
// InClusterConfig for Kubernetes
func NewSDKServer(gameServerName, namespace string,
healthDisabled bool, healthTimeout time.Duration, healthFailureThreshold int64, healthInitialDelay time.Duration,
kubeClient kubernetes.Interface,
func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interface,
agonesClient versioned.Interface) (*SDKServer, error) {
mux := http.NewServeMux()

Expand All @@ -97,13 +94,10 @@ func NewSDKServer(gameServerName, namespace string,
Addr: ":8080",
Handler: mux,
},
clock: clock.RealClock{},
healthDisabled: healthDisabled,
healthFailureThreshold: healthFailureThreshold,
healthTimeout: healthTimeout,
healthMutex: sync.RWMutex{},
healthFailureCount: 0,
streamMutex: sync.RWMutex{},
clock: clock.RealClock{},
healthMutex: sync.RWMutex{},
healthFailureCount: 0,
streamMutex: sync.RWMutex{},
}

s.informerFactory = factory
Expand Down Expand Up @@ -140,7 +134,6 @@ func NewSDKServer(gameServerName, namespace string,
}
})

s.initHealthLastUpdated(healthInitialDelay)
s.workerqueue = workerqueue.NewWorkerQueue(
func(key string) error {
return s.updateState(stablev1alpha1.State(key))
Expand All @@ -161,7 +154,28 @@ func (s *SDKServer) initHealthLastUpdated(healthInitialDelay time.Duration) {

// Run processes the rate limited queue.
// Will block until stop is closed
func (s *SDKServer) Run(stop <-chan struct{}) {
func (s *SDKServer) Run(stop <-chan struct{}) error {
s.informerFactory.Start(stop)
cache.WaitForCacheSync(stop, s.gameServerSynced)

gs, err := s.gameServerLister.GameServers(s.namespace).Get(s.gameServerName)
if err != nil {
return errors.Wrapf(err, "error retrieving gameserver %s/%s", s.namespace, s.gameServerName)
}

// grab configuration details
s.health = gs.Spec.Health
s.logger.WithField("health", s.health).Info("setting health configuration")
s.healthTimeout = time.Duration(gs.Spec.Health.PeriodSeconds) * time.Second
s.initHealthLastUpdated(time.Duration(gs.Spec.Health.InitialDelaySeconds) * time.Second)

// start health checking running
if !s.health.Disabled {
s.logger.Info("Starting GameServer health checking")
go wait.Until(s.runHealth, s.healthTimeout, stop)
}

// then start the http endpoints
s.logger.Info("Starting SDKServer http health check...")
go func() {
if err := s.server.ListenAndServe(); err != nil {
Expand All @@ -175,16 +189,10 @@ func (s *SDKServer) Run(stop <-chan struct{}) {
}()
defer s.server.Close() // nolint: errcheck

if !s.healthDisabled {
s.logger.Info("Starting GameServer health checking")
go wait.Until(s.runHealth, s.healthTimeout, stop)
}

s.informerFactory.Start(stop)
cache.WaitForCacheSync(stop, s.gameServerSynced)
// need this for streaming gRPC commands
s.stop = stop
s.workerqueue.Run(1, stop)
return nil
}

// updateState sets the GameServer Status's state to the state
Expand Down Expand Up @@ -371,11 +379,11 @@ func (s *SDKServer) checkHealth() {
// currently healthy or not based on the configured
// failure count vs failure threshold
func (s *SDKServer) healthy() bool {
if s.healthDisabled {
if s.health.Disabled {
return true
}

s.healthMutex.RLock()
defer s.healthMutex.RUnlock()
return s.healthFailureCount < s.healthFailureThreshold
return s.healthFailureCount < s.health.FailureThreshold
}
Loading

0 comments on commit c0b5c72

Please sign in to comment.