Skip to content

Commit

Permalink
Adds OpenCensus metrics integration.
Browse files Browse the repository at this point in the history
- First set of metrics added using informers cache.
- Added flag for switching off metrics via helm and env vars.
- Added a documentation section about metrics and usage.
  • Loading branch information
Cyril TOVENA committed Dec 28, 2018
1 parent b492850 commit 46242fb
Show file tree
Hide file tree
Showing 317 changed files with 38,581 additions and 2,700 deletions.
199 changes: 60 additions & 139 deletions Gopkg.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,11 @@
[[constraint]]
branch = "master"
name = "github.com/joonix/log"

[[constraint]]
name = "go.opencensus.io"
version = "0.18.0"

[[override]]
name = "github.com/prometheus/client_golang"
version = "0.9.2"
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Documentation and usage guides on how to develop and host dedicated game servers
- [Game Server Specification](./docs/gameserver_spec.md)
- [Fleet Specification](./docs/fleet_spec.md)
- [Fleet Autoscaler Specification](./docs/fleetautoscaler_spec.md)
- [Metrics](./docs/metrics.md)

### Examples
- [Full GameServer Configuration](./examples/gameserver.yaml)
Expand Down
5 changes: 3 additions & 2 deletions build/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ push-build-image:

# port forward the agones controller.
# useful for pprof and stats viewing, etc
controller-portforward: PORT ?= 6060
controller-portforward: PORT ?= 8080
controller-portforward:
docker run --rm -it $(common_mounts) $(DOCKER_RUN_ARGS) \
-e "KUBECONFIG=/root/.kube/$(kubeconfig_file)" -p $(PORT):$(PORT) $(build_tag) \
Expand Down Expand Up @@ -395,6 +395,7 @@ do-release:
git push -u upstream release-$(RELEASE_VERSION)
@echo "Now go make the $(RELEASE_VERSION) release on Github!"

setup-test-cluster: DOCKER_RUN_ARGS+=--network=host
setup-test-cluster: $(ensure-build-image)
$(DOCKER_RUN) kubectl apply -f $(mount_path)/build/helm.yaml
$(DOCKER_RUN) helm init --wait --service-account helm
Expand Down Expand Up @@ -570,7 +571,7 @@ kind-delete-cluster:
kind delete cluster --name $(KIND_PROFILE)

# start an interactive shell with kubectl configured to target the kind cluster
kind-shell: $(ensure-build-image)
kind-shell: $(ensure-build-image)
$(MAKE) shell KUBECONFIG="$(shell kind get kubeconfig-path --name="$(KIND_PROFILE)")" \
DOCKER_RUN_ARGS="--network=host $(DOCKER_RUN_ARGS)"

Expand Down
21 changes: 20 additions & 1 deletion build/helm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,23 @@ roleRef:
subjects:
- kind: ServiceAccount
name: helm
namespace: kube-system
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
creationTimestamp: null
name: cluster-admin
annotations:
rbac.authorization.kubernetes.io/autoupdate: "true"
rules:
- apiGroups:
- '*'
resources:
- '*'
verbs:
- '*'
- nonResourceURLs:
- '*'
verbs:
- '*'
61 changes: 49 additions & 12 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,24 @@ import (
"agones.dev/agones/pkg/fleets"
"agones.dev/agones/pkg/gameservers"
"agones.dev/agones/pkg/gameserversets"
"agones.dev/agones/pkg/metrics"
"agones.dev/agones/pkg/util/runtime"
"agones.dev/agones/pkg/util/signals"
"agones.dev/agones/pkg/util/webhooks"
"github.com/heptiolabs/healthcheck"
"github.com/pkg/errors"
prom "github.com/prometheus/client_golang/prometheus"
"github.com/spf13/pflag"
"github.com/spf13/viper"
extclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
)

const (
enableMetricsFlag = "metrics"
sidecarImageFlag = "sidecar-image"
sidecarCPURequestFlag = "sidecar-cpu-request"
sidecarCPULimitFlag = "sidecar-cpu-limit"
Expand All @@ -55,6 +58,7 @@ const (
maxPortFlag = "max-port"
certFileFlag = "cert-file"
keyFileFlag = "key-file"
kubeconfigFlag = "kubeconfig"
workers = 2
defaultResync = 30 * time.Second
)
Expand All @@ -73,7 +77,8 @@ func main() {
logger.WithError(err).Fatal("Could not create controller from environment or flags")
}

clientConf, err := rest.InClusterConfig()
// if the kubeconfig path is empty, BuildConfigFromFlags falls back to the in-cluster config
clientConf, err := clientcmd.BuildConfigFromFlags("", ctlConf.KubeConfig)
if err != nil {
logger.WithError(err).Fatal("Could not create in cluster config")
}
Expand All @@ -93,11 +98,30 @@ func main() {
logger.WithError(err).Fatal("Could not create the agones api clientset")
}

health := healthcheck.NewHandler()
wh := webhooks.NewWebHook(ctlConf.CertFile, ctlConf.KeyFile)
agonesInformerFactory := externalversions.NewSharedInformerFactory(agonesClient, defaultResync)
kubeInformationFactory := informers.NewSharedInformerFactory(kubeClient, defaultResync)

server := &httpServer{}
var health healthcheck.Handler
var metricsController *metrics.Controller

if ctlConf.Metrics {
registry := prom.NewRegistry()
metricHandler, err := metrics.RegisterPrometheusExporter(registry)
if err != nil {
logger.WithError(err).Fatal("Could not create register prometheus exporter")
}
server.Handle("/metrics", metricHandler)
health = healthcheck.NewMetricsHandler(registry, "agones")
metricsController = metrics.NewController(kubeClient, agonesClient, agonesInformerFactory)

} else {
health = healthcheck.NewHandler()
}

server.Handle("/", health)

allocationMutex := &sync.Mutex{}

gsController := gameservers.NewController(wh, health, allocationMutex,
Expand All @@ -112,15 +136,19 @@ func main() {
fasController := fleetautoscalers.NewController(wh, health,
kubeClient, extClient, agonesClient, agonesInformerFactory)

rs := []runner{
wh, gsController, gsSetController, fleetController, faController, fasController, metricsController, server,
}

stop := signals.NewStopChannel()

kubeInformationFactory.Start(stop)
agonesInformerFactory.Start(stop)

rs := []runner{
wh, gsController, gsSetController, fleetController, faController, fasController, healthServer{handler: health},
}
for _, r := range rs {
if r == nil {
continue
}
go func(rr runner) {
if runErr := rr.Run(workers, stop); runErr != nil {
logger.WithError(runErr).Fatalf("could not start runner: %s", reflect.TypeOf(rr))
Expand All @@ -145,6 +173,7 @@ func parseEnvFlags() config {
viper.SetDefault(pullSidecarFlag, false)
viper.SetDefault(certFileFlag, filepath.Join(base, "certs/server.crt"))
viper.SetDefault(keyFileFlag, filepath.Join(base, "certs/server.key"))
viper.SetDefault(enableMetricsFlag, true)

pflag.String(sidecarImageFlag, viper.GetString(sidecarImageFlag), "Flag to overwrite the GameServer sidecar image that is used. Can also use SIDECAR env variable")
pflag.String(sidecarCPULimitFlag, viper.GetString(sidecarCPULimitFlag), "Flag to overwrite the GameServer sidecar container's cpu limit. Can also use SIDECAR_CPU_LIMIT env variable")
Expand All @@ -154,6 +183,8 @@ func parseEnvFlags() config {
pflag.Int32(maxPortFlag, 0, "Required. The maximum port that that a GameServer can be allocated to. Can also use MAX_PORT env variable")
pflag.String(keyFileFlag, viper.GetString(keyFileFlag), "Optional. Path to the key file")
pflag.String(certFileFlag, viper.GetString(certFileFlag), "Optional. Path to the crt file")
pflag.String(kubeconfigFlag, viper.GetString(kubeconfigFlag), "Optional. kubeconfig to run the controller out of the cluster. Only use it for debugging as webhook won't works.")
pflag.Bool(enableMetricsFlag, viper.GetBool(enableMetricsFlag), "Flag to activate metrics of Agones. Can also use METRICS env variable.")
pflag.Parse()

viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
Expand All @@ -165,6 +196,8 @@ func parseEnvFlags() config {
runtime.Must(viper.BindEnv(maxPortFlag))
runtime.Must(viper.BindEnv(keyFileFlag))
runtime.Must(viper.BindEnv(certFileFlag))
runtime.Must(viper.BindEnv(kubeconfigFlag))
runtime.Must(viper.BindEnv(enableMetricsFlag))
runtime.Must(viper.BindPFlags(pflag.CommandLine))

request, err := resource.ParseQuantity(viper.GetString(sidecarCPURequestFlag))
Expand All @@ -186,6 +219,8 @@ func parseEnvFlags() config {
AlwaysPullSidecar: viper.GetBool(pullSidecarFlag),
KeyFile: viper.GetString(keyFileFlag),
CertFile: viper.GetString(certFileFlag),
KubeConfig: viper.GetString(kubeconfigFlag),
Metrics: viper.GetBool(enableMetricsFlag),
}
}

Expand All @@ -197,8 +232,10 @@ type config struct {
SidecarCPURequest resource.Quantity
SidecarCPULimit resource.Quantity
AlwaysPullSidecar bool
Metrics bool
KeyFile string
CertFile string
KubeConfig string
}

// validate ensures the ctlConfig data is valid.
Expand All @@ -216,21 +253,21 @@ type runner interface {
Run(workers int, stop <-chan struct{}) error
}

type healthServer struct {
handler http.Handler
type httpServer struct {
http.ServeMux
}

func (h healthServer) Run(workers int, stop <-chan struct{}) error {
logger.Info("Starting health check...")
func (h *httpServer) Run(workers int, stop <-chan struct{}) error {
logger.Info("Starting http server...")
srv := &http.Server{
Addr: ":8080",
Handler: h.handler,
Handler: h,
}
defer srv.Close() // nolint: errcheck

if err := srv.ListenAndServe(); err != nil {
if err == http.ErrServerClosed {
logger.WithError(err).Info("health check: http server closed")
logger.WithError(err).Info("http server closed")
} else {
wrappedErr := errors.Wrap(err, "Could not listen on :8080")
runtime.HandleError(logger.WithError(wrappedErr), wrappedErr)
Expand Down
78 changes: 78 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Metrics

Agones controller exposes metrics via [OpenCensus](https://opencensus.io/). OpenCensus is a single distribution of libraries that collect metrics and distributed traces from your services. We only use it for metrics, but it will allow us to support multiple exporters in the future.

We chose to start with Prometheus, as it is the most popular option with Kubernetes, and it is also compatible with Stackdriver.
If you need another exporter, check the [list of supported](https://opencensus.io/exporters/supported-exporters/go/) exporters. It should be pretty straightforward to register a new one. (GitHub PRs are more than welcome.)

We plan to support multiple exporters in the future via environment variables and helm flags.

## Backend integrations

### Prometheus

If you are running a [Prometheus](https://prometheus.io/) instance you just need to ensure that metrics and Kubernetes service discovery are enabled. (helm chart values `agones.metrics.enabled` and `agones.metrics.prometheusServiceDiscovery`). This will automatically add annotations required by Prometheus to discover Agones metrics and start collecting them. (see [example](https://github.com/prometheus/prometheus/tree/master/documentation/examples/kubernetes-rabbitmq))

### Prometheus Operator

If you have [Prometheus operator](https://github.com/coreos/prometheus-operator) installed in your cluster, make sure to add a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator/blob/v0.17.0/Documentation/api.md#servicemonitorspec) to discover Agones metrics as shown below:

```yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: agones
labels:
app: agones
spec:
selector:
matchLabels:
stable.agones.dev/role: controller
endpoints:
- port: web
```
Finally, include that `ServiceMonitor` in your [Prometheus instance CRD](https://github.com/coreos/prometheus-operator/blob/v0.17.0/Documentation/user-guides/getting-started.md#include-servicemonitors). This is usually done by adding a label to the `ServiceMonitor` above that is matched by the Prometheus instance of your choice.

### Stackdriver

We don't yet support the [OpenCensus Stackdriver exporter](https://opencensus.io/exporters/supported-exporters/go/stackdriver/) but you can still use the Prometheus Stackdriver integration by following these [instructions](https://cloud.google.com/monitoring/kubernetes-engine/prometheus).
Annotations required by this integration can be activated by setting `agones.metrics.prometheusServiceDiscovery` to true (default) via the [helm chart value](../install/helm/README.md#configuration).

## Metrics available

| Name | Description | Type |
|-------------------------------------------------|---------------------------------------------------------------------|---------|
| agones_gameservers_count | The number of gameservers per fleet and status | gauge |
| agones_fleet_allocations_count | The number of fleet allocations per fleet | gauge |
| agones_gameservers_total | The total of gameservers per fleet and status | counter |
| agones_fleet_allocations_total | The total of fleet allocations per fleet | counter |
| agones_fleets_replicas_count | The number of replicas per fleet (total, desired, ready, allocated) | gauge |
| agones_fleet_autoscalers_able_to_scale | The fleet autoscaler can access the fleet to scale | gauge |
| agones_fleet_autoscalers_buffer_limits | The limits of buffer based fleet autoscalers (min, max) | gauge |
| agones_fleet_autoscalers_buffer_size | The buffer size of fleet autoscalers (count or percentage) | gauge |
| agones_fleet_autoscalers_current_replicas_count | The current replicas count as seen by autoscalers | gauge |
| agones_fleet_autoscalers_desired_replicas_count | The desired replicas count as seen by autoscalers | gauge |
| agones_fleet_autoscalers_limited | The fleet autoscaler is capped (1) | gauge |

## Dashboard

Grafana and Stackdriver - Coming Soon

## Adding more metrics

If you want to contribute and add more metrics, we recommend using shared informers (cache), as is currently done in the [metrics controller](../pkg/metrics/controller.go). Using shared informers keeps metrics code in one place and doesn't overload the Kubernetes API.

However, there are some cases where you will have to add code inside your resource controller (e.g. latency metrics). In those cases, you should minimize metrics code in your controller by adding specific functions to the metrics package, as shown below.

```golang
package metrics
import "go.opencensus.io/stats"
...
func RecordSomeLatency(latency int64,ressourceName string) {
stats.RecordWithTags(....)
}
```
4 changes: 3 additions & 1 deletion install/helm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ The following tables lists the configurable parameters of the Agones chart and t
| `agones.rbacEnabled` | Creates RBAC resources. Must be set for any cluster configured with RBAC | `true` |
| `agones.crds.install` | Install the CRDs with this chart. Useful to disable if you want to subchart (since crd-install hook is broken), so you can copy the CRDs into your own chart. | `true` |
| `agones.crds.cleanupOnDelete` | Run the pre-delete hook to delete all GameServers and their backing Pods when deleting the helm chart, so that all CRDs can be removed on chart deletion | `true` |
| `agones.metrics.enabled` | Enables controller metrics on port `8080` and path `/metrics` | `true` |
| `agones.metrics.prometheusServiceDiscovery` | Adds annotations for Prometheus ServiceDiscovery (and also Stackdriver) | `true` |
| `agones.serviceaccount.controller` | Service account name for the controller | `agones-controller` |
| `agones.serviceaccount.sdk` | Service account name for the sdk | `agones-sdk` |
| `agones.image.registry` | Global image registry for all images | `gcr.io/agones-images` |
Expand All @@ -99,7 +101,7 @@ The following tables lists the configurable parameters of the Agones chart and t
| `agones.image.sdk.alwaysPull` | Tells if the sdk image should always be pulled | `false` |
| `agones.image.ping.name` | ( ⚠️ development feature ⚠️ ) Image name for the ping service | `agones-ping` |
| `agones.image.ping.pullPolicy` | ( ⚠️ development feature ⚠️ ) Image pull policy for the ping service | `IfNotPresent` |
| `agones.controller.healthCheck.http.port` | Port to use for liveness probe service | `8080` |
| `agones.controller.http.port` | Port to use for liveness probe service and metrics | `8080` |
| `agones.controller.healthCheck.initialDelaySeconds` | Initial delay before performing the first probe (in seconds) | `3` |
| `agones.controller.healthCheck.periodSeconds` | Seconds between every liveness probe (in seconds) | `3` |
| `agones.controller.healthCheck.failureThreshold` | Number of consecutive probe failures before giving up | `3` |
Expand Down
9 changes: 8 additions & 1 deletion install/helm/agones/templates/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ spec:
cluster-autoscaler.kubernetes.io/safe-to-evict: {{ .Values.agones.controller.safeToEvict | quote }}
{{- if .Values.agones.controller.generateTLS }}
revision/tls-cert: {{ .Release.Revision | quote }}
{{- end }}
{{- if and (.Values.agones.metrics.prometheusServiceDiscovery) (.Values.agones.metrics.enabled) }}
prometheus.io/scrape: "true"
prometheus.io/port: {{ .Values.agones.controller.http.port | quote }}
prometheus.io/path: "/metrics"
{{- end }}
labels:
stable.agones.dev/role: controller
Expand All @@ -64,12 +69,14 @@ spec:
value: {{ .Values.agones.image.sdk.alwaysPull | quote }}
- name: SIDECAR_CPU_REQUEST
value: {{ .Values.agones.image.sdk.cpuRequest | quote }}
- name: METRICS
value: {{ .Values.agones.metrics.enabled | quote }}
- name: SIDECAR_CPU_LIMIT
value: {{ .Values.agones.image.sdk.cpuLimit | quote }}
livenessProbe:
httpGet:
path: /live
port: {{ .Values.agones.controller.healthCheck.http.port }}
port: {{ .Values.agones.controller.http.port }}
initialDelaySeconds: {{ .Values.agones.controller.healthCheck.initialDelaySeconds }}
periodSeconds: {{ .Values.agones.controller.healthCheck.periodSeconds }}
failureThreshold: {{ .Values.agones.controller.healthCheck.failureThreshold }}
Expand Down
9 changes: 6 additions & 3 deletions install/helm/agones/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ metadata:
name: agones-controller-service
namespace: {{ .Release.Namespace }}
labels:
component: controller
stable.agones.dev/role: controller
app: {{ template "agones.name" . }}
chart: {{ template "agones.chart" . }}
release: {{ .Release.Name }}
Expand All @@ -27,5 +27,8 @@ spec:
selector:
stable.agones.dev/role: controller
ports:
- port: 443
targetPort: 8081
- name: webhooks
port: 443
targetPort: 8081
- name: web
port: {{ .Values.agones.controller.http.port }}
Loading

0 comments on commit 46242fb

Please sign in to comment.