diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 8d4e5ad9e6..6ac888baf6 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -60,8 +60,10 @@ const ( maxPortFlag = "max-port" certFileFlag = "cert-file" keyFileFlag = "key-file" + numWorkersFlag = "num-workers" + apiServerSustainedQPSFlag = "api-server-qps" + apiServerBurstQPSFlag = "api-server-qps-burst" kubeconfigFlag = "kubeconfig" - workers = 2 defaultResync = 30 * time.Second ) @@ -85,6 +87,9 @@ func main() { logger.WithError(err).Fatal("Could not create in cluster config") } + clientConf.QPS = float32(ctlConf.APIServerSustainedQPS) + clientConf.Burst = ctlConf.APIServerBurstQPS + kubeClient, err := kubernetes.NewForConfig(clientConf) if err != nil { logger.WithError(err).Fatal("Could not create the kubernetes clientset") @@ -169,7 +174,7 @@ func main() { for _, r := range rs { go func(rr runner) { - if runErr := rr.Run(workers, stop); runErr != nil { + if runErr := rr.Run(ctlConf.NumWorkers, stop); runErr != nil { logger.WithError(runErr).Fatalf("could not start runner: %T", rr) } }(r) @@ -195,6 +200,9 @@ func parseEnvFlags() config { viper.SetDefault(enablePrometheusMetricsFlag, true) viper.SetDefault(enableStackdriverMetricsFlag, false) viper.SetDefault(projectIDFlag, "") + viper.SetDefault(numWorkersFlag, 64) + viper.SetDefault(apiServerSustainedQPSFlag, 100) + viper.SetDefault(apiServerBurstQPSFlag, 200) pflag.String(sidecarImageFlag, viper.GetString(sidecarImageFlag), "Flag to overwrite the GameServer sidecar image that is used. Can also use SIDECAR env variable") pflag.String(sidecarCPULimitFlag, viper.GetString(sidecarCPULimitFlag), "Flag to overwrite the GameServer sidecar container's cpu limit. Can also use SIDECAR_CPU_LIMIT env variable") @@ -208,6 +216,9 @@ func parseEnvFlags() config { pflag.Bool(enablePrometheusMetricsFlag, viper.GetBool(enablePrometheusMetricsFlag), "Flag to activate metrics of Agones. 
Can also use PROMETHEUS_EXPORTER env variable.") pflag.Bool(enableStackdriverMetricsFlag, viper.GetBool(enableStackdriverMetricsFlag), "Flag to activate stackdriver monitoring metrics for Agones. Can also use STACKDRIVER_EXPORTER env variable.") pflag.String(projectIDFlag, viper.GetString(projectIDFlag), "GCP ProjectID used for Stackdriver, if not specified ProjectID from Application Default Credentials would be used. Can also use GCP_PROJECT_ID env variable.") + pflag.Int32(numWorkersFlag, 64, "Number of controller workers per resource type") + pflag.Int32(apiServerSustainedQPSFlag, 100, "Maximum sustained queries per second to send to the API server") + pflag.Int32(apiServerBurstQPSFlag, 200, "Maximum burst queries per second to send to the API server") pflag.Parse() viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) @@ -224,6 +235,9 @@ func parseEnvFlags() config { runtime.Must(viper.BindEnv(enableStackdriverMetricsFlag)) runtime.Must(viper.BindEnv(projectIDFlag)) runtime.Must(viper.BindPFlags(pflag.CommandLine)) + runtime.Must(viper.BindEnv(numWorkersFlag)) + runtime.Must(viper.BindEnv(apiServerSustainedQPSFlag)) + runtime.Must(viper.BindEnv(apiServerBurstQPSFlag)) request, err := resource.ParseQuantity(viper.GetString(sidecarCPURequestFlag)) if err != nil { @@ -236,35 +250,41 @@ func parseEnvFlags() config { } return config{ - MinPort: int32(viper.GetInt64(minPortFlag)), - MaxPort: int32(viper.GetInt64(maxPortFlag)), - SidecarImage: viper.GetString(sidecarImageFlag), - SidecarCPURequest: request, - SidecarCPULimit: limit, - AlwaysPullSidecar: viper.GetBool(pullSidecarFlag), - KeyFile: viper.GetString(keyFileFlag), - CertFile: viper.GetString(certFileFlag), - KubeConfig: viper.GetString(kubeconfigFlag), - PrometheusMetrics: viper.GetBool(enablePrometheusMetricsFlag), - Stackdriver: viper.GetBool(enableStackdriverMetricsFlag), - GCPProjectID: viper.GetString(projectIDFlag), + MinPort: int32(viper.GetInt64(minPortFlag)), + MaxPort: 
int32(viper.GetInt64(maxPortFlag)), + SidecarImage: viper.GetString(sidecarImageFlag), + SidecarCPURequest: request, + SidecarCPULimit: limit, + AlwaysPullSidecar: viper.GetBool(pullSidecarFlag), + KeyFile: viper.GetString(keyFileFlag), + CertFile: viper.GetString(certFileFlag), + KubeConfig: viper.GetString(kubeconfigFlag), + PrometheusMetrics: viper.GetBool(enablePrometheusMetricsFlag), + Stackdriver: viper.GetBool(enableStackdriverMetricsFlag), + GCPProjectID: viper.GetString(projectIDFlag), + NumWorkers: int(viper.GetInt32(numWorkersFlag)), + APIServerSustainedQPS: int(viper.GetInt32(apiServerSustainedQPSFlag)), + APIServerBurstQPS: int(viper.GetInt32(apiServerBurstQPSFlag)), } } // config stores all required configuration to create a game server controller. type config struct { - MinPort int32 - MaxPort int32 - SidecarImage string - SidecarCPURequest resource.Quantity - SidecarCPULimit resource.Quantity - AlwaysPullSidecar bool - PrometheusMetrics bool - Stackdriver bool - KeyFile string - CertFile string - KubeConfig string - GCPProjectID string + MinPort int32 + MaxPort int32 + SidecarImage string + SidecarCPURequest resource.Quantity + SidecarCPULimit resource.Quantity + AlwaysPullSidecar bool + PrometheusMetrics bool + Stackdriver bool + KeyFile string + CertFile string + KubeConfig string + GCPProjectID string + NumWorkers int + APIServerSustainedQPS int + APIServerBurstQPS int } // validate ensures the ctlConfig data is valid. 
diff --git a/install/helm/agones/templates/controller.yaml b/install/helm/agones/templates/controller.yaml index 811acb0b0a..3566c6363a 100644 --- a/install/helm/agones/templates/controller.yaml +++ b/install/helm/agones/templates/controller.yaml @@ -90,6 +90,12 @@ spec: value: {{ .Values.agones.metrics.stackdriverProjectID | quote }} - name: SIDECAR_CPU_LIMIT value: {{ .Values.agones.image.sdk.cpuLimit | quote }} + - name: NUM_WORKERS + value: {{ .Values.agones.controller.numWorkers | quote }} + - name: API_SERVER_QPS + value: {{ .Values.agones.controller.apiServerQPS | quote }} + - name: API_SERVER_QPS_BURST + value: {{ .Values.agones.controller.apiServerQPSBurst | quote }} livenessProbe: httpGet: path: /live diff --git a/install/helm/agones/values.yaml b/install/helm/agones/values.yaml index 424e840a9c..a14ba9046f 100644 --- a/install/helm/agones/values.yaml +++ b/install/helm/agones/values.yaml @@ -46,6 +46,9 @@ agones: operator: Exists generateTLS: true safeToEvict: false + numWorkers: 64 + apiServerQPS: 100 + apiServerQPSBurst: 200 http: port: 8080 healthCheck: diff --git a/install/yaml/install.yaml b/install/yaml/install.yaml index b8c9a9e52e..c9213a5dd4 100644 --- a/install/yaml/install.yaml +++ b/install/yaml/install.yaml @@ -1055,6 +1055,12 @@ spec: value: "" - name: SIDECAR_CPU_LIMIT value: "0" + - name: NUM_WORKERS + value: "64" + - name: API_SERVER_QPS + value: "100" + - name: API_SERVER_QPS_BURST + value: "200" livenessProbe: httpGet: path: /live diff --git a/site/content/en/docs/Installation/helm.md b/site/content/en/docs/Installation/helm.md index 9737a433c8..bac332c922 100644 --- a/site/content/en/docs/Installation/helm.md +++ b/site/content/en/docs/Installation/helm.md @@ -130,13 +130,13 @@ The following tables lists the configurable parameters of the Agones chart and t | `agones.controller.resources` | Controller resource requests/limit | `{}` | | `agones.controller.generateTLS` | Set to true to generate TLS certificates or false to provide your 
own certificates in `certs/*` | `true` | | `agones.ping.install` | Whether to install the [ping service][ping] | `true` | -| `agones.ping.replicas` | The number of replicas to run in the deployment | `2` | -| `agones.ping.http.expose` | Expose the http ping service via a Service | `true` | -| `agones.ping.http.response` | The string response returned from the http service | `ok` | +| `agones.ping.replicas` | The number of replicas to run in the deployment | `2` | +| `agones.ping.http.expose` | Expose the http ping service via a Service | `true` | +| `agones.ping.http.response` | The string response returned from the http service | `ok` | | `agones.ping.http.port` | The port to expose on the service | `80` | | `agones.ping.http.serviceType` | The [Service Type][service] of the HTTP Service | `LoadBalancer` | -| `agones.ping.udp.expose` | Expose the udp ping service via a Service | `true` | -| `agones.ping.udp.rateLimit` | Number of UDP packets the ping service handles per instance, per second, per sender | `20` | +| `agones.ping.udp.expose` | Expose the udp ping service via a Service | `true` | +| `agones.ping.udp.rateLimit` | Number of UDP packets the ping service handles per instance, per second, per sender | `20` | | `agones.ping.udp.port` | The port to expose on the service | `80` | | `agones.ping.udp.serviceType` | The [Service Type][service] of the UDP Service | `LoadBalancer` | | `agones.ping.healthCheck.initialDelaySeconds` | Initial delay before performing the first probe (in seconds) | `3` | @@ -162,6 +162,10 @@ The following tables lists the configurable parameters of the Agones chart and t | `agones.ping.nodeSelector` | Ping [node labels](nodeSelector) for pod assignment | `{}` | | `agones.ping.tolerations` | Ping [toleration][toleration] labels for pod assignment | `[]` | | `agones.ping.affinity` | Ping [affinity](affinity) settings for pod assignment | `{}` | +| `agones.controller.numWorkers` | Number of workers to spin up per resource type | `64` | +| 
`agones.controller.apiServerQPS` | Maximum sustained queries per second that the controller should be making against the API server | `100` | +| `agones.controller.apiServerQPSBurst` | Maximum burst queries per second that the controller should be making against the API server | `200` | + [toleration]: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ {{% /feature %}} diff --git a/test/e2e/fleet_test.go b/test/e2e/fleet_test.go index 8763b5de11..214d5757fd 100644 --- a/test/e2e/fleet_test.go +++ b/test/e2e/fleet_test.go @@ -26,6 +26,7 @@ import ( "github.com/stretchr/testify/assert" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" @@ -445,7 +446,7 @@ func TestFleetAllocationDuringGameServerDeletion(t *testing.T) { fltCopy := flt.DeepCopy() fltCopy.Spec.Template.ObjectMeta.Annotations[key] = green _, err = framework.AgonesClient.StableV1alpha1().Fleets(defaultNs).Update(fltCopy) - assert.Nil(t, err) + assertSuccessOrUpdateConflict(t, err) }) }) @@ -467,11 +468,18 @@ func TestFleetAllocationDuringGameServerDeletion(t *testing.T) { fltCopy := flt.DeepCopy() fltCopy.Spec.Template.ObjectMeta.Annotations[key] = green _, err = framework.AgonesClient.StableV1alpha1().Fleets(defaultNs).Update(fltCopy) - assert.Nil(t, err) + assertSuccessOrUpdateConflict(t, err) }) }) } +func assertSuccessOrUpdateConflict(t *testing.T, err error) { + if !k8serrors.IsConflict(err) { + // update conflicts are sometimes ok, we simply lost the race. + assert.Nil(t, err) + } +} + // TestGameServerAllocationDuringGameServerDeletion is built to specifically // test for race conditions of allocations when doing scale up/down, // rolling updates, etc. 
Failures may not happen ALL the time -- as that is the @@ -567,7 +575,7 @@ func TestGameServerAllocationDuringGameServerDeletion(t *testing.T) { fltCopy := flt.DeepCopy() fltCopy.Spec.Template.ObjectMeta.Annotations[key] = green _, err = framework.AgonesClient.StableV1alpha1().Fleets(defaultNs).Update(fltCopy) - assert.Nil(t, err) + assertSuccessOrUpdateConflict(t, err) }) }) @@ -589,7 +597,7 @@ func TestGameServerAllocationDuringGameServerDeletion(t *testing.T) { fltCopy := flt.DeepCopy() fltCopy.Spec.Template.ObjectMeta.Annotations[key] = green _, err = framework.AgonesClient.StableV1alpha1().Fleets(defaultNs).Update(fltCopy) - assert.Nil(t, err) + assertSuccessOrUpdateConflict(t, err) }) }) }