Skip to content

Commit

Permalink
fix(ktor): scalability for server (#559)
Browse files Browse the repository at this point in the history
* fix(ktor): scalability for server

* noticed slow processing in ktor when there is a high amount of load. The thread pool sizes (connectionGroupSize, workerGroupSize, and callGroupSize) are now dynamically calculated based on the available processors, ensuring optimal performance.

This setup will be 16, 32, 32 instead of 8, 8, 16 -> the old configuration assumed that application processing is more CPU-intensive than connection handling, which is not true according to our metrics.

checking the docs:
If /token processing is delayed, we can increase callGroupSize gradually to handle more concurrent requests.

consider:
install(IdleTimeout) {
    requestTimeoutMillis = 15000
    idleTimeoutMillis = 60000
}

to handle idle connections that are not being used.

*

* fix(server):  without setting limit for cpu, not overloading

we divide by 2

* fix(server): more robust handling of scale; ensure that the callGroupSize never exceeds the database maxConnectionPool,
which could otherwise cause connection starvation or exceed the database `max_connections` limit.

* set max_connections flag = 200
* increase the production pool size
* 10 pods * 20 connections = 200 connections (matches a database with max_connections = 200)

* fix(server): remove comment

* update(build): update deps

* update(metrics): max replicas is 12, so ensure the number of concurrent database connections matches replicas * pool_max
  • Loading branch information
ybelMekk authored Dec 12, 2024
1 parent 76c2261 commit 9715ea3
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 11 deletions.
9 changes: 8 additions & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ val kotestVersion = "5.9.1"
val kotlinLoggingVersion = "3.0.5"
val kotlinVersion = "2.0.21"
val kotliqueryVersion = "1.9.0"
val ktorVersion = "3.0.1"
val ktorVersion = "3.0.2"
val logbackVersion = "1.5.12"
val logstashLogbackEncoderVersion = "8.0"
val micrometerRegistryPrometheusVersion = "1.14.0"
Expand Down Expand Up @@ -54,6 +54,13 @@ repositories {
mavenCentral()
}


configurations.all {
resolutionStrategy {
force("org.apache.commons:commons-compress:1.26.0")
}
}

dependencies {
implementation(kotlin("stdlib"))
implementation("org.jetbrains.kotlin:kotlin-reflect:$kotlinVersion")
Expand Down
3 changes: 3 additions & 0 deletions charts/templates/tokendings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ spec:
databases:
- name: tokendings
envVarPrefix: DB
flags:
- name: max_connections
value: "300"
ingresses:
- "{{- include "tokenx.tokendings.URL" . }}"
{{- if .Values.tokendings.mapSubjectTokenClaims }}
Expand Down
15 changes: 7 additions & 8 deletions src/main/kotlin/io/nais/security/oauth2/TokenExchangeApp.kt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import io.micrometer.prometheus.PrometheusConfig
import io.micrometer.prometheus.PrometheusMeterRegistry
import io.nais.security.oauth2.authentication.clientRegistrationAuth
import io.nais.security.oauth2.config.AppConfiguration
import io.nais.security.oauth2.config.HikariProperties
import io.nais.security.oauth2.config.configByProfile
import io.nais.security.oauth2.config.isNonProd
import io.nais.security.oauth2.metrics.Metrics
Expand Down Expand Up @@ -81,15 +82,17 @@ fun main() {

fun server(): EmbeddedServer<NettyApplicationEngine, NettyApplicationEngine.Configuration> {
val config = configByProfile()
val processors = Runtime.getRuntime().availableProcessors()
val maxConnectionPool = if (isNonProd()) HikariProperties.MAX_POOL_SIZE_NON_PROD else HikariProperties.MAX_POOL_SIZE_PROD
return embeddedServer(
Netty,
configure = {
connector {
port = config.serverProperties.port
}
connectionGroupSize = 8
workerGroupSize = 8
callGroupSize = 16
connectionGroupSize = maxOf(1, processors / 2)
workerGroupSize = processors
callGroupSize = maxOf(1, minOf(processors * 2, maxConnectionPool))
},
module = {
tokenExchangeApp(config, DefaultRouting(config))
Expand Down Expand Up @@ -149,11 +152,7 @@ fun Application.tokenExchangeApp(config: AppConfiguration, routing: ApiRouting)
call.respondWithError(cause, includeErrorDetails)
}

is BadRequestException -> {
call.respond(HttpStatusCode.BadRequest, "invalid request content")
}

is JsonProcessingException -> {
is BadRequestException, is JsonProcessingException -> {
call.respond(HttpStatusCode.BadRequest, "invalid request content")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ object HikariProperties {
const val IDLE_TIMEOUT_PROD = 300000L
const val CONNECTION_TIMEOUT_PROD = 5000L
const val MAX_LIFETIME_PROD = 1800000L
const val MAX_POOL_SIZE_PROD = 10
const val MIN_IDLE_CONNECTIONS_PROD = 5
const val MAX_POOL_SIZE_PROD = 20
const val MIN_IDLE_CONNECTIONS_PROD = 10

// Non-production-specific
const val IDLE_TIMEOUT_NON_PROD = 600000L
Expand Down

0 comments on commit 9715ea3

Please sign in to comment.