From 8e6bad0f0fe2384745027fce129af6b76f99ce38 Mon Sep 17 00:00:00 2001 From: Brant Knudson Date: Wed, 24 Feb 2021 16:22:40 -0600 Subject: [PATCH] CASMPET-3836: Increase timeouts for OPA, remove "/apis/rm", FQDN hostname I was seeing lots of 503 UAEX failures on a system where the time for the request was ~5s. We'd seen this before but I'd increased the timeout in the envoyfilter to 10s. That change happened before I had to refactor JWT cert handling to move it from istio into OPA. Google turned up that the http.send function has a default 5s timeout[0]. I tried increasing the timeout on the system where the failures were happening and after this the 503 errors went away. [0] https://github.com/open-policy-agent/opa/issues/2099 This change adds a variable to set the http timeout and defaults it to 10s. I also increased the default envoyfilter timeout to 25s since there are potentially 2 calls to http.send in the OPA rules. In addition I removed the /apis/rm paths from the auth checks because this service was actually removed (CASMSEC-307). Also, added the . to the DNS names in the JWKS query URLs. This prevents the resolver from going through the DNS search path so there are fewer DNS queries. 
CASMPET-3838 : SCALE: CSM 0.8.11: cray-opa pods being OOM killed at 2GiB (cherry picked from commit 8d71d3da16e3e6c45932a549cd91418031fef0c7) --- stable/cray-opa/Chart.yaml | 2 +- stable/cray-opa/templates/_policy.tpl | 3 --- stable/cray-opa/templates/deployment.yaml | 4 ++++ stable/cray-opa/templates/envoyfilter.yaml | 2 +- stable/cray-opa/values.yaml | 13 ++++++++----- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/stable/cray-opa/Chart.yaml b/stable/cray-opa/Chart.yaml index 4f54c6c..ec308f0 100644 --- a/stable/cray-opa/Chart.yaml +++ b/stable/cray-opa/Chart.yaml @@ -3,4 +3,4 @@ appVersion: 0.24.0 name: cray-opa description: Cray Open Policy Agent home: "cloud/cray-charts" -version: 0.15.1 +version: 0.15.2 diff --git a/stable/cray-opa/templates/_policy.tpl b/stable/cray-opa/templates/_policy.tpl index b81d20f..f08efe2 100644 --- a/stable/cray-opa/templates/_policy.tpl +++ b/stable/cray-opa/templates/_policy.tpl @@ -170,9 +170,6 @@ allowed_methods := { {"method": "DELETE", "path": `^/apis/pals/v1/.*$`}, # All PALs API Calls - DELETE {"method": "HEAD", "path": `^/apis/pals/v1/.*$`}, # All PALs API Calls - HEAD {"method": "PATCH", "path": `^/apis/pals/v1/.*$`}, # All PALs API Calls - PATCH - # Replicant - {"method": "GET", "path": `^/apis/rm/v1/report/[\d\w|-]+$`}, # Get Report by id - {"method": "GET", "path": `^/apis/rm/v1/reports$`}, # Get Reports # Analytics Capsules {"method": "DELETE", "path": `^/apis/capsules/.*$`}, # All Capsules API Calls - DELETE {"method": "GET", "path": `^/apis/capsules/.*$`}, # All Capsules API Calls - GET diff --git a/stable/cray-opa/templates/deployment.yaml b/stable/cray-opa/templates/deployment.yaml index e9d07f9..5c9fb3d 100644 --- a/stable/cray-opa/templates/deployment.yaml +++ b/stable/cray-opa/templates/deployment.yaml @@ -28,6 +28,10 @@ spec: - name: POLICY_CONFIGMAP_VERSION # Change to force opa pods to restart and re-read ConfigMap. 
value: "2" + {{- if .Values.opa.httpTimeout }} + - name: HTTP_SEND_TIMEOUT + value: {{ .Values.opa.httpTimeout | quote }} + {{- end }} args: - run - --server diff --git a/stable/cray-opa/templates/envoyfilter.yaml b/stable/cray-opa/templates/envoyfilter.yaml index 46d8266..ab625fd 100644 --- a/stable/cray-opa/templates/envoyfilter.yaml +++ b/stable/cray-opa/templates/envoyfilter.yaml @@ -41,7 +41,7 @@ spec: code: ServiceUnavailable grpc_service: google_grpc: - target_uri: {{ include "cray-opa.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.opa.port }} + target_uri: {{ include "cray-opa.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local.:{{ .Values.opa.port }} stat_prefix: "ext_authz" {{- if .Values.opa.timeout }} timeout: {{ .Values.opa.timeout }} diff --git a/stable/cray-opa/values.yaml b/stable/cray-opa/values.yaml index 32c404b..bedd91a 100644 --- a/stable/cray-opa/values.yaml +++ b/stable/cray-opa/values.yaml @@ -30,13 +30,16 @@ opa: memory: "128Mi" cpu: "250m" limits: - memory: "2Gi" + memory: "5Gi" cpu: "2" - # Timeout defaults to 200ms if not specified. Setting it to 10s, an + # Timeout defaults to 200ms if not specified. Setting it to 25s, an # arbitrary long timeout, provides sufficient overhead to resolve # CASMPET-1804/2570 "deadline exceeded" gRPC errors for the ext_authz filter. # A newer version of OPA might fix this with better performance. - timeout: 10s + timeout: 25s + # http.send requests default to 5s timeout. This was failing on a system so + # increase this to 10s. 
+ httpTimeout: 10s # To overide the default policy in files/policy.rego follow the example below @@ -65,7 +68,7 @@ affinity: jwtValidation: keycloak: - jwksUri: "https://istio-ingressgateway.istio-system.svc.cluster.local/keycloak/realms/shasta/protocol/openid-connect/certs" + jwksUri: "https://istio-ingressgateway.istio-system.svc.cluster.local./keycloak/realms/shasta/protocol/openid-connect/certs" issuers: # These are expected to be overriden via values.yaml customization. # XXX `shasta` is work-around until the CLI can support a separate @@ -79,6 +82,6 @@ jwtValidation: mgmt_http: "http://mgmt-plane-cmn.local/keycloak/realms/shasta" mgmt_https: "https://mgmt-plane-cmn.local/keycloak/realms/shasta" spire: - jwksUri: "https://istio-ingressgateway.istio-system.svc.cluster.local/spire-jwks-vshastaio/keys" + jwksUri: "https://istio-ingressgateway.istio-system.svc.cluster.local./spire-jwks-vshastaio/keys" issuers: vshasta.io: "http://spire.local/shasta/vshastaio"