Skip to content

Commit

Permalink
Helm chart updates related to Prometheus, Webhook HPA, and Flyteconso…
Browse files Browse the repository at this point in the history
…le probes (flyteorg#5508)
  • Loading branch information
mhotan authored Jun 25, 2024
1 parent e1d9c5c commit 3ee7120
Show file tree
Hide file tree
Showing 23 changed files with 217 additions and 9 deletions.
26 changes: 25 additions & 1 deletion charts/flyte-core/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ helm install gateway bitnami/contour -n flyte
| cloud_events.eventsPublisher.eventTypes[0] | string | `"all"` | |
| cloud_events.eventsPublisher.topicName | string | `"arn:aws:sns:us-east-2:123456:123-my-topic"` | |
| cloud_events.type | string | `"aws"` | |
| cluster_resource_manager | object | `{"config":{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}},"enabled":true,"nodeSelector":{},"podAnnotations":{},"podEnv":{},"podLabels":{},"resources":{},"service_account_name":"flyteadmin","standaloneDeployment":false,"templates":[{"key":"aa_namespace","value":"apiVersion: v1\nkind: Namespace\nmetadata:\n name: {{ namespace }}\nspec:\n finalizers:\n - kubernetes\n"},{"key":"ab_project_resource_quota","value":"apiVersion: v1\nkind: ResourceQuota\nmetadata:\n name: project-quota\n namespace: {{ namespace }}\nspec:\n hard:\n limits.cpu: {{ projectQuotaCpu }}\n limits.memory: {{ projectQuotaMemory }}\n"}]}` | Configuration for the Cluster resource manager component. This is an optional component, that enables automatic cluster configuration. This is useful to set default quotas, manage namespaces etc that map to a project/domain |
| cluster_resource_manager | object | `{"config":{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}},"enabled":true,"nodeSelector":{},"podAnnotations":{},"podEnv":{},"podLabels":{},"prometheus":{"enabled":false,"path":"/metrics","port":10254},"resources":{},"service_account_name":"flyteadmin","standaloneDeployment":false,"templates":[{"key":"aa_namespace","value":"apiVersion: v1\nkind: Namespace\nmetadata:\n name: {{ namespace }}\nspec:\n finalizers:\n - kubernetes\n"},{"key":"ab_project_resource_quota","value":"apiVersion: v1\nkind: ResourceQuota\nmetadata:\n name: project-quota\n namespace: {{ namespace }}\nspec:\n hard:\n limits.cpu: {{ projectQuotaCpu }}\n limits.memory: {{ projectQuotaMemory }}\n"}]}` | Configuration for the Cluster resource manager component. This is an optional component, that enables automatic cluster configuration. This is useful to set default quotas, manage namespaces etc that map to a project/domain |
| cluster_resource_manager.config | object | `{"cluster_resources":{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}}` | Configmap for ClusterResource parameters |
| cluster_resource_manager.config.cluster_resources | object | `{"customData":[{"production":[{"projectQuotaCpu":{"value":"5"}},{"projectQuotaMemory":{"value":"4000Mi"}}]},{"staging":[{"projectQuotaCpu":{"value":"2"}},{"projectQuotaMemory":{"value":"3000Mi"}}]},{"development":[{"projectQuotaCpu":{"value":"4"}},{"projectQuotaMemory":{"value":"3000Mi"}}]}],"refreshInterval":"5m","standaloneDeployment":false,"templatePath":"/etc/flyte/clusterresource/templates"}` | ClusterResource parameters Refer to the [structure](https://pkg.go.dev/github.com/lyft/flyteadmin@v0.3.37/pkg/runtime/interfaces#ClusterResourceConfig) to customize. |
| cluster_resource_manager.config.cluster_resources.refreshInterval | string | `"5m"` | How frequently to run the sync process |
Expand Down Expand Up @@ -209,15 +209,22 @@ helm install gateway bitnami/contour -n flyte
| flyteconsole.image.repository | string | `"cr.flyte.org/flyteorg/flyteconsole"` | Docker image for Flyteconsole deployment |
| flyteconsole.image.tag | string | `"v1.14.0"` | |
| flyteconsole.imagePullSecrets | list | `[]` | ImagePullSecrets to assign to the Flyteconsole deployment |
| flyteconsole.livenessProbe | object | `{}` | |
| flyteconsole.nodeSelector | object | `{}` | nodeSelector for Flyteconsole deployment |
| flyteconsole.podAnnotations | object | `{}` | Annotations for Flyteconsole pods |
| flyteconsole.podEnv | object | `{}` | Additional Flyteconsole container environment variables |
| flyteconsole.podLabels | object | `{}` | Labels for Flyteconsole pods |
| flyteconsole.priorityClassName | string | `""` | Sets priorityClassName for flyte console pod(s). |
| flyteconsole.readinessProbe | object | `{}` | |
| flyteconsole.replicaCount | int | `1` | Replicas count for Flyteconsole deployment |
| flyteconsole.resources | object | `{"limits":{"cpu":"500m","memory":"250Mi"},"requests":{"cpu":"10m","memory":"50Mi"}}` | Default resources requests and limits for Flyteconsole deployment |
| flyteconsole.securityContext | object | `{"fsGroupChangePolicy":"OnRootMismatch","runAsNonRoot":true,"runAsUser":1000,"seLinuxOptions":{"type":"spc_t"}}` | Sets securityContext for flyteconsole pod(s). |
| flyteconsole.service | object | `{"annotations":{},"type":"ClusterIP"}` | Service settings for Flyteconsole |
| flyteconsole.serviceMonitor | object | `{"enabled":false,"interval":"60s","labels":{},"scrapeTimeout":"30s"}` | Settings for flyteconsole service monitor |
| flyteconsole.serviceMonitor.enabled | bool | `false` | If enabled create the flyteconsole service monitor |
| flyteconsole.serviceMonitor.interval | string | `"60s"` | Sets the interval at which metrics will be scraped by prometheus |
| flyteconsole.serviceMonitor.labels | object | `{}` | Sets the labels for the service monitor which are required by Prometheus to auto-detect the service monitor and start scraping the metrics |
| flyteconsole.serviceMonitor.scrapeTimeout | string | `"30s"` | Sets the timeout after which request to scrape metrics will time out |
| flyteconsole.tolerations | list | `[]` | tolerations for Flyteconsole deployment |
| flytepropeller.additionalContainers | list | `[]` | Appends additional containers to the deployment spec. May include template values. |
| flytepropeller.additionalVolumeMounts | list | `[]` | Appends additional volume mounts to the main container's spec. May include template values. |
Expand All @@ -238,6 +245,9 @@ helm install gateway bitnami/contour -n flyte
| flytepropeller.podEnv | object | `{}` | Additional Flytepropeller container environment variables |
| flytepropeller.podLabels | object | `{}` | Labels for Flytepropeller pods |
| flytepropeller.priorityClassName | string | `""` | Sets priorityClassName for propeller pod(s). |
| flytepropeller.prometheus.enabled | bool | `false` | |
| flytepropeller.prometheus.path | string | `"/metrics"` | |
| flytepropeller.prometheus.port | int | `10254` | |
| flytepropeller.replicaCount | int | `1` | Replicas count for Flytepropeller deployment |
| flytepropeller.resources | object | `{"limits":{"cpu":"200m","ephemeral-storage":"100Mi","memory":"200Mi"},"requests":{"cpu":"10m","ephemeral-storage":"50Mi","memory":"100Mi"}}` | Default resources requests and limits for Flytepropeller deployment |
| flytepropeller.securityContext | object | `{"fsGroup":65534,"fsGroupChangePolicy":"Always","runAsUser":1001}` | Sets securityContext for flytepropeller pod(s). |
Expand Down Expand Up @@ -295,8 +305,22 @@ helm install gateway bitnami/contour -n flyte
| storage.s3.authType | string | `"iam"` | type of authentication to use for S3 buckets, can either be iam or accesskey |
| storage.s3.secretKey | string | `""` | AWS IAM user secret access key to use for S3 bucket auth, only used if authType is set to accesskey |
| storage.type | string | `"sandbox"` | Sets the storage type. Supported values are sandbox, s3, gcs and custom. |
| webhook.autoscaling.enabled | bool | `false` | |
| webhook.autoscaling.maxReplicas | int | `10` | |
| webhook.autoscaling.metrics[0].resource.name | string | `"cpu"` | |
| webhook.autoscaling.metrics[0].resource.target.averageUtilization | int | `80` | |
| webhook.autoscaling.metrics[0].resource.target.type | string | `"Utilization"` | |
| webhook.autoscaling.metrics[0].type | string | `"Resource"` | |
| webhook.autoscaling.metrics[1].resource.name | string | `"memory"` | |
| webhook.autoscaling.metrics[1].resource.target.averageUtilization | int | `80` | |
| webhook.autoscaling.metrics[1].resource.target.type | string | `"Utilization"` | |
| webhook.autoscaling.metrics[1].type | string | `"Resource"` | |
| webhook.autoscaling.minReplicas | int | `1` | |
| webhook.enabled | bool | `true` | enable or disable secrets webhook |
| webhook.priorityClassName | string | `""` | Sets priorityClassName for webhook pod |
| webhook.prometheus.enabled | bool | `false` | |
| webhook.prometheus.path | string | `"/metrics"` | |
| webhook.prometheus.port | int | `10254` | |
| webhook.resources.requests.cpu | string | `"200m"` | |
| webhook.resources.requests.ephemeral-storage | string | `"500Mi"` | |
| webhook.resources.requests.memory | string | `"500Mi"` | |
Expand Down
1 change: 0 additions & 1 deletion charts/flyte-core/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ app.kubernetes.io/managed-by: {{ .Release.Service }}
{{ toYaml . }}
{{- end }}
{{- end -}}

{{- define "datacatalog.name" -}}
datacatalog
{{- end -}}
Expand Down
11 changes: 11 additions & 0 deletions charts/flyte-core/templates/clusterresourcesync/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ spec:
{{- with .Values.cluster_resource_manager.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
prometheus.io/path: {{ .Values.cluster_resource_manager.prometheus.path | quote }}
prometheus.io/port: {{ .Values.cluster_resource_manager.prometheus.port | quote }}
{{- with .Values.cluster_resource_manager.prometheus.enabled }}
prometheus.io/scrape: "true"
{{- end }}
labels: {{ include "flyteclusterresourcesync.podLabels" . | nindent 8 }}
spec:
containers:
Expand Down Expand Up @@ -55,6 +60,12 @@ spec:
- mountPath: /var/run/credentials
name: cluster-secrets
{{- end }}
{{- if .Values.cluster_resource_manager.prometheus.enabled }}
ports:
- containerPort: {{ .Values.cluster_resource_manager.prometheus.port }}
name: debug
protocol: TCP
{{- end }}
serviceAccountName: {{ .Values.cluster_resource_manager.service_account_name }}
volumes: {{- include "databaseSecret.volume" . | nindent 8 }}
- configMap:
Expand Down
13 changes: 13 additions & 0 deletions charts/flyte-core/templates/console/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ spec:
name: flyte-console-config
ports:
- containerPort: 8080
{{- if .Values.flyteconsole.serviceMonitor.enabled }}
- containerPort: 8081
name: http-metrics
protocol: TCP
{{- end }}
{{- if or .Values.flyteconsole.ga.enabled .Values.flyteconsole.podEnv }}
env:
{{- end }}
Expand All @@ -59,6 +64,14 @@ spec:
volumeMounts:
- mountPath: /srv/flyte
name: shared-data
{{- with .Values.flyteconsole.livenessProbe }}
livenessProbe:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- with .Values.flyteconsole.readinessProbe }}
readinessProbe:
{{- toYaml . | nindent 10 }}
{{- end }}
volumes:
- emptyDir: {}
name: shared-data
Expand Down
19 changes: 19 additions & 0 deletions charts/flyte-core/templates/console/service-monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{{- if and .Values.flyteconsole.serviceMonitor.enabled .Values.flyteconsole.enabled }}
{{- /*
  Prometheus Operator ServiceMonitor for flyteconsole.
  Rendered only when both flyteconsole and flyteconsole.serviceMonitor are
  enabled. Scrapes /metrics on the Service port named "http-metrics".
*/}}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ template "flyteconsole.name" . }}
  namespace: {{ template "flyte.namespace" . }}
  {{- /* Emit the labels key only when labels are set, so it never renders as a bare null key. */}}
  {{- with .Values.flyteconsole.serviceMonitor.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  endpoints:
    {{- /* Durations are quoted so the rendered values are always YAML strings. */}}
    - interval: {{ .Values.flyteconsole.serviceMonitor.interval | quote }}
      port: http-metrics
      path: /metrics
      scrapeTimeout: {{ .Values.flyteconsole.serviceMonitor.scrapeTimeout | quote }}
  selector:
    matchLabels: {{ include "flyteconsole.selectorLabels" . | nindent 6 }}
{{- end }}
5 changes: 5 additions & 0 deletions charts/flyte-core/templates/console/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,10 @@ spec:
port: 80
protocol: TCP
targetPort: 8080
{{- if .Values.flyteconsole.serviceMonitor.enabled }}
- name: http-metrics
port: 8081
protocol: TCP
{{- end }}
selector: {{ include "flyteconsole.selectorLabels" . | nindent 4 }}
{{- end }}
10 changes: 10 additions & 0 deletions charts/flyte-core/templates/propeller/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ spec:
{{- with .Values.flytepropeller.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
prometheus.io/path: {{ .Values.flytepropeller.prometheus.path | quote }}
prometheus.io/port: {{ .Values.flytepropeller.prometheus.port | quote }}
{{- with .Values.flytepropeller.prometheus.enabled }}
prometheus.io/scrape: "true"
{{- end }}
{{- if .Values.flytepropeller.manager }}
labels: {{ include "flytepropeller-manager.podLabels" . | nindent 8 }}
{{- else }}
Expand Down Expand Up @@ -78,6 +83,11 @@ spec:
{{- end }}
ports:
- containerPort: {{ index .Values.configmap.core.propeller "prof-port" }}
{{- if .Values.flytepropeller.prometheus.enabled }}
- containerPort: {{ .Values.flytepropeller.prometheus.port }}
name: debug
protocol: TCP
{{- end }}
resources: {{- toYaml .Values.flytepropeller.resources | nindent 10 }}
volumeMounts:
- name: config-volume
Expand Down
17 changes: 17 additions & 0 deletions charts/flyte-core/templates/propeller/webhook-hpa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{{- if .Values.webhook.autoscaling.enabled }}
{{- /*
  HorizontalPodAutoscaler for the flyte pod webhook Deployment.
  Rendered only when webhook.autoscaling.enabled is true. Replica bounds and
  the metrics list are taken verbatim from webhook.autoscaling.
  NOTE(review): the namespace assumes the webhook Deployment is created in
  "flyte.namespace" like the chart's other resources - confirm against
  templates/propeller/webhook.yaml.
*/}}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ template "flyte-pod-webhook.name" . }}
  namespace: {{ template "flyte.namespace" . }}
  labels:
    app: {{ template "flyte-pod-webhook.name" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ template "flyte-pod-webhook.name" . }}
  minReplicas: {{ .Values.webhook.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.webhook.autoscaling.maxReplicas }}
  metrics:
    {{- /* Left-trimmed so nindent supplies the only newline and no whitespace-only line is emitted. */}}
    {{- toYaml .Values.webhook.autoscaling.metrics | nindent 4 }}
{{- end }}
10 changes: 10 additions & 0 deletions charts/flyte-core/templates/propeller/webhook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ spec:
{{- with .Values.flytepropeller.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
prometheus.io/path: {{ .Values.webhook.prometheus.path | quote }}
prometheus.io/port: {{ .Values.webhook.prometheus.port | quote }}
{{- with .Values.webhook.prometheus.enabled }}
prometheus.io/scrape: "true"
{{- end }}
spec:
{{- with .Values.webhook.securityContext }}
securityContext: {{ tpl (toYaml .) $ | nindent 8 }}
Expand Down Expand Up @@ -102,6 +107,11 @@ spec:
{{- end }}
ports:
- containerPort: 9443
{{- if .Values.webhook.prometheus.enabled }}
- containerPort: {{ .Values.webhook.prometheus.port }}
name: debug
protocol: TCP
{{- end }}
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand Down
49 changes: 49 additions & 0 deletions charts/flyte-core/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,12 @@ flytepropeller:
interval: 60s
# -- Sets the timeout after which request to scrape metrics will time out
scrapeTimeout: 30s

# Prometheus scrape annotations for the flytepropeller pods.
prometheus:
  # When true, the pods are annotated with `prometheus.io/scrape: "true"` and
  # `port` is additionally exposed as a container port named "debug".
  enabled: false
  # Value of the `prometheus.io/path` pod annotation.
  path: "/metrics"
  # Value of the `prometheus.io/port` pod annotation (and the container port
  # exposed when `enabled` is true).
  port: 10254

#
# FLYTECONSOLE SETTINGS
#
Expand Down Expand Up @@ -436,6 +442,21 @@ flyteconsole:
seLinuxOptions:
type: spc_t

# -- Settings for flyteconsole service monitor
serviceMonitor:
# -- If enabled create the flyteconsole service monitor
enabled: false
# -- Sets the interval at which metrics will be scraped by prometheus
interval: 60s
# -- Sets the timeout after which request to scrape metrics will time out
scrapeTimeout: 30s
# -- Sets the labels for the service monitor which are required by
# Prometheus to auto-detect the service monitor and start scraping the metrics
labels: {}

# Liveness probe for the flyteconsole Deployment. Rendered verbatim under
# `livenessProbe:`; the key is omitted from the manifest when this is empty.
livenessProbe: {}
# Readiness probe for the flyteconsole Deployment. Rendered verbatim under
# `readinessProbe:`; the key is omitted from the manifest when this is empty.
readinessProbe: {}

# It will enable the redoc route in ingress
deployRedoc: false

Expand Down Expand Up @@ -492,6 +513,29 @@ webhook:
ephemeral-storage: 500Mi
memory: 500Mi

# HorizontalPodAutoscaler settings for the pod webhook Deployment.
autoscaling:
  # When true, an autoscaling/v2 HorizontalPodAutoscaler targeting the webhook
  # Deployment is created.
  enabled: false
  # Lower bound for the number of replicas (HPA `spec.minReplicas`).
  minReplicas: 1
  # Upper bound for the number of replicas (HPA `spec.maxReplicas`).
  maxReplicas: 10
  # Rendered verbatim as the HPA `spec.metrics` list. The defaults target 80%
  # average utilization for both CPU and memory.
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 80
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

# Prometheus scrape annotations for the pod webhook pods.
prometheus:
  # When true, the pods are annotated with `prometheus.io/scrape: "true"` and
  # `port` is additionally exposed as a container port named "debug".
  enabled: false
  # Value of the `prometheus.io/path` pod annotation.
  path: "/metrics"
  # Value of the `prometheus.io/port` pod annotation (and the container port
  # exposed when `enabled` is true).
  port: 10254

# ------------------------------------------------
#
# COMMON SETTINGS
Expand Down Expand Up @@ -951,6 +995,11 @@ cluster_resource_manager:
- projectQuotaMemory:
value: "3000Mi"

# Prometheus scrape annotations for the cluster resource sync pods.
prometheus:
  # When true, the pods are annotated with `prometheus.io/scrape: "true"` and
  # `port` is exposed as a container port named "debug".
  enabled: false
  # Value of the `prometheus.io/path` pod annotation.
  path: "/metrics"
  # Value of the `prometheus.io/port` pod annotation (and the container port
  # exposed when `enabled` is true).
  port: 10254

# -- Resource templates that should be applied
templates:
# -- Template for namespaces resources
Expand Down
6 changes: 6 additions & 0 deletions deployment/eks/flyte_aws_scheduler_helm_generated.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1045,6 +1045,8 @@ spec:
metadata:
annotations:
configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
labels:
app.kubernetes.io/name: flyteclusterresourcesync
app.kubernetes.io/instance: flyte
Expand Down Expand Up @@ -1270,6 +1272,8 @@ spec:
metadata:
annotations:
configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
labels:
app.kubernetes.io/name: flytepropeller
app.kubernetes.io/instance: flyte
Expand Down Expand Up @@ -1352,6 +1356,8 @@ spec:
app.kubernetes.io/version: v1.12.1-rc0
annotations:
configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
spec:
securityContext:
fsGroup: 65534
Expand Down
2 changes: 2 additions & 0 deletions deployment/eks/flyte_helm_controlplane_generated.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,8 @@ spec:
metadata:
annotations:
configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
labels:
app.kubernetes.io/name: flyteclusterresourcesync
app.kubernetes.io/instance: flyte
Expand Down
4 changes: 4 additions & 0 deletions deployment/eks/flyte_helm_dataplane_generated.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,8 @@ spec:
metadata:
annotations:
configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
labels:
app.kubernetes.io/name: flytepropeller
app.kubernetes.io/instance: flyte
Expand Down Expand Up @@ -511,6 +513,8 @@ spec:
app.kubernetes.io/version: v1.12.1-rc0
annotations:
configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
spec:
securityContext:
fsGroup: 65534
Expand Down
6 changes: 6 additions & 0 deletions deployment/eks/flyte_helm_generated.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,8 @@ spec:
metadata:
annotations:
configChecksum: "55ce597c10b17ef6e891f0c9242b17aafb3d7b4e4e414d0a5078d71ad9c804f"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
labels:
app.kubernetes.io/name: flyteclusterresourcesync
app.kubernetes.io/instance: flyte
Expand Down Expand Up @@ -1400,6 +1402,8 @@ spec:
metadata:
annotations:
configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
labels:
app.kubernetes.io/name: flytepropeller
app.kubernetes.io/instance: flyte
Expand Down Expand Up @@ -1482,6 +1486,8 @@ spec:
app.kubernetes.io/version: v1.12.1-rc0
annotations:
configChecksum: "045a4308f47bb9665d221d5d83667a7c9e05ca761134dc79fa8295dd8b611dd"
prometheus.io/path: "/metrics"
prometheus.io/port: "10254"
spec:
securityContext:
fsGroup: 65534
Expand Down
Loading

0 comments on commit 3ee7120

Please sign in to comment.