diff --git a/helm/slurm-cluster/Chart.yaml b/helm/slurm-cluster/Chart.yaml index d45b51ef8..002a8c059 100644 --- a/helm/slurm-cluster/Chart.yaml +++ b/helm/slurm-cluster/Chart.yaml @@ -7,7 +7,7 @@ icon: https://upload.wikimedia.org/wikipedia/commons/3/3a/Slurm_logo.svg sources: - https://github.com/SquareFactory/slurm-docker version: 0.1.0 -appVersion: '21.08.8-2-1' +appVersion: '22.05.2-1-1' maintainers: - name: Marc Nguyen email: marc@squarefactory.io diff --git a/helm/slurm-cluster/templates/open-ondemand/configmap.yml b/helm/slurm-cluster/templates/open-ondemand/configmap.yml index 915143361..9c738c5cd 100644 --- a/helm/slurm-cluster/templates/open-ondemand/configmap.yml +++ b/helm/slurm-cluster/templates/open-ondemand/configmap.yml @@ -3,10 +3,7 @@ kind: ConfigMap apiVersion: v1 metadata: name: "{{ template "slurm-cluster.ondemand.name" . }}-config" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.ondemand.name" . }}" data: nginx_stage.yml: | diff --git a/helm/slurm-cluster/templates/open-ondemand/ingress.yml b/helm/slurm-cluster/templates/open-ondemand/ingress.yml index 740352ff3..6e60437df 100644 --- a/helm/slurm-cluster/templates/open-ondemand/ingress.yml +++ b/helm/slurm-cluster/templates/open-ondemand/ingress.yml @@ -3,10 +3,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: "{{ template "slurm-cluster.ondemand.name" . }}-http" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.ondemand.name" . }}" {{- if .Values.ondemand.httpIngress.annotations }} annotations: @@ -38,10 +35,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: "{{ template "slurm-cluster.ondemand.name" . }}-oidc" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.ondemand.name" . 
}}" {{- if .Values.ondemand.oidcIngress.annotations }} annotations: diff --git a/helm/slurm-cluster/templates/open-ondemand/service.yml b/helm/slurm-cluster/templates/open-ondemand/service.yml index 5864700cd..ce0c44e18 100644 --- a/helm/slurm-cluster/templates/open-ondemand/service.yml +++ b/helm/slurm-cluster/templates/open-ondemand/service.yml @@ -4,10 +4,7 @@ apiVersion: v1 kind: Service metadata: name: "{{ template "slurm-cluster.ondemand.name" $ }}" - namespace: '{{ $.Release.Namespace }}' labels: - release: '{{ $.Release.Name }}' - chart: '{{ $.Chart.Name }}' app: "{{ template "slurm-cluster.ondemand.name" $ }}" spec: type: {{ $serviceValues.type }} diff --git a/helm/slurm-cluster/templates/open-ondemand/statefulset.yml b/helm/slurm-cluster/templates/open-ondemand/statefulset.yml index c3f062031..6c1874f3d 100644 --- a/helm/slurm-cluster/templates/open-ondemand/statefulset.yml +++ b/helm/slurm-cluster/templates/open-ondemand/statefulset.yml @@ -3,10 +3,7 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: "{{ template "slurm-cluster.ondemand.name" . }}" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.ondemand.name" . }}" spec: serviceName: "{{ template "slurm-cluster.ondemand.name" . }}" @@ -23,8 +20,6 @@ spec: metadata: name: "{{ template "slurm-cluster.ondemand.name" . }}" labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" app: "{{ template "slurm-cluster.ondemand.name" . }}" app.kubernetes.io/name: "{{ template "slurm-cluster.ondemand.name" . }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.ondemand.name" . 
}}" diff --git a/helm/slurm-cluster/templates/slurm-configmap.yml b/helm/slurm-cluster/templates/slurm-configmap.yml index 5995233d3..6d9980ea4 100644 --- a/helm/slurm-cluster/templates/slurm-configmap.yml +++ b/helm/slurm-cluster/templates/slurm-configmap.yml @@ -2,10 +2,8 @@ kind: ConfigMap apiVersion: v1 metadata: name: "{{ template "slurm-cluster.name" . }}-slurm-config" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' + app: "{{ template "slurm-cluster.name" $ }}" data: slurm.conf: | ##-- Cluster definition diff --git a/helm/slurm-cluster/templates/slurm-controller/serviceperreplica.yml b/helm/slurm-cluster/templates/slurm-controller/serviceperreplica.yml index 1cc852c41..9c1281846 100644 --- a/helm/slurm-cluster/templates/slurm-controller/serviceperreplica.yml +++ b/helm/slurm-cluster/templates/slurm-controller/serviceperreplica.yml @@ -5,17 +5,13 @@ apiVersion: v1 kind: List metadata: name: "{{ template "slurm-cluster.controller.name" . }}-serviceperreplica" - namespace: '{{ $.Release.Namespace }}' items: {{- range $i, $e := until $count }} - apiVersion: v1 kind: Service metadata: name: "{{ template "slurm-cluster.controller.name" $ }}-{{ $i }}" - namespace: '{{ $.Release.Namespace }}' labels: - release: {{ $.Release.Name | quote }} - chart: "{{ $.Chart.Name }}" app: "{{ template "slurm-cluster.controller.name" $ }}" app.kubernetes.io/name: "{{ template "slurm-cluster.controller.name" $ }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.controller.name" $ }}" diff --git a/helm/slurm-cluster/templates/slurm-controller/statefulset.yml b/helm/slurm-cluster/templates/slurm-controller/statefulset.yml index 0891b3594..ee6c8ec92 100644 --- a/helm/slurm-cluster/templates/slurm-controller/statefulset.yml +++ b/helm/slurm-cluster/templates/slurm-controller/statefulset.yml @@ -3,10 +3,7 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: "{{ template "slurm-cluster.controller.name" . 
}}" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.controller.name" . }}" spec: serviceName: "{{ template "slurm-cluster.controller.name" . }}" @@ -23,8 +20,6 @@ spec: metadata: name: "{{ template "slurm-cluster.controller.name" . }}" labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" app: "{{ template "slurm-cluster.controller.name" . }}" app.kubernetes.io/name: "{{ template "slurm-cluster.controller.name" . }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.controller.name" . }}" diff --git a/helm/slurm-cluster/templates/slurm-db/network-attachment-definition.yaml b/helm/slurm-cluster/templates/slurm-db/network-attachment-definition.yaml index a775321a9..62c3b3a15 100644 --- a/helm/slurm-cluster/templates/slurm-db/network-attachment-definition.yaml +++ b/helm/slurm-cluster/templates/slurm-db/network-attachment-definition.yaml @@ -3,10 +3,7 @@ apiVersion: 'k8s.cni.cncf.io/v1' kind: NetworkAttachmentDefinition metadata: name: {{ template "slurm-cluster.db.name" . }}-net - namespace: {{ .Release.Namespace }} labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.db.name" . 
}}" spec: config: | diff --git a/helm/slurm-cluster/templates/slurm-db/service.yml b/helm/slurm-cluster/templates/slurm-db/service.yml index 4de636644..05ac2ee1b 100644 --- a/helm/slurm-cluster/templates/slurm-db/service.yml +++ b/helm/slurm-cluster/templates/slurm-db/service.yml @@ -4,10 +4,7 @@ apiVersion: v1 kind: Service metadata: name: "{{ template "slurm-cluster.db.name" $ }}" - namespace: '{{ $.Release.Namespace }}' labels: - release: {{ $.Release.Name | quote }} - chart: "{{ $.Chart.Name }}" app: "{{ template "slurm-cluster.db.name" $ }}" app.kubernetes.io/name: "{{ template "slurm-cluster.db.name" $ }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.db.name" $ }}" diff --git a/helm/slurm-cluster/templates/slurm-db/statefulset.yml b/helm/slurm-cluster/templates/slurm-db/statefulset.yml index a9f37f56e..711a40d1d 100644 --- a/helm/slurm-cluster/templates/slurm-db/statefulset.yml +++ b/helm/slurm-cluster/templates/slurm-db/statefulset.yml @@ -3,10 +3,7 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: "{{ template "slurm-cluster.db.name" . }}" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.db.name" . }}" spec: serviceName: "{{ template "slurm-cluster.db.name" . }}" @@ -23,8 +20,6 @@ spec: metadata: name: "{{ template "slurm-cluster.db.name" . }}" labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" app: "{{ template "slurm-cluster.db.name" . }}" app.kubernetes.io/name: "{{ template "slurm-cluster.db.name" . }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.db.name" . 
}}" diff --git a/helm/slurm-cluster/templates/slurm-login/configmap.yml b/helm/slurm-cluster/templates/slurm-login/configmap.yml index 35755d9d0..8e2e15063 100644 --- a/helm/slurm-cluster/templates/slurm-login/configmap.yml +++ b/helm/slurm-cluster/templates/slurm-login/configmap.yml @@ -3,10 +3,7 @@ kind: ConfigMap apiVersion: v1 metadata: name: "{{ template "slurm-cluster.rest.name" . }}-nginx-config" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.login.name" . }}" data: nginx.conf: | diff --git a/helm/slurm-cluster/templates/slurm-login/deployment.yml b/helm/slurm-cluster/templates/slurm-login/deployment.yml index 9904dd680..cd1be6289 100644 --- a/helm/slurm-cluster/templates/slurm-login/deployment.yml +++ b/helm/slurm-cluster/templates/slurm-login/deployment.yml @@ -3,10 +3,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: "{{ template "slurm-cluster.login.name" . }}" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.login.name" . }}" spec: selector: @@ -22,8 +19,6 @@ spec: metadata: name: "{{ template "slurm-cluster.login.name" . }}" labels: - release: {{ .Release.Name | quote }} - chart: "{{ .Chart.Name }}" app: "{{ template "slurm-cluster.login.name" . }}" app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" . }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" . }}" @@ -173,6 +168,52 @@ spec: subPath: nginx.conf readOnly: true {{- end }} + {{- if and .Values.login.enabled .Values.login.metrics.enabled }} + - name: "{{ template "slurm-cluster.name" . 
}}-metrics" + image: "{{ .Values.login.metrics.image }}" + imagePullPolicy: "{{ .Values.login.metrics.imagePullPolicy }}" + env: + - name: GPU_ACCT + value: "{{ .Values.login.metrics.gpuAccounting | ternary "true" "false" }}" + command: +{{ toYaml .Values.login.metrics.command | indent 12 }} + ports: + - containerPort: 8080 + name: metrics + livenessProbe: + exec: + command: + - /command/s6-svstat + - /var/run/s6-rc/servicedirs/slurm-prometheus-exporter/ +{{ toYaml .Values.login.metrics.livenessProbe | indent 12 }} + volumeMounts: + - name: slurm-conf + mountPath: /etc/slurm + readOnly: false + - name: slurm-spank + mountPath: /etc/slurm/plugstack.conf.d + readOnly: true + - name: sssd + mountPath: /secrets/sssd + readOnly: true + - name: munge + mountPath: /secrets/munge + readOnly: true + - mountPath: /tmp + name: tmp + subPath: metrics/tmp + - mountPath: /run + name: tmp + subPath: metrics/run + - mountPath: /var/log + name: tmp + subPath: metrics/log + {{- if .Values.login.metrics.volumeMounts }} +{{ toYaml .Values.login.metrics.volumeMounts | indent 12 }} + {{- end }} + resources: +{{ toYaml .Values.login.metrics.resources | indent 12 }} + {{- end }} volumes: - name: tmp emptyDir: diff --git a/helm/slurm-cluster/templates/slurm-login/network-attachment-definition.yaml b/helm/slurm-cluster/templates/slurm-login/network-attachment-definition.yaml index 79b5782d8..156ece136 100644 --- a/helm/slurm-cluster/templates/slurm-login/network-attachment-definition.yaml +++ b/helm/slurm-cluster/templates/slurm-login/network-attachment-definition.yaml @@ -3,10 +3,7 @@ apiVersion: 'k8s.cni.cncf.io/v1' kind: NetworkAttachmentDefinition metadata: name: {{ template "slurm-cluster.login.name" . }}-net - namespace: {{ .Release.Namespace }} labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' app: "{{ template "slurm-cluster.login.name" . 
}}" spec: config: | diff --git a/helm/slurm-cluster/templates/slurm-login/service-monitor.yaml b/helm/slurm-cluster/templates/slurm-login/service-monitor.yaml new file mode 100644 index 000000000..36146a2a8 --- /dev/null +++ b/helm/slurm-cluster/templates/slurm-login/service-monitor.yaml @@ -0,0 +1,56 @@ +{{- if and .Values.login.enabled .Values.login.metrics.enabled .Values.login.metrics.monitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ template "slurm-cluster.name" . }}-metrics + labels: + app: {{ template "slurm-cluster.name" . }}-metrics + {{- if .Values.login.metrics.monitor.additionalLabels }} + {{- toYaml .Values.login.metrics.monitor.additionalLabels | nindent 4 }} + {{- end }} +spec: + jobLabel: {{ default "app.kubernetes.io/name" .Values.login.metrics.monitor.jobLabel }} + selector: + matchLabels: + # selectorOverride, when set, replaces the default login label selector entirely + {{- if .Values.login.metrics.monitor.selectorOverride }} + {{- toYaml .Values.login.metrics.monitor.selectorOverride | nindent 6 }} + {{- else }} + app: "{{ template "slurm-cluster.login.name" $ }}" + app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" $ }}" + app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" $ }}" + app.kubernetes.io/component: login + {{- end }} + endpoints: + - port: metrics + path: /metrics + scheme: {{ .Values.login.metrics.monitor.scheme }} + {{- with .Values.login.metrics.monitor.basicAuth }} + basicAuth: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.login.metrics.monitor.bearerTokenFile }} + bearerTokenFile: {{ . }} + {{- end }} + {{- with .Values.login.metrics.monitor.tlsConfig }} + tlsConfig: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.login.metrics.monitor.proxyUrl }} + proxyUrl: {{ . }} + {{- end }} + {{- with .Values.login.metrics.monitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.login.metrics.monitor.scrapeTimeout }} + scrapeTimeout: {{ . 
}} + {{- end }} + {{- with .Values.login.metrics.monitor.relabelings }} + relabelings: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.login.metrics.monitor.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/helm/slurm-cluster/templates/slurm-login/service.yml b/helm/slurm-cluster/templates/slurm-login/service.yml index f71255ea0..36e0d697b 100644 --- a/helm/slurm-cluster/templates/slurm-login/service.yml +++ b/helm/slurm-cluster/templates/slurm-login/service.yml @@ -4,10 +4,7 @@ apiVersion: v1 kind: Service metadata: name: "{{ template "slurm-cluster.login.name" $ }}" - namespace: '{{ $.Release.Namespace }}' labels: - release: {{ $.Release.Name | quote }} - chart: "{{ $.Chart.Name }}" app: "{{ template "slurm-cluster.login.name" $ }}" app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" $ }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" $ }}" @@ -48,6 +45,11 @@ spec: port: 6822 targetPort: http {{- end }} + {{- if .Values.login.metrics.enabled }} + - name: metrics + port: 8080 + targetPort: metrics + {{- end }} selector: app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" $ }}" app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" $ }}" diff --git a/helm/slurm-cluster/templates/slurm-spank-configmap.yml b/helm/slurm-cluster/templates/slurm-spank-configmap.yml index e2b2e340d..909798b7a 100644 --- a/helm/slurm-cluster/templates/slurm-spank-configmap.yml +++ b/helm/slurm-cluster/templates/slurm-spank-configmap.yml @@ -2,10 +2,8 @@ kind: ConfigMap apiVersion: v1 metadata: name: "{{ template "slurm-cluster.name" . 
}}-slurm-spank-config" - namespace: '{{ .Release.Namespace }}' labels: - release: '{{ .Release.Name }}' - chart: '{{ .Chart.Name }}' + app: "{{ template "slurm-cluster.name" $ }}" data: spank.conf: | optional /usr/lib64/slurm/spank_pyxis.so diff --git a/helm/slurm-cluster/values.yaml b/helm/slurm-cluster/values.yaml index e6d7fa33e..515d2b2d7 100644 --- a/helm/slurm-cluster/values.yaml +++ b/helm/slurm-cluster/values.yaml @@ -111,7 +111,7 @@ controller: enabled: true replicas: 1 - image: 'ghcr.io/squarefactory/slurm:22.05.2-1-controller' + image: 'ghcr.io/squarefactory/slurm:22.05.2-1-2-controller' imagePullPolicy: 'IfNotPresent' labels: {} @@ -234,7 +234,7 @@ login: enabled: false replicas: 1 - image: 'ghcr.io/squarefactory/slurm:22.05.2-1-login' + image: 'ghcr.io/squarefactory/slurm:22.05.2-1-2-login' imagePullPolicy: 'IfNotPresent' labels: {} @@ -336,7 +336,7 @@ login: # Slurm REST API rest: enabled: false - image: 'ghcr.io/squarefactory/slurm:22.05.2-1-rest' + image: 'ghcr.io/squarefactory/slurm:22.05.2-1-2-rest' imagePullPolicy: 'IfNotPresent' command: ['/init'] @@ -350,6 +350,59 @@ login: limits: memory: '256Mi' + metrics: + enabled: false + + gpuAccounting: false + + ## You can customize the command to refresh the tls configs with: + ## command: ['sh', '-c', 'update-ca-trust && /init'] + command: ['/init'] + + image: 'ghcr.io/squarefactory/slurm:22.05.2-1-2-prometheus-exporter' + imagePullPolicy: 'IfNotPresent' + + # Extra volume mounts (use login.volumes to add volumes) + volumeMounts: [] + + resources: + {} + # requests: + # cpu: '100m' + # memory: '256Mi' + # limits: + # memory: '1Gi' + + livenessProbe: + initialDelaySeconds: 60 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + + monitor: + enabled: false + additionalLabels: {} + + jobLabel: '' + scheme: http + basicAuth: {} + bearerTokenFile: '' # empty leaves bearerTokenFile out of the rendered ServiceMonitor + tlsConfig: {} + + ## proxyUrl: URL of a proxy that should be used for scraping. 
+ ## + proxyUrl: '' + + ## Override serviceMonitor selector + ## + selectorOverride: {} + + relabelings: [] + metricRelabelings: [] + interval: '' + scrapeTimeout: 10s + ondemand: enabled: false replicas: 1 @@ -517,7 +570,7 @@ db: enabled: false replicas: 1 - image: 'ghcr.io/squarefactory/slurm:22.05.2-1-db' + image: 'ghcr.io/squarefactory/slurm:22.05.2-1-2-db' imagePullPolicy: 'IfNotPresent' labels: {}