Skip to content

Commit

Permalink
feat(slurm): added metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Nguyen Marc <nguyen_marc@live.fr>
  • Loading branch information
Darkness4 committed Aug 9, 2022
1 parent d0909ad commit 3dcc4ab
Show file tree
Hide file tree
Showing 18 changed files with 166 additions and 62 deletions.
2 changes: 1 addition & 1 deletion helm/slurm-cluster/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ icon: https://upload.wikimedia.org/wikipedia/commons/3/3a/Slurm_logo.svg
sources:
- https://github.com/SquareFactory/slurm-docker
version: 0.1.0
appVersion: '21.08.8-2-1'
appVersion: '22.05.2-1-1'
maintainers:
- name: Marc Nguyen
email: marc@squarefactory.io
Expand Down
3 changes: 0 additions & 3 deletions helm/slurm-cluster/templates/open-ondemand/configmap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ kind: ConfigMap
apiVersion: v1
metadata:
name: "{{ template "slurm-cluster.ondemand.name" . }}-config"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.ondemand.name" . }}"
data:
nginx_stage.yml: |
Expand Down
6 changes: 0 additions & 6 deletions helm/slurm-cluster/templates/open-ondemand/ingress.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: "{{ template "slurm-cluster.ondemand.name" . }}-http"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.ondemand.name" . }}"
{{- if .Values.ondemand.httpIngress.annotations }}
annotations:
Expand Down Expand Up @@ -38,10 +35,7 @@ apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: "{{ template "slurm-cluster.ondemand.name" . }}-oidc"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.ondemand.name" . }}"
{{- if .Values.ondemand.oidcIngress.annotations }}
annotations:
Expand Down
3 changes: 0 additions & 3 deletions helm/slurm-cluster/templates/open-ondemand/service.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@ apiVersion: v1
kind: Service
metadata:
name: "{{ template "slurm-cluster.ondemand.name" $ }}"
namespace: '{{ $.Release.Namespace }}'
labels:
release: '{{ $.Release.Name }}'
chart: '{{ $.Chart.Name }}'
app: "{{ template "slurm-cluster.ondemand.name" $ }}"
spec:
type: {{ $serviceValues.type }}
Expand Down
5 changes: 0 additions & 5 deletions helm/slurm-cluster/templates/open-ondemand/statefulset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ apiVersion: apps/v1
kind: StatefulSet
metadata:
name: "{{ template "slurm-cluster.ondemand.name" . }}"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.ondemand.name" . }}"
spec:
serviceName: "{{ template "slurm-cluster.ondemand.name" . }}"
Expand All @@ -23,8 +20,6 @@ spec:
metadata:
name: "{{ template "slurm-cluster.ondemand.name" . }}"
labels:
release: {{ .Release.Name | quote }}
chart: "{{ .Chart.Name }}"
app: "{{ template "slurm-cluster.ondemand.name" . }}"
app.kubernetes.io/name: "{{ template "slurm-cluster.ondemand.name" . }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.ondemand.name" . }}"
Expand Down
4 changes: 1 addition & 3 deletions helm/slurm-cluster/templates/slurm-configmap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@ kind: ConfigMap
apiVersion: v1
metadata:
name: "{{ template "slurm-cluster.name" . }}-slurm-config"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.name" $ }}"
data:
slurm.conf: |
##-- Cluster definition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,13 @@ apiVersion: v1
kind: List
metadata:
name: "{{ template "slurm-cluster.controller.name" . }}-serviceperreplica"
namespace: '{{ $.Release.Namespace }}'
items:
{{- range $i, $e := until $count }}
- apiVersion: v1
kind: Service
metadata:
name: "{{ template "slurm-cluster.controller.name" $ }}-{{ $i }}"
namespace: '{{ $.Release.Namespace }}'
labels:
release: {{ $.Release.Name | quote }}
chart: "{{ $.Chart.Name }}"
app: "{{ template "slurm-cluster.controller.name" $ }}"
app.kubernetes.io/name: "{{ template "slurm-cluster.controller.name" $ }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.controller.name" $ }}"
Expand Down
5 changes: 0 additions & 5 deletions helm/slurm-cluster/templates/slurm-controller/statefulset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ apiVersion: apps/v1
kind: StatefulSet
metadata:
name: "{{ template "slurm-cluster.controller.name" . }}"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.controller.name" . }}"
spec:
serviceName: "{{ template "slurm-cluster.controller.name" . }}"
Expand All @@ -23,8 +20,6 @@ spec:
metadata:
name: "{{ template "slurm-cluster.controller.name" . }}"
labels:
release: {{ .Release.Name | quote }}
chart: "{{ .Chart.Name }}"
app: "{{ template "slurm-cluster.controller.name" . }}"
app.kubernetes.io/name: "{{ template "slurm-cluster.controller.name" . }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.controller.name" . }}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ apiVersion: 'k8s.cni.cncf.io/v1'
kind: NetworkAttachmentDefinition
metadata:
name: {{ template "slurm-cluster.db.name" . }}-net
namespace: {{ .Release.Namespace }}
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.db.name" . }}"
spec:
config: |
Expand Down
3 changes: 0 additions & 3 deletions helm/slurm-cluster/templates/slurm-db/service.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@ apiVersion: v1
kind: Service
metadata:
name: "{{ template "slurm-cluster.db.name" $ }}"
namespace: '{{ $.Release.Namespace }}'
labels:
release: {{ $.Release.Name | quote }}
chart: "{{ $.Chart.Name }}"
app: "{{ template "slurm-cluster.db.name" $ }}"
app.kubernetes.io/name: "{{ template "slurm-cluster.db.name" $ }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.db.name" $ }}"
Expand Down
5 changes: 0 additions & 5 deletions helm/slurm-cluster/templates/slurm-db/statefulset.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ apiVersion: apps/v1
kind: StatefulSet
metadata:
name: "{{ template "slurm-cluster.db.name" . }}"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.db.name" . }}"
spec:
serviceName: "{{ template "slurm-cluster.db.name" . }}"
Expand All @@ -23,8 +20,6 @@ spec:
metadata:
name: "{{ template "slurm-cluster.db.name" . }}"
labels:
release: {{ .Release.Name | quote }}
chart: "{{ .Chart.Name }}"
app: "{{ template "slurm-cluster.db.name" . }}"
app.kubernetes.io/name: "{{ template "slurm-cluster.db.name" . }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.db.name" . }}"
Expand Down
3 changes: 0 additions & 3 deletions helm/slurm-cluster/templates/slurm-login/configmap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ kind: ConfigMap
apiVersion: v1
metadata:
name: "{{ template "slurm-cluster.rest.name" . }}-nginx-config"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.login.name" . }}"
data:
nginx.conf: |
Expand Down
51 changes: 46 additions & 5 deletions helm/slurm-cluster/templates/slurm-login/deployment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: "{{ template "slurm-cluster.login.name" . }}"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.login.name" . }}"
spec:
selector:
Expand All @@ -22,8 +19,6 @@ spec:
metadata:
name: "{{ template "slurm-cluster.login.name" . }}"
labels:
release: {{ .Release.Name | quote }}
chart: "{{ .Chart.Name }}"
app: "{{ template "slurm-cluster.login.name" . }}"
app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" . }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" . }}"
Expand Down Expand Up @@ -173,6 +168,52 @@ spec:
subPath: nginx.conf
readOnly: true
{{- end }}
{{- if and .Values.login.enabled .Values.login.metrics.enabled }}
# Metrics exporter container of the login Deployment. It is rendered only
# when both login.enabled and login.metrics.enabled are set in the values.
- name: "{{ template "slurm-cluster.name" . }}-metrics"
image: "{{ .Values.login.metrics.image }}"
imagePullPolicy: "{{ .Values.login.metrics.imagePullPolicy }}"
env:
# GPU_ACCT is always passed as the string "true" or "false", derived from
# the login.metrics.gpuAccounting value.
- name: GPU_ACCT
value: "{{ .Values.login.metrics.gpuAccounting | ternary "true" "false" }}"
command:
{{ toYaml .Values.login.metrics.command | indent 12 }}
ports:
# Named port; the login Service and the ServiceMonitor refer to it as
# "metrics" rather than by number.
- containerPort: 8080
name: metrics
livenessProbe:
# The probe runs s6-svstat against the slurm-prometheus-exporter service
# directory. The remaining probe fields (timings, thresholds, ...) are
# appended from login.metrics.livenessProbe.
exec:
command:
- /command/s6-svstat
- /var/run/s6-rc/servicedirs/slurm-prometheus-exporter/
{{ toYaml .Values.login.metrics.livenessProbe | indent 12 }}
volumeMounts:
# NOTE(review): slurm-conf is the only mount here that is writable
# (readOnly: false) - confirm the exporter really needs to write to
# /etc/slurm.
- name: slurm-conf
mountPath: /etc/slurm
readOnly: false
- name: slurm-spank
mountPath: /etc/slurm/plugstack.conf.d
readOnly: true
- name: sssd
mountPath: /secrets/sssd
readOnly: true
- name: munge
mountPath: /secrets/munge
readOnly: true
# /tmp, /run and /var/log are each backed by a distinct subPath of the
# "tmp" volume, under a metrics/ prefix.
- mountPath: /tmp
name: tmp
subPath: metrics/tmp
- mountPath: /run
name: tmp
subPath: metrics/run
- mountPath: /var/log
name: tmp
subPath: metrics/log
# Extra mounts from login.metrics.volumeMounts are appended when provided.
{{- if .Values.login.metrics.volumeMounts }}
{{ toYaml .Values.login.metrics.volumeMounts | indent 12 }}
{{- end }}
resources:
{{ toYaml .Values.login.metrics.resources | indent 12 }}
{{- end }}
volumes:
- name: tmp
emptyDir:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ apiVersion: 'k8s.cni.cncf.io/v1'
kind: NetworkAttachmentDefinition
metadata:
name: {{ template "slurm-cluster.login.name" . }}-net
namespace: {{ .Release.Namespace }}
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.login.name" . }}"
spec:
config: |
Expand Down
55 changes: 55 additions & 0 deletions helm/slurm-cluster/templates/slurm-login/service-monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{{- /*
ServiceMonitor (monitoring.coreos.com/v1) that scrapes the `metrics` port of
the Slurm login Service.

Rendered only when the login component, its metrics exporter and the monitor
are all enabled. The login.enabled check mirrors the gate on the metrics
container in the login Deployment, so no monitor is created for a component
that is not deployed.

Values read from .Values.login.metrics.monitor:
  additionalLabels   extra labels merged into the ServiceMonitor metadata
  jobLabel           Service label used as the job name
                     (default: app.kubernetes.io/name)
  selectorOverride   replaces the default matchLabels entirely
  scheme, basicAuth, bearerTokenFile, tlsConfig, proxyUrl, interval,
  scrapeTimeout, relabelings, metricRelabelings
                     optional endpoint settings, emitted only when set

Scalar values are quoted so that empty or number/boolean-looking inputs are
never retyped by the YAML parser.
*/ -}}
{{- if and .Values.login.enabled .Values.login.metrics.enabled .Values.login.metrics.monitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: "{{ template "slurm-cluster.name" . }}-metrics"
  labels:
    app: "{{ template "slurm-cluster.name" . }}-metrics"
    {{- with .Values.login.metrics.monitor.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  jobLabel: {{ default "app.kubernetes.io/name" .Values.login.metrics.monitor.jobLabel | quote }}
  selector:
    matchLabels:
      {{- if .Values.login.metrics.monitor.selectorOverride }}
      {{- toYaml .Values.login.metrics.monitor.selectorOverride | nindent 6 }}
      {{- else }}
      # Default selector: the labels carried by the login Service.
      app: "{{ template "slurm-cluster.login.name" . }}"
      app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" . }}"
      app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" . }}"
      app.kubernetes.io/component: login
      {{- end }}
  endpoints:
    - port: metrics
      path: /metrics
      # Omitted when unset instead of rendering a bare `scheme:` (null) key.
      {{- with .Values.login.metrics.monitor.scheme }}
      scheme: {{ . | quote }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.basicAuth }}
      basicAuth:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.bearerTokenFile }}
      bearerTokenFile: {{ . | quote }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.tlsConfig }}
      tlsConfig:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.proxyUrl }}
      proxyUrl: {{ . | quote }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.interval }}
      interval: {{ . | quote }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.scrapeTimeout }}
      scrapeTimeout: {{ . | quote }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.relabelings }}
      relabelings:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.login.metrics.monitor.metricRelabelings }}
      metricRelabelings:
        {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}
8 changes: 5 additions & 3 deletions helm/slurm-cluster/templates/slurm-login/service.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@ apiVersion: v1
kind: Service
metadata:
name: "{{ template "slurm-cluster.login.name" $ }}"
namespace: '{{ $.Release.Namespace }}'
labels:
release: {{ $.Release.Name | quote }}
chart: "{{ $.Chart.Name }}"
app: "{{ template "slurm-cluster.login.name" $ }}"
app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" $ }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" $ }}"
Expand Down Expand Up @@ -48,6 +45,11 @@ spec:
port: 6822
targetPort: http
{{- end }}
{{- if .Values.login.metrics.enabled }}
- name: metrics
port: 8080
targetPort: metrics
{{- end }}
selector:
app.kubernetes.io/name: "{{ template "slurm-cluster.login.name" $ }}"
app.kubernetes.io/instance: "{{ template "slurm-cluster.login.name" $ }}"
Expand Down
4 changes: 1 addition & 3 deletions helm/slurm-cluster/templates/slurm-spank-configmap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@ kind: ConfigMap
apiVersion: v1
metadata:
name: "{{ template "slurm-cluster.name" . }}-slurm-spank-config"
namespace: '{{ .Release.Namespace }}'
labels:
release: '{{ .Release.Name }}'
chart: '{{ .Chart.Name }}'
app: "{{ template "slurm-cluster.name" $ }}"
data:
spank.conf: |
optional /usr/lib64/slurm/spank_pyxis.so
Loading

0 comments on commit 3dcc4ab

Please sign in to comment.