Skip to content

Commit

Permalink
Enable metrics via prometheus operator
Browse files Browse the repository at this point in the history
Expose metrics via prometheus.monitoring.coreos.com/v1

The exposed metrics are

| Metric        | Type | Meaning |
| --------------- | ---------------- | ---------------- |
|  `nfd_master_build_info`           | Gauge | Version from which nfd-master was built. |
|  `nfd_worker_build_info`           | Gauge | Version from which nfd-worker was built. |
|  `nfd_updated_nodes`           |  Gauge | Time taken to label a node |
|  `nfd_crd_processing_time`          |  Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` |  Gauge | Time taken to discover features on a node |

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
  • Loading branch information
ArangoGutierrez committed Jun 6, 2023
1 parent febebd4 commit 3bcd23f
Show file tree
Hide file tree
Showing 21 changed files with 421 additions and 12 deletions.
2 changes: 2 additions & 0 deletions cmd/nfd-master/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
"Enable NFD CRD API controller for processing NodeFeature and NodeFeatureRule objects.")
flagset.IntVar(&args.Port, "port", 8080,
"Port on which to listen for connections.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.BoolVar(&args.Prune, "prune", false,
"Prune all NFD related attributes from all nodes of the cluaster and exit.")
flagset.BoolVar(&args.VerifyNodeName, "verify-node-name", false,
Expand Down
2 changes: 2 additions & 0 deletions cmd/nfd-worker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs)
"Kubeconfig to use")
flagset.BoolVar(&args.Oneshot, "oneshot", false,
"Do not publish feature labels")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.StringVar(&args.Options, "options", "",
"Specify config options from command line. Config options are specified "+
"in the same format as in the config file (i.e. json or yaml). These options")
Expand Down
5 changes: 5 additions & 0 deletions deployment/base/master/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,8 @@ spec:
failureThreshold: 10
command:
- "nfd-master"
args:
- "-enable-nodefeature-api"
ports:
- name: metrics
containerPort: 8081
4 changes: 4 additions & 0 deletions deployment/base/worker-daemonset/worker-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,7 @@ spec:
- "nfd-worker"
args:
- "-server=nfd-master:8080"
- "-enable-nodefeature-api"
ports:
- name: metrics
containerPort: 8081
8 changes: 8 additions & 0 deletions deployment/components/prometheus/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

namespace: node-feature-discovery

resources:
- monitor.yaml
- role.yaml
20 changes: 20 additions & 0 deletions deployment/components/prometheus/monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: nfd-metrics
labels:
app: nfd
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
targetPort: 8081
scheme: http
namespaceSelector:
matchNames:
- node-feature-discovery
selector:
matchExpressions:
- {key: app, operator: Exists}
49 changes: 49 additions & 0 deletions deployment/components/prometheus/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- kind: ServiceAccount
name: prometheus-operator
namespace: monitoring
9 changes: 8 additions & 1 deletion deployment/helm/node-feature-discovery/templates/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,14 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
{{- if .Values.metrics.enable }}
- "-metrics={{ .Values.metrics.port }}"
{{- end }}
{{- if .Values.metrics.enable }}
ports:
- name: metrics
containerPort: {{ .Values.metrics.port }}
{{- end }}
volumeMounts:
{{- if .Values.tls.enable }}
- name: nfd-master-cert
Expand All @@ -139,7 +147,6 @@ spec:
items:
- key: nfd-master.conf
path: nfd-master.conf

{{- with .Values.master.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
73 changes: 73 additions & 0 deletions deployment/helm/node-feature-discovery/templates/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{{- if .Values.metrics.enable }}
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: nfd-metrics
labels:
app: nfd
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
targetPort: {{ .Values.metrics.port }}
scheme: http
namespaceSelector:
matchNames:
- node-feature-discovery
selector:
matchExpressions:
- {key: app, operator: Exists}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- kind: ServiceAccount
name: prometheus-operator
namespace: monitoring

{{- end }}
8 changes: 8 additions & 0 deletions deployment/helm/node-feature-discovery/templates/worker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
{{- if .Values.metrics.enable }}
- "-metrics={{ .Values.metrics.port }}"
{{- end }}
{{- if .Values.metrics.enable }}
ports:
- name: metrics
containerPort: {{ .Values.metrics.port }}
{{- end }}
volumeMounts:
- name: host-boot
mountPath: "/host-boot"
Expand Down
4 changes: 4 additions & 0 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -493,3 +493,7 @@ topologyGC:
tls:
enable: false
certManager: false

metrics:
enable: false
port: 8081
10 changes: 10 additions & 0 deletions deployment/overlays/prometheus/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: node-feature-discovery

resources:
- namespace.yaml

components:
- ../../components/prometheus
4 changes: 4 additions & 0 deletions deployment/overlays/prometheus/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: node-feature-discovery
2 changes: 2 additions & 0 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ We have introduced the following Chart parameters.
| `tls.enable` | bool | false | Specifies whether to use TLS for communications between components |
| `tls.certManager` | bool | false | If enabled, requires [cert-manager](https://cert-manager.io/docs/) to be installed and will automatically create the required TLS certificates |
| `enableNodeFeatureApi` | bool | false | Enable the [NodeFeature](../usage/custom-resources.md#nodefeature) CRD API for communicating node features. This will automatically disable the gRPC communication.
| `metrics.enable` | bool | false | Specifies whether to expose mtrics using prometheus |
| `metrics.port` | integer | 8081 | Port on which to expose metrics from components |

### Master pod parameters

Expand Down
23 changes: 23 additions & 0 deletions docs/deployment/kustomize.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ scenarios under
see [Master Worker Topologyupdater](#master-worker-topologyupdater) below
- [`topologyupdater`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/topologyupdater):
see [Topology Updater](#topologyupdater) below
- [`Metrics`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prometheus):
see [Metrics](#metrics) below
- [`prune`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prune):
clean up the cluster after uninstallation, see
[Removing feature labels](uninstallation.md#removing-feature-labels)
Expand Down Expand Up @@ -137,6 +139,27 @@ kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deplo

```

# Metrics

To Add metrics scraping for prometheus-operator,
To allow [prometheus-operator](https://github.com/prometheus-operator/prometheus-operator) to scrape metrics from node-feature-discovery,
run the following command:

```bash
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref={{ site.release }}
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/prometheus?ref={{ site.release }}
```

The exposed metrics are

| Metric | Type | Meaning |
| --------------- | ---------------- | ---------------- |
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built. |
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. |
| `nfd_updated_nodes` | Gauge | Time taken to label a node |
| `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` | Gauge | Time taken to discover features on a node |

## Uninstallation

Simplest way is to invoke `kubectl delete` on the overlay that was used for
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
github.com/onsi/ginkgo/v2 v2.9.1
github.com/onsi/gomega v1.27.4
github.com/opencontainers/runc v1.1.6
github.com/prometheus/client_golang v1.14.0
github.com/smartystreets/assertions v1.2.0
github.com/smartystreets/goconvey v1.6.4
github.com/stretchr/testify v1.8.1
Expand Down Expand Up @@ -129,7 +130,6 @@ require (
github.com/opencontainers/selinux v1.11.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
Expand Down Expand Up @@ -161,14 +161,14 @@ require (
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.19.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/crypto v0.1.0 // indirect
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
golang.org/x/sync v0.1.0 // indirect
golang.org/x/sys v0.7.0 // indirect
golang.org/x/term v0.7.0 // indirect
golang.org/x/text v0.9.0 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/tools v0.7.0 // indirect
google.golang.org/api v0.60.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
Expand Down
11 changes: 4 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/aws/aws-sdk-go v1.35.24/go.mod h1:tlPOdRjfxPBpNIwqDj61rmsnA85v9jc0Ps9+muhnW+k=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
Expand Down Expand Up @@ -714,14 +713,13 @@ go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 h1:+FNtrFTmVw0YZGpBGX56XDee33
go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A=
go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0=
go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE=
go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI=
go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60=
go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg=
golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
Expand Down Expand Up @@ -984,8 +982,9 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44=
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
Expand All @@ -1003,7 +1002,6 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
Expand Down Expand Up @@ -1235,7 +1233,6 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk=
Expand Down
Loading

0 comments on commit 3bcd23f

Please sign in to comment.