From 0b56f0ae533f04faefda404d62929dafbbae88ed Mon Sep 17 00:00:00 2001 From: Pawan Prakash Sharma <pawan@mayadata.io> Date: Thu, 9 Jan 2020 23:10:13 +0530 Subject: [PATCH] feat(alert): adding sample prometheus rules for ZFSPV (#32) Provide sample instructions on setting up prometheus via prometheus-operator and then configuring a sample rule to monitor the volume space utilization, and once available space is less than 10%, it will start firing the alert. ``` 100 * kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} < 10 ``` Signed-off-by: Pawan <pawan@mayadata.io> --- deploy/sample/alertmanager-service.yaml | 14 + deploy/sample/prometheus-alert.rules | 27 ++ deploy/sample/prometheus-service.yaml | 14 + docs/prometheus-monitoring.md | 396 ++++++++++++++++++++++++ 4 files changed, 451 insertions(+) create mode 100644 deploy/sample/alertmanager-service.yaml create mode 100644 deploy/sample/prometheus-alert.rules create mode 100644 deploy/sample/prometheus-service.yaml create mode 100644 docs/prometheus-monitoring.md diff --git a/deploy/sample/alertmanager-service.yaml b/deploy/sample/alertmanager-service.yaml new file mode 100644 index 000000000..a6afab6da --- /dev/null +++ b/deploy/sample/alertmanager-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager-service +spec: + type: NodePort + ports: + - name: web + nodePort: 30093 + port: 9093 + protocol: TCP + targetPort: web + selector: + alertmanager: prometheus-operator-alertmanager diff --git a/deploy/sample/prometheus-alert.rules b/deploy/sample/prometheus-alert.rules new file mode 100644 index 000000000..c84c7fda3 --- /dev/null +++ b/deploy/sample/prometheus-alert.rules @@ -0,0 +1,27 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app: prometheus-operator + chart: prometheus-operator-8.5.4 + heritage: Tiller + release: prometheus-operator + name: prometheus-operator-zfs-alertmanager.rules + namespace: default +spec: + groups: + - name: zfsalertmanager.rules + rules: + - alert: ZFSVolumeUsageCritical + annotations: + message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value + }}% free. + expr: | + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 10 + for: 1m + labels: + severity: critical diff --git a/deploy/sample/prometheus-service.yaml b/deploy/sample/prometheus-service.yaml new file mode 100644 index 000000000..d1b374814 --- /dev/null +++ b/deploy/sample/prometheus-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-service +spec: + type: NodePort + ports: + - name: web + nodePort: 30090 + port: 9090 + protocol: TCP + targetPort: web + selector: + prometheus: prometheus-operator-prometheus diff --git a/docs/prometheus-monitoring.md b/docs/prometheus-monitoring.md new file mode 100644 index 000000000..246077ca6 --- /dev/null +++ b/docs/prometheus-monitoring.md @@ -0,0 +1,396 @@ +### Setup helm + +This step uses helm the kubernetes package manager. If you not setup the helm then do the below the configuration, otherwise move to next step. + +``` +$ helm version +Client: &version.Version{SemVer:"v2.16.1", GitCommit:"bbdfe5e7803a12bbdf97e94cd847859890cf4050", GitTreeState:"clean"} +Server: &version.Version{SemVer:"v2.16.1", GitCommit:"bbdfe5e7803a12bbdf97e94cd847859890cf4050", GitTreeState:"clean"} + +$ helm init +Tiller (the Helm server-side component) has been installed into your Kubernetes Cluster. + +Please note: by default, Tiller is deployed with an insecure 'allow unauthenticated users' policy. +To prevent this, run `helm init` with the --tiller-tls-verify flag. +For more information on securing your installation see: https://docs.helm.sh/using_helm/#securing-your-helm-installation + +$ kubectl create serviceaccount --namespace kube-system tiller +serviceaccount/tiller created + +$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller +clusterrolebinding.rbac.authorization.k8s.io/tiller-cluster-rule created + +$ kubectl patch deploy --namespace kube-system tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}' +deployment.extensions/tiller-deploy patched +``` + +### Install Prometheus Operator + +Once the helm is ready and related titler pods is up and running , use the Prometheus chart from the helm repository + +``` +$ helm install stable/prometheus-operator --name prometheus-operator +NAME: prometheus-operator +LAST DEPLOYED: Thu Jan 9 12:50:03 2020 +NAMESPACE: default +STATUS: DEPLOYED + +RESOURCES: +==> v1/Alertmanager +NAME AGE +prometheus-operator-alertmanager 54s + +==> v1/ClusterRole +NAME AGE +prometheus-operator-alertmanager 54s +prometheus-operator-grafana-clusterrole 54s +prometheus-operator-operator 54s +prometheus-operator-operator-psp 54s +prometheus-operator-prometheus 54s +prometheus-operator-prometheus-psp 54s +psp-prometheus-operator-kube-state-metrics 54s +psp-prometheus-operator-prometheus-node-exporter 54s + +==> v1/ClusterRoleBinding +NAME AGE +prometheus-operator-alertmanager 54s +prometheus-operator-grafana-clusterrolebinding 54s +prometheus-operator-operator 54s +prometheus-operator-operator-psp 54s +prometheus-operator-prometheus 54s +prometheus-operator-prometheus-psp 54s +psp-prometheus-operator-kube-state-metrics 54s +psp-prometheus-operator-prometheus-node-exporter 54s + +==> v1/ConfigMap +NAME AGE +prometheus-operator-apiserver 54s +prometheus-operator-cluster-total 54s +prometheus-operator-controller-manager 54s +prometheus-operator-etcd 54s +prometheus-operator-grafana 54s +prometheus-operator-grafana-config-dashboards 54s +prometheus-operator-grafana-datasource 54s +prometheus-operator-grafana-test 54s +prometheus-operator-k8s-resources-cluster 54s +prometheus-operator-k8s-resources-namespace 54s +prometheus-operator-k8s-resources-node 54s +prometheus-operator-k8s-resources-pod 54s +prometheus-operator-k8s-resources-workload 54s +prometheus-operator-k8s-resources-workloads-namespace 54s +prometheus-operator-kubelet 54s +prometheus-operator-namespace-by-pod 54s +prometheus-operator-namespace-by-workload 54s +prometheus-operator-node-cluster-rsrc-use 54s +prometheus-operator-node-rsrc-use 54s +prometheus-operator-nodes 54s +prometheus-operator-persistentvolumesusage 54s +prometheus-operator-pod-total 54s +prometheus-operator-pods 54s +prometheus-operator-prometheus 54s +prometheus-operator-proxy 54s +prometheus-operator-scheduler 54s +prometheus-operator-statefulset 54s +prometheus-operator-workload-total 54s + +==> v1/DaemonSet +NAME AGE +prometheus-operator-prometheus-node-exporter 54s + +==> v1/Deployment +NAME AGE +prometheus-operator-grafana 54s +prometheus-operator-kube-state-metrics 54s +prometheus-operator-operator 54s + +==> v1/Pod(related) +NAME AGE +prometheus-operator-grafana-85bb5d49d-bffdg 54s +prometheus-operator-kube-state-metrics-5d46566c59-p8k6s 54s +prometheus-operator-operator-64844759f7-rpwws 54s +prometheus-operator-prometheus-node-exporter-p9rl8 54s + +==> v1/Prometheus +NAME AGE +prometheus-operator-prometheus 54s + +==> v1/PrometheusRule +NAME AGE +prometheus-operator-alertmanager.rules 54s +prometheus-operator-etcd 54s +prometheus-operator-general.rules 54s +prometheus-operator-k8s.rules 54s +prometheus-operator-kube-apiserver-error 54s +prometheus-operator-kube-apiserver.rules 54s +prometheus-operator-kube-prometheus-node-recording.rules 54s +prometheus-operator-kube-scheduler.rules 54s +prometheus-operator-kubernetes-absent 54s +prometheus-operator-kubernetes-apps 54s +prometheus-operator-kubernetes-resources 54s +prometheus-operator-kubernetes-storage 54s +prometheus-operator-kubernetes-system 54s +prometheus-operator-kubernetes-system-apiserver 54s +prometheus-operator-kubernetes-system-controller-manager 54s +prometheus-operator-kubernetes-system-kubelet 54s +prometheus-operator-kubernetes-system-scheduler 54s +prometheus-operator-node-exporter 54s +prometheus-operator-node-exporter.rules 54s +prometheus-operator-node-network 54s +prometheus-operator-node-time 54s +prometheus-operator-node.rules 54s +prometheus-operator-prometheus 54s +prometheus-operator-prometheus-operator 54s + +==> v1/Role +NAME AGE +prometheus-operator-grafana-test 54s + +==> v1/RoleBinding +NAME AGE +prometheus-operator-grafana-test 54s + +==> v1/Secret +NAME AGE +alertmanager-prometheus-operator-alertmanager 54s +prometheus-operator-grafana 54s + +==> v1/Service +NAME AGE +prometheus-operator-alertmanager 54s +prometheus-operator-coredns 54s +prometheus-operator-grafana 54s +prometheus-operator-kube-controller-manager 54s +prometheus-operator-kube-etcd 54s +prometheus-operator-kube-proxy 54s +prometheus-operator-kube-scheduler 54s +prometheus-operator-kube-state-metrics 54s +prometheus-operator-operator 54s +prometheus-operator-prometheus 54s +prometheus-operator-prometheus-node-exporter 54s + +==> v1/ServiceAccount +NAME AGE +prometheus-operator-alertmanager 54s +prometheus-operator-grafana 54s +prometheus-operator-grafana-test 54s +prometheus-operator-kube-state-metrics 54s +prometheus-operator-operator 54s +prometheus-operator-prometheus 54s +prometheus-operator-prometheus-node-exporter 54s + +==> v1/ServiceMonitor +NAME AGE +prometheus-operator-alertmanager 53s +prometheus-operator-apiserver 53s +prometheus-operator-coredns 53s +prometheus-operator-grafana 53s +prometheus-operator-kube-controller-manager 53s +prometheus-operator-kube-etcd 53s +prometheus-operator-kube-proxy 53s +prometheus-operator-kube-scheduler 53s +prometheus-operator-kube-state-metrics 53s +prometheus-operator-kubelet 53s +prometheus-operator-node-exporter 53s +prometheus-operator-operator 53s +prometheus-operator-prometheus 53s + +==> v1beta1/ClusterRole +NAME AGE +prometheus-operator-kube-state-metrics 54s + +==> v1beta1/ClusterRoleBinding +NAME AGE +prometheus-operator-kube-state-metrics 54s + +==> v1beta1/MutatingWebhookConfiguration +NAME AGE +prometheus-operator-admission 54s + +==> v1beta1/PodSecurityPolicy +NAME AGE +prometheus-operator-alertmanager 54s +prometheus-operator-grafana 54s +prometheus-operator-grafana-test 54s +prometheus-operator-kube-state-metrics 54s +prometheus-operator-operator 54s +prometheus-operator-prometheus 54s +prometheus-operator-prometheus-node-exporter 54s + +==> v1beta1/Role +NAME AGE +prometheus-operator-grafana 54s + +==> v1beta1/RoleBinding +NAME AGE +prometheus-operator-grafana 54s + +==> v1beta1/ValidatingWebhookConfiguration +NAME AGE +prometheus-operator-admission 53s + + +NOTES: +The Prometheus Operator has been installed. Check its status by running: + kubectl --namespace default get pods -l "release=prometheus-operator" + + Visit https://github.com/coreos/prometheus-operator for instructions on how + to create & configure Alertmanager and Prometheus instances using the Operator. +``` + +Lookup all the required pods are up and running + +``` +$ kubectl get pods -l "release=prometheus-operator" +NAME READY STATUS RESTARTS AGE +prometheus-operator-grafana-85bb5d49d-bffdg 2/2 Running 0 2m21s +prometheus-operator-operator-64844759f7-rpwws 2/2 Running 0 2m21s +prometheus-operator-prometheus-node-exporter-p9rl8 1/1 Running 0 2m21s +``` + +### Setup alert rule + +Please check the rules there in the system :- + +``` +$ kubectl get PrometheusRule +NAME AGE +prometheus-operator-alertmanager.rules 4m21s +prometheus-operator-etcd 4m21s +prometheus-operator-general.rules 4m21s +prometheus-operator-k8s.rules 4m21s +prometheus-operator-kube-apiserver-error 4m21s +prometheus-operator-kube-apiserver.rules 4m21s +prometheus-operator-kube-prometheus-node-recording.rules 4m21s +prometheus-operator-kube-scheduler.rules 4m21s +prometheus-operator-kubernetes-absent 4m21s +prometheus-operator-kubernetes-apps 4m21s +prometheus-operator-kubernetes-resources 4m21s +prometheus-operator-kubernetes-storage 4m21s +prometheus-operator-kubernetes-system 4m21s +prometheus-operator-kubernetes-system-apiserver 4m21s +prometheus-operator-kubernetes-system-controller-manager 4m21s +prometheus-operator-kubernetes-system-kubelet 4m21s +prometheus-operator-kubernetes-system-scheduler 4m21s +prometheus-operator-node-exporter 4m21s +prometheus-operator-node-exporter.rules 4m21s +prometheus-operator-node-network 4m21s +prometheus-operator-node-time 4m21s +prometheus-operator-node.rules 4m21s +prometheus-operator-prometheus 4m21s +prometheus-operator-prometheus-operator 4m21s +``` + +You can edit any of the default rule or setup the new rule to get the alerts. Here is the sample alert rule if available storage space is less than 10% then start throwing the alert :- + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app: prometheus-operator + chart: prometheus-operator-8.5.4 + heritage: Tiller + release: prometheus-operator + name: prometheus-operator-zfs-alertmanager.rules + namespace: default +spec: + groups: + - name: zfsalertmanager.rules + rules: + - alert: ZFSVolumeUsageCritical + annotations: + message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value + }}% free. + expr: | + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 10 + for: 1m + labels: + severity: critical +``` + +Apply the above yaml so that Prometheus can fire the alert when available space is less than 10% + + +### Check the Prometheus alert + +To be able to view the Prometheus web UI, expose it through a Service. A simple way to do this is to use a Service of type NodePort + +``` +$ cat prometheus-service.yaml + +apiVersion: v1 +kind: Service +metadata: + name: prometheus-service +spec: + type: NodePort + ports: + - name: web + nodePort: 30090 + port: 9090 + protocol: TCP + targetPort: web + selector: + prometheus: prometheus-operator-prometheus +``` + +apply the above yaml + +``` +$ kubectl apply -f prometheus-service.yaml +service/prometheus-service created +``` + +Now you can access the alert manager UI via "nodes-external-ip:30090" + +``` +$ kubectl get nodes -owide +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +gke-zfspv-pawan-default-pool-3e407350-xvzp Ready <none> 103m v1.15.4-gke.22 10.168.0.45 34.94.3.140 Ubuntu 18.04.3 LTS 5.0.0-1022-gke docker://19.3.2 +``` + +In this case we can access the alert manager via url http://34.94.3.140:30090/ + +### Check the Alert Manager + +To be able to view the Alert Manager web UI, expose it through a Service of type NodePort + +``` +$ cat alertmanager-service.yaml + +apiVersion: v1 +kind: Service +metadata: + name: alertmanager-service +spec: + type: NodePort + ports: + - name: web + nodePort: 30093 + port: 9093 + protocol: TCP + targetPort: web + selector: + alertmanager: prometheus-operator-alertmanager +``` + +apply the above yaml + +``` +$ kubectl apply -f alertmanager-service.yaml +service/alertmanager-service created +``` + +Now you can access the alert manager UI via "nodes-external-ip:30093" + +``` +$ kubectl get nodes -owide +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +gke-zfspv-pawan-default-pool-3e407350-xvzp Ready <none> 103m v1.15.4-gke.22 10.168.0.45 34.94.3.140 Ubuntu 18.04.3 LTS 5.0.0-1022-gke docker://19.3.2 +``` + +In this case we can access the alert manager via url http://34.94.3.140:30093/