diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/Dockerfile b/tutorials/gke-node-agent-metrics-cloud-monitoring/Dockerfile new file mode 100644 index 0000000000..454442e191 --- /dev/null +++ b/tutorials/gke-node-agent-metrics-cloud-monitoring/Dockerfile @@ -0,0 +1,27 @@ +# Base image for containerized monitoring agent +# BASE_IMAGE_TAG defaults to the floating "latest" tag; pass --build-arg BASE_IMAGE_TAG=<tag> to pin the base image. +ARG BASE_IMAGE_TAG=latest +FROM marketplace.gcr.io/google/debian9:${BASE_IMAGE_TAG} + +# Package installation below needs root. +USER root + +# Install prerequisites, download and run the agent installer, and clean up in one layer: +# files removed in a later RUN would still be stored in the earlier layers. +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + gnupg2 \ + && curl -fsSL -o /install-monitoring-agent.sh https://dl.google.com/cloudagents/install-monitoring-agent.sh \ + && bash /install-monitoring-agent.sh \ + && rm -f /install-monitoring-agent.sh \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*_* + +COPY collectd.conf /etc/collectd/collectd.conf +COPY run.sh /run.sh + +RUN ["chmod", "+x", "/run.sh"] + +# Exec form starts run.sh directly rather than through "/bin/sh -c". +CMD ["/run.sh"] diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/agent.yaml b/tutorials/gke-node-agent-metrics-cloud-monitoring/agent.yaml new file mode 100644 index 0000000000..720e66f423 --- /dev/null +++ b/tutorials/gke-node-agent-metrics-cloud-monitoring/agent.yaml @@ -0,0 +1,28 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: [IMAGE_NAME] + labels: + k8s-app: [IMAGE_NAME] +spec: + selector: + matchLabels: + name: [IMAGE_NAME] + template: + metadata: + labels: + name: [IMAGE_NAME] + spec: + containers: + - name: [IMAGE_NAME] + image: gcr.io/[PROJECT_ID]/[IMAGE_NAME] + securityContext: + privileged: true + volumeMounts: + - name: host + mountPath: /mnt/host + readOnly: true + volumes: + - name: host + hostPath: + path: / diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/cloudbuild.yaml b/tutorials/gke-node-agent-metrics-cloud-monitoring/cloudbuild.yaml new file mode 100644 index 0000000000..73be0b91df --- /dev/null +++ b/tutorials/gke-node-agent-metrics-cloud-monitoring/cloudbuild.yaml @@ -0,0 +1,6 @@ +steps: +- name: 'gcr.io/cloud-builders/docker' + args: ['build', '-t', 
'gcr.io/[PROJECT_ID]/[IMAGE_NAME]', '.'] +- name: 'gcr.io/cloud-builders/docker' + args: ['push', 'gcr.io/[PROJECT_ID]/[IMAGE_NAME]'] +images: ['gcr.io/[PROJECT_ID]/[IMAGE_NAME]'] diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/collectd.conf b/tutorials/gke-node-agent-metrics-cloud-monitoring/collectd.conf new file mode 100644 index 0000000000..58a59f7db3 --- /dev/null +++ b/tutorials/gke-node-agent-metrics-cloud-monitoring/collectd.conf @@ -0,0 +1,78 @@ +Interval 60 + +Hostname "" + +# The Stackdriver agent does not use fully qualified domain names. +FQDNLookup false + +# if you have other config, especially for plugins, you can drop them +# into this directory +Include "/etc/collectd/collectd.d/" + +LoadPlugin df + + FSType "devfs" + IgnoreSelected true + ReportByDevice true + ValuesPercentage true + + +LoadPlugin cpu + + ValuesPercentage true + ReportByCpu false + +LoadPlugin swap + + ValuesPercentage true + +LoadPlugin interface +LoadPlugin disk +LoadPlugin load +LoadPlugin memory + + ValuesPercentage true + +LoadPlugin processes +LoadPlugin tcpconns + + + ProcessMatch "all" ".*" + Detail "ps_cputime" + Detail "ps_disk_octets" + Detail "ps_rss" + Detail "ps_vm" + + + + # No config - collectd fails parsing configuration if tag is empty. 
+ + + + AllPortsSummary true + + +LoadPlugin match_regex +LoadPlugin target_set +LoadPlugin stackdriver_agent +LoadPlugin write_gcm +LoadPlugin write_log +LoadPlugin aggregation +LoadPlugin match_throttle_metadata_keys + + + Format JSON + + +PostCacheChain "PostCache" + + + + OKToThrottle true + + + Plugin "write_gcm" + Plugin "write_log" + + + diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/images/sd-agent-metrics.png b/tutorials/gke-node-agent-metrics-cloud-monitoring/images/sd-agent-metrics.png new file mode 100644 index 0000000000..2950433ba5 Binary files /dev/null and b/tutorials/gke-node-agent-metrics-cloud-monitoring/images/sd-agent-metrics.png differ diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/images/sd-explorer.png b/tutorials/gke-node-agent-metrics-cloud-monitoring/images/sd-explorer.png new file mode 100644 index 0000000000..d0d6db5e6b Binary files /dev/null and b/tutorials/gke-node-agent-metrics-cloud-monitoring/images/sd-explorer.png differ diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/index.md b/tutorials/gke-node-agent-metrics-cloud-monitoring/index.md new file mode 100644 index 0000000000..2a2e04b46d --- /dev/null +++ b/tutorials/gke-node-agent-metrics-cloud-monitoring/index.md @@ -0,0 +1,112 @@ +--- +title: Collect additional GKE node metrics using collectd with Cloud Monitoring +description: Learn how to deploy the Cloud Monitoring agent on GKE nodes to expose additional VM metrics on GKE nodes. +author: aaronsutton,echiugoog +tags: host metrics +date_published: 2020-08-07 +--- + +Only a few metrics are available by default on GKE nodes. You can deploy a Cloud Monitoring agent to expose additional metrics for added visibility into the +health of your GKE nodes. + +## Objectives + +Expose additional host metrics using the Cloud Monitoring agent on GKE nodes. 
+ +Host metrics available by default: + +* CPU usage +* Disk I/O +* Network traffic + +Metrics added with the Cloud Monitoring agent: + +* CPU load +* CPU steal +* Memory usage +* Swap usage +* Disk usage +* Open TCP connections +* Processes + +For details about the metrics exposed by the Cloud Monitoring agent, see [Agent metrics](https://cloud.google.com/monitoring/api/metrics_agent). + +Even more metrics can be added by customizing +[`collectd.conf`](https://github.com/GoogleCloudPlatform/community/blob/master/tutorials/gke-node-agent-metrics-cloud-monitoring/collectd.conf) to meet your +needs. + +## Before you begin + +1. Create a Google Cloud project and GKE cluster, as shown in [this quickstart tutorial](https://cloud.google.com/kubernetes-engine/docs/quickstart). +1. Install the [Google Cloud SDK](https://cloud.google.com/sdk/). +1. Clone this repository: + + git clone https://github.com/GoogleCloudPlatform/community.git + + The files for this tutorial are in the + [`/tutorials/gke-node-agent-metrics-cloud-monitoring`](https://github.com/GoogleCloudPlatform/community/blob/master/tutorials/gke-node-agent-metrics-cloud-monitoring) directory. + +## Build the container image + +1. Update `cloudbuild.yaml` by replacing the following values: + + * `[PROJECT_ID]` is your Google Cloud project ID. + * `[IMAGE_NAME]` is the name of the container image. + +1. Build the container image with Cloud Build: + + gcloud builds submit --config cloudbuild.yaml . + + When the build finishes, the image will be published to Container Registry. + +## Deploy the daemonset + +1. Update `agent.yaml` by replacing the following values: + + * `[PROJECT_ID]` is your Google Cloud project ID. + * `[IMAGE_NAME]` is the name of the container image that you used when building the container image. + +1. Deploy: + + kubectl apply -f agent.yaml + +1. 
Check that the daemonset deployed and is ready: + + kubectl get ds + + The output should be similar to the following, where [IMAGE_NAME] is the name of your container image: + + NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE + [IMAGE_NAME] 1 1 1 1 1 29s + +## (optional) Customize the Cloud Monitoring agent + +1. Edit `collectd.conf` to expose additional metrics. +1. Add any new dependencies required for metric collection. +1. Rebuild the container image and redeploy the daemonset. + +## Viewing the metrics + +After deploying the daemonset, the additional metrics should begin to flow to Cloud Monitoring automatically. To view the metrics, go to the +[**Monitoring**](https://console.cloud.google.com/monitoring) page in the Cloud Console. + +One way of examining metrics is using the [Metrics Explorer](https://console.cloud.google.com/monitoring/metrics-explorer). Because the new metrics being +collected are GKE node metrics, they are visible for the Compute Engine VM instance resource type with the metric names beginning with `agent.googleapis.com`: + +![Metrics explorer](https://storage.googleapis.com/gcp-community/tutorials/gke-node-agent-metrics-cloud-monitoring/sd-explorer.png) + +If you take a detailed look at the node itself within Cloud Monitoring, you can see the additional metrics graphed within the VM instance dashboard agent tab. +Go to the [**Dashboards**](https://console.cloud.google.com/monitoring/dashboards) page, and then click **VM Instances** and the instance you're interested in +viewing metrics for. + +![Monitoring agent metrics](https://storage.googleapis.com/gcp-community/tutorials/gke-node-agent-metrics-cloud-monitoring/sd-agent-metrics.png) + +## Cleanup + +1. Delete the daemonset: + + kubectl delete ds [IMAGE_NAME] + +1. 
Delete the cluster you created in the **Before you begin** section: + + gcloud container clusters delete [CLUSTER_NAME] diff --git a/tutorials/gke-node-agent-metrics-cloud-monitoring/run.sh b/tutorials/gke-node-agent-metrics-cloud-monitoring/run.sh new file mode 100644 index 0000000000..ca57824c84 --- /dev/null +++ b/tutorials/gke-node-agent-metrics-cloud-monitoring/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Entrypoint: substitutes the instance ID into the collectd configuration, then runs the agent with -f. + +configuration_file="/etc/collectd/collectd.conf" +# Ask the metadata server for this instance's ID; if the request fails the variable is left empty. +monitored_resource=$(curl --silent -f -H 'Metadata-Flavor: Google' http://169.254.169.254/computeMetadata/v1/instance/id 2>/dev/null) + +sed -i "s/%MONITORED_RESOURCE%/$monitored_resource/" "$configuration_file" + +# exec replaces this shell with the agent process, so signals sent to the script's PID reach collectd directly. +exec /opt/stackdriver/collectd/sbin/stackdriver-collectd -f -C "$configuration_file"