Skip to content

Commit

Permalink
Replace KSM with built-in metrics (#129)
Browse files Browse the repository at this point in the history
* Replace KSM with built-in metrics

* Fix black and PEP8 formatting
  • Loading branch information
mkjpryor authored Jun 10, 2024
1 parent e165f2a commit 06bde20
Show file tree
Hide file tree
Showing 8 changed files with 200 additions and 76 deletions.
12 changes: 2 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,7 @@ RUN groupadd --gid $APP_GID $APP_GROUP && \
# Don't buffer stdout and stderr as it breaks realtime logging
ENV PYTHONUNBUFFERED 1

# By default, run the operator using kopf
# By default, run the operator
USER $APP_UID
ENTRYPOINT ["tini", "-g", "--"]
CMD [ \
"kopf", \
"run", \
"--module", \
"azimuth_caas_operator.operator", \
"--all-namespaces", \
"--liveness", \
"http://0.0.0.0:8000/healthz" \
]
CMD ["python", "-m", "azimuth_caas_operator"]
23 changes: 23 additions & 0 deletions azimuth_caas_operator/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import asyncio

import kopf

from . import metrics


async def main():
    """
    Start the kopf operator tasks and the metrics HTTP server and run them together.
    """
    # Imported only for its side effect: it is required to pick up the operator handlers
    from . import operator  # noqa

    kopf.configure()
    operator_tasks = await kopf.spawn_tasks(
        clusterwide=True,
        liveness_endpoint="http://0.0.0.0:8000/healthz",
    )
    # Run the metrics server as one more task alongside the operator tasks
    metrics_task = asyncio.create_task(metrics.metrics_server())
    await kopf.run_tasks([*operator_tasks, metrics_task])


# Only start the operator when this module is executed as a program
# (e.g. ``python -m azimuth_caas_operator``), not when it is merely imported
if __name__ == "__main__":
    asyncio.run(main())
144 changes: 144 additions & 0 deletions azimuth_caas_operator/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import asyncio
import functools

from aiohttp import web

import easykube

from .models import registry


class Metric:
    """
    Base class for a metric computed from a collection of resource objects.

    Subclasses set the class attributes below and implement ``labels``; they may
    also override ``value``.
    """

    # The name of the metric
    name = None
    # The type of the metric - info or gauge
    type = "info"
    # The description of the metric
    description = None
    # The API version of the resource
    api_version = registry.API_VERSION
    # The resource that the metric is for
    resource = None

    def __init__(self):
        # The objects that records will be produced for, in insertion order
        self._objs = []

    def add_obj(self, obj):
        """Register an object that should contribute a record to the metric."""
        self._objs.append(obj)

    def labels(self, obj):
        """Return the labels for the given object; must be implemented by subclasses."""
        raise NotImplementedError

    def value(self, obj):
        """Return the value for the given object, which is 1 unless overridden."""
        return 1

    def records(self):
        """Yield a (labels, value) tuple for each object added to the metric."""
        for tracked in self._objs:
            record_labels = self.labels(tracked)
            record_value = self.value(tracked)
            yield record_labels, record_value


class ClusterTypePhase(Metric):
    """Reports the phase of each cluster type as a label on the metric."""

    name = "azimuth_caas_clustertypes_phase"
    description = "Cluster type phase"
    resource = "clustertypes"

    def labels(self, obj):
        """Build the labels for a single cluster type object."""
        type_name = obj.metadata.name
        type_version = obj.metadata["resourceVersion"]
        # Reported as "Unknown" when the object has no status or no phase key
        phase = obj.get("status", {}).get("phase", "Unknown")
        return dict(
            cluster_type_name=type_name,
            cluster_type_version=type_version,
            phase=phase,
        )


class ClusterPhase(Metric):
    """Reports the phase of each cluster as a label on the metric."""

    name = "azimuth_caas_clusters_phase"
    description = "Cluster phase"
    resource = "clusters"

    def labels(self, obj):
        """Build the labels for a single cluster object."""
        namespace = obj.metadata.namespace
        cluster_name = obj.metadata.name
        type_name = obj.spec["clusterTypeName"]
        type_version = obj.spec["clusterTypeVersion"]
        # Reported as "Unknown" when the object has no status or no phase key
        phase = obj.get("status", {}).get("phase", "Unknown")
        return dict(
            cluster_namespace=namespace,
            cluster_name=cluster_name,
            cluster_type_name=type_name,
            cluster_type_version=type_version,
            phase=phase,
        )


def escape(content):
    """
    Escape the given content for use in metric output.

    Backslashes, newlines and double quotes are replaced with their
    backslash-escaped forms. Non-string content (e.g. a number) is converted
    with ``str()`` first instead of raising ``AttributeError``.
    """
    # A single translate pass means inserted backslashes are never re-escaped
    table = str.maketrans({"\\": r"\\", "\n": r"\n", '"': r"\""})
    return str(content).translate(table)


def render_openmetrics(*metrics):
    """
    Serialise the given metrics using the OpenMetrics text format.

    Returns a ``(content_type, body)`` tuple where ``body`` is UTF-8 encoded bytes.
    """
    lines = []
    for metric in metrics:
        lines.append(f"# TYPE {metric.name} {metric.type}\n")
        if metric.description:
            lines.append(f"# HELP {metric.name} {escape(metric.description)}\n")

        for labels, value in metric.records():
            if not labels:
                # No labels means no braces at all
                lines.append(f"{metric.name} {value}\n")
                continue
            # Labels are written in sorted key order
            pairs = [f'{key}="{escape(val)}"' for key, val in sorted(labels.items())]
            lines.append(f"{metric.name}{{{','.join(pairs)}}} {value}\n")
    # Terminating marker, appended once after all metrics
    lines.append("# EOF\n")

    body = "".join(lines).encode("utf-8")
    return "application/openmetrics-text; version=1.0.0; charset=utf-8", body


async def metrics_handler(ekclient, request):
    """
    Produce metrics for the operator.

    Lists the cluster types and the clusters in all namespaces using the given
    client and returns them rendered as an OpenMetrics response. The ``request``
    argument is not used.
    """
    api = ekclient.api(registry.API_VERSION)

    type_metric = ClusterTypePhase()
    cluster_metric = ClusterPhase()

    # (resource name, metric to populate, keyword arguments for the list call)
    sources = (
        ("clustertypes", type_metric, {}),
        ("clusters", cluster_metric, {"all_namespaces": True}),
    )
    for resource_name, metric, list_kwargs in sources:
        resource = await api.resource(resource_name)
        async for obj in resource.list(**list_kwargs):
            metric.add_obj(obj)

    content_type, body = render_openmetrics(type_metric, cluster_metric)
    return web.Response(headers={"Content-Type": content_type}, body=body)


async def metrics_server():
    """
    Launch a lightweight HTTP server to serve the metrics endpoint.

    Serves ``/metrics`` on 0.0.0.0:8080 and then waits until the coroutine is
    cancelled, at which point the runner is cleaned up.
    """
    ekclient = easykube.Configuration.from_environment().async_client()

    app = web.Application()
    # Bind the Kubernetes client as the first argument of the handler
    app.add_routes([web.get("/metrics", functools.partial(metrics_handler, ekclient))])

    runner = web.AppRunner(app, handle_signals=False)
    await runner.setup()

    # The port is given as an integer, which is the type TCPSite documents for it
    site = web.TCPSite(runner, "0.0.0.0", 8080, shutdown_timeout=1.0)
    await site.start()

    # The event is never set, so this wait only ends when the task is cancelled
    try:
        await asyncio.Event().wait()
    finally:
        # Shield the cleanup so that it is not interrupted by the cancellation
        await asyncio.shield(runner.cleanup())
5 changes: 0 additions & 5 deletions charts/operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,3 @@ dependencies:
- name: ara
version: ">=0-0"
repository: file://../ara
- name: kube-state-metrics
repository: https://prometheus-community.github.io/helm-charts
version: 5.15.3
alias: metrics
condition: metrics.enabled
4 changes: 4 additions & 0 deletions charts/operator/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ spec:
value: {{ quote .Values.config.ansibleRunnerImage.repository }}
- name: ANSIBLE_RUNNER_IMAGE_TAG
value: {{ default .Chart.AppVersion .Values.config.ansibleRunnerImage.tag | quote }}
ports:
- name: metrics
containerPort: 8080
protocol: TCP
livenessProbe:
{{- toYaml .Values.livenessProbe | nindent 12 }}
startupProbe:
Expand Down
13 changes: 13 additions & 0 deletions charts/operator/templates/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Service exposing the operator's metrics port inside the cluster
apiVersion: v1
kind: Service
metadata:
name: {{ include "azimuth-caas-operator.fullname" . }}
labels: {{ include "azimuth-caas-operator.labels" . | nindent 4 }}
spec:
type: ClusterIP
ports:
- name: metrics
port: 8080
# Targets the container port by its name rather than by number
targetPort: metrics
protocol: TCP
selector: {{ include "azimuth-caas-operator.selectorLabels" . | nindent 4 }}
14 changes: 14 additions & 0 deletions charts/operator/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{{- if and .Values.metrics.enabled .Values.metrics.prometheus.monitor.enabled }}
# Only rendered when both metrics and the Prometheus monitor are enabled in the values
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "azimuth-caas-operator.fullname" . }}
labels: {{ include "azimuth-caas-operator.labels" . | nindent 4 }}
spec:
endpoints:
# Scrape the port named "metrics" on the selected Service
- honorLabels: true
port: metrics
jobLabel: app.kubernetes.io/name
selector:
matchLabels: {{ include "azimuth-caas-operator.selectorLabels" . | nindent 6 }}
{{- end }}
61 changes: 0 additions & 61 deletions charts/operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,64 +69,3 @@ metrics:
enabled: true
monitor:
enabled: true
honorLabels: true
# Disable all the default collectors
collectors: []
# Allow kube-state-metrics read-only access to our CRDs
rbac:
create: true
extraRules:
- apiGroups:
- caas.azimuth.stackhpc.com
resources:
- clusters
- clustertypes
verbs:
- list
- watch
- get
# Configure kube-state-metrics to report only on our custom resources
extraArgs:
- --custom-resource-state-only=true
customResourceState:
enabled: true
config:
kind: CustomResourceStateMetrics
spec:
resources:
- groupVersionKind:
group: caas.azimuth.stackhpc.com
version: v1alpha1
kind: Cluster
metricNamePrefix: azimuth_caas_clusters
labelsFromPath:
cluster_namespace: [metadata, namespace]
cluster_name: [metadata, name]
cluster_type_name: [spec, clusterTypeName]
cluster_type_version: [spec, clusterTypeVersion]
metrics:
- name: phase
help: "Cluster phase"
each:
type: Info
info:
labelsFromPath:
phase: [status, phase]

- groupVersionKind:
group: caas.azimuth.stackhpc.com
version: v1alpha1
kind: ClusterType
metricNamePrefix: azimuth_caas_clustertypes
labelsFromPath:
cluster_type_namespace: [metadata, namespace]
cluster_type_name: [metadata, name]
cluster_type_version: [metadata, resourceVersion]
metrics:
- name: phase
help: "Cluster type phase"
each:
type: Info
info:
labelsFromPath:
phase: [status, phase]

0 comments on commit 06bde20

Please sign in to comment.