Skip to content

Commit

Permalink
refactor(metrics): add podContainerReadyDuration and refine the logging
Browse files Browse the repository at this point in the history
  • Loading branch information
zyy17 committed Dec 3, 2024
1 parent 3b0b025 commit 1f2d734
Showing 1 changed file with 24 additions and 7 deletions.
31 changes: 24 additions & 7 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,16 @@ var (
[]string{"namespace", "resource", "pod", "container", "node", "role"},
)

// podContainerReadyDuration is a histogram, in seconds, of the time between a
// container reaching the Running state (State.Running.StartedAt) and the pod's
// Ready condition transition. One series is kept per
// namespace/resource/pod/container/node/role label combination.
podContainerReadyDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
	Name: metricName("pod_container_ready_duration_seconds"),
	// The help text names the interval in the direction it is measured:
	// container start first, pod readiness second.
	Help: "The duration from container started to pod ready.",

	// Exponential buckets from 1s to 10min.
	Buckets: prometheus.ExponentialBucketsRange(1, 600, 12),
},
	[]string{"namespace", "resource", "pod", "container", "node", "role"},
)

podImagePullingDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: metricName("pod_image_pulling_duration_milliseconds"),
Help: "The duration for pod image pulling.",
Expand Down Expand Up @@ -205,16 +215,23 @@ func (c *MetricsCollector) collectPodMetrics(ctx context.Context, clusterName st
podInitializingDuration.WithLabelValues(
pod.Namespace, clusterName, pod.Name, pod.Spec.NodeName, string(role),
).Observe(duration.Seconds())
klog.Infof("pod '%s/%s' initializing duration: '%v'", pod.Namespace, pod.Name, duration)
klog.Infof("pod '%s/%s' from scheduled to initialized duration: '%v'", pod.Namespace, pod.Name, duration)

// Collect container startup duration.
// The calculation is: pod.Status.ContainerStatuses[*].State.Running.StartedAt - pod.Status.Conditions[corev1.PodInitialized].LastTransitionTime.Time.
// Collect container startup and ready duration.
for _, container := range pod.Status.ContainerStatuses {
duration := container.State.Running.StartedAt.Time.Sub(*initializedTime)
// The calculation is: pod.Status.ContainerStatuses[*].State.Running.StartedAt - pod.Status.Conditions[corev1.PodInitialized].LastTransitionTime.Time.
startupDuration := container.State.Running.StartedAt.Time.Sub(*initializedTime)
podContainerStartupDuration.WithLabelValues(
pod.Namespace, clusterName, pod.Name, container.Name, pod.Spec.NodeName, string(role),
).Observe(duration.Seconds())
klog.Infof("pod '%s/%s' container '%s' startup duration: '%v'", pod.Namespace, pod.Name, container.Name, duration)
).Observe(startupDuration.Seconds())

// The calculation is: pod.Status.Conditions[corev1.PodReady].LastTransitionTime.Time - pod.Status.ContainerStatuses[*].State.Running.StartedAt.
readyDuration := readyTime.Sub(container.State.Running.StartedAt.Time)
podContainerReadyDuration.WithLabelValues(
pod.Namespace, clusterName, pod.Name, container.Name, pod.Spec.NodeName, string(role),
).Observe(readyDuration.Seconds())

klog.Infof("pod '%s/%s' container '%s' from initialized to running duration: '%v', from running to ready duration: '%v'", pod.Namespace, pod.Name, container.Name, startupDuration, readyDuration)
}

// Collect pod startup duration.
Expand All @@ -223,7 +240,7 @@ func (c *MetricsCollector) collectPodMetrics(ctx context.Context, clusterName st
podStartupDuration.WithLabelValues(
pod.Namespace, clusterName, pod.Name, pod.Spec.NodeName, string(role),
).Observe(duration.Seconds())
klog.Infof("pod '%s/%s' startup duration: '%v'", pod.Namespace, pod.Name, duration)
klog.Infof("pod '%s/%s' from created to ready duration: '%v'", pod.Namespace, pod.Name, duration)

if err := c.collectPodImagePullingDuration(ctx, clusterName, pod, role); err != nil {
return err
Expand Down

0 comments on commit 1f2d734

Please sign in to comment.