From 182a9c851466a00922cfc2ec7d16c768ba286b37 Mon Sep 17 00:00:00 2001 From: Yevgeniy Miretskiy Date: Tue, 8 Sep 2020 13:20:03 -0400 Subject: [PATCH] bulkio: Count corrupt schedules. Differentiate between "bad" schedules (schedules we could not execute) and corrupt ones (schedules we could not even parse). Release note: None Release justification: No-impact metrics change. --- pkg/jobs/job_scheduler.go | 6 ++++-- pkg/jobs/schedule_metrics.go | 18 ++++++++++++++---- pkg/ts/catalog/chart_catalog.go | 3 ++- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/pkg/jobs/job_scheduler.go b/pkg/jobs/job_scheduler.go index 1cd51fe4ac92..dc36a78e617a 100644 --- a/pkg/jobs/job_scheduler.go +++ b/pkg/jobs/job_scheduler.go @@ -115,6 +115,7 @@ const recheckRunningAfter = 1 * time.Minute type loopStats struct { rescheduleWait, rescheduleSkip, started int64 readyToRun, jobsRunning int64 + malformed int64 } func (s *loopStats) updateMetrics(m *SchedulerMetrics) { @@ -123,6 +124,7 @@ func (s *loopStats) updateMetrics(m *SchedulerMetrics) { m.NumRunning.Update(s.jobsRunning) m.RescheduleSkip.Update(s.rescheduleSkip) m.RescheduleWait.Update(s.rescheduleWait) + m.NumMalformedSchedules.Update(s.malformed) } func (s *jobScheduler) processSchedule( @@ -290,7 +292,7 @@ func (s *jobScheduler) executeSchedules( for _, row := range rows { schedule, numRunning, err := s.unmarshalScheduledJob(row, cols) if err != nil { - s.metrics.NumBadSchedules.Inc(1) + stats.malformed++ log.Errorf(ctx, "error parsing schedule: %+v", row) continue } @@ -303,7 +305,7 @@ func (s *jobScheduler) executeSchedules( } // Failed to process schedule. 
- s.metrics.NumBadSchedules.Inc(1) + s.metrics.NumErrSchedules.Inc(1) log.Errorf(ctx, "error processing schedule %d: %+v", schedule.ScheduleID(), processErr) diff --git a/pkg/jobs/schedule_metrics.go b/pkg/jobs/schedule_metrics.go index 3504c3aeb878..f613e2b5296f 100644 --- a/pkg/jobs/schedule_metrics.go +++ b/pkg/jobs/schedule_metrics.go @@ -42,7 +42,10 @@ type SchedulerMetrics struct { // Number of schedules rescheduled due to WAIT policy. RescheduleWait *metric.Gauge // Number of schedules that could not be processed due to an error. - NumBadSchedules *metric.Counter + NumErrSchedules *metric.Gauge + // Number of schedules that are malformed: that is, the schedules + // we cannot parse, or even attempt to execute. + NumMalformedSchedules *metric.Gauge } // MakeSchedulerMetrics returns metrics for scheduled job daemon. @@ -83,9 +86,16 @@ func MakeSchedulerMetrics() SchedulerMetrics { Unit: metric.Unit_COUNT, }), - NumBadSchedules: metric.NewCounter(metric.Metadata{ - Name: "schedules.corrupt", - Help: "Number of corrupt/bad schedules", + NumErrSchedules: metric.NewGauge(metric.Metadata{ + Name: "schedules.error", + Help: "Number of schedules which did not execute successfully", + Measurement: "Schedules", + Unit: metric.Unit_COUNT, + }), + + NumMalformedSchedules: metric.NewGauge(metric.Metadata{ + Name: "schedules.malformed", + Help: "Number of malformed schedules", Measurement: "Schedules", Unit: metric.Unit_COUNT, }), diff --git a/pkg/ts/catalog/chart_catalog.go b/pkg/ts/catalog/chart_catalog.go index 1183cbceb04c..b7696210c0d9 100644 --- a/pkg/ts/catalog/chart_catalog.go +++ b/pkg/ts/catalog/chart_catalog.go @@ -2225,7 +2225,8 @@ var charts = []sectionDescription{ { Title: "Total", Metrics: []string{ - "schedules.corrupt", + "schedules.malformed", + "schedules.error", "schedules.total.started", "schedules.total.succeeded", "schedules.total.failed",