Skip to content

Commit

Permalink
bulkio: Count corrupt schedules.
Browse files Browse the repository at this point in the history
Differentiate between "bad" schedules (schedules
we parsed but could not execute) and corrupt ones
(schedules we could not even parse).

Release Notes: None
Release Justification: No-impact metrics change.
  • Loading branch information
Yevgeniy Miretskiy committed Sep 18, 2020
1 parent 176c08c commit 182a9c8
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 7 deletions.
6 changes: 4 additions & 2 deletions pkg/jobs/job_scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ const recheckRunningAfter = 1 * time.Minute
type loopStats struct {
rescheduleWait, rescheduleSkip, started int64
readyToRun, jobsRunning int64
malformed int64
}

func (s *loopStats) updateMetrics(m *SchedulerMetrics) {
Expand All @@ -123,6 +124,7 @@ func (s *loopStats) updateMetrics(m *SchedulerMetrics) {
m.NumRunning.Update(s.jobsRunning)
m.RescheduleSkip.Update(s.rescheduleSkip)
m.RescheduleWait.Update(s.rescheduleWait)
m.NumMalformedSchedules.Update(s.malformed)
}

func (s *jobScheduler) processSchedule(
Expand Down Expand Up @@ -290,7 +292,7 @@ func (s *jobScheduler) executeSchedules(
for _, row := range rows {
schedule, numRunning, err := s.unmarshalScheduledJob(row, cols)
if err != nil {
s.metrics.NumBadSchedules.Inc(1)
stats.malformed++
log.Errorf(ctx, "error parsing schedule: %+v", row)
continue
}
Expand All @@ -303,7 +305,7 @@ func (s *jobScheduler) executeSchedules(
}

// Failed to process schedule.
s.metrics.NumBadSchedules.Inc(1)
s.metrics.NumErrSchedules.Inc(1)
log.Errorf(ctx,
"error processing schedule %d: %+v", schedule.ScheduleID(), processErr)

Expand Down
18 changes: 14 additions & 4 deletions pkg/jobs/schedule_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ type SchedulerMetrics struct {
// Number of schedules rescheduled due to WAIT policy.
RescheduleWait *metric.Gauge
// Number of schedules that could not be processed due to an error.
NumBadSchedules *metric.Counter
NumErrSchedules *metric.Gauge
// Number of malformed schedules: that is, schedules we cannot
// parse, and therefore cannot even attempt to execute.
NumMalformedSchedules *metric.Gauge
}

// MakeSchedulerMetrics returns metrics for scheduled job daemon.
Expand Down Expand Up @@ -83,9 +86,16 @@ func MakeSchedulerMetrics() SchedulerMetrics {
Unit: metric.Unit_COUNT,
}),

NumBadSchedules: metric.NewCounter(metric.Metadata{
Name: "schedules.corrupt",
Help: "Number of corrupt/bad schedules",
NumErrSchedules: metric.NewGauge(metric.Metadata{
Name: "schedules.error",
Help: "Number of schedules which did not execute successfully",
Measurement: "Schedules",
Unit: metric.Unit_COUNT,
}),

NumMalformedSchedules: metric.NewGauge(metric.Metadata{
Name: "schedules.malformed",
Help: "Number of malformed schedules",
Measurement: "Schedules",
Unit: metric.Unit_COUNT,
}),
Expand Down
3 changes: 2 additions & 1 deletion pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -2225,7 +2225,8 @@ var charts = []sectionDescription{
{
Title: "Total",
Metrics: []string{
"schedules.corrupt",
"schedules.malformed",
"schedules.error",
"schedules.total.started",
"schedules.total.succeeded",
"schedules.total.failed",
Expand Down

0 comments on commit 182a9c8

Please sign in to comment.