From a11471c65f8aa6f35c2b355c84f26234454323ba Mon Sep 17 00:00:00 2001 From: satoru Date: Thu, 9 May 2019 15:58:48 +0800 Subject: [PATCH 1/3] Add a metric to track the delay of the downstream --- drainer/metrics.go | 9 +++++++++ drainer/syncer.go | 2 ++ 2 files changed, 11 insertions(+) diff --git a/drainer/metrics.go b/drainer/metrics.go index 572345e96..da494c9fa 100644 --- a/drainer/metrics.go +++ b/drainer/metrics.go @@ -69,6 +69,14 @@ var ( Help: "save checkpoint tso of drainer.", }) + checkpointDelayGauge = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: "binlog", + Subsystem: "drainer", + Name: "checkpoint_delay_seconds", + Help: "How much the downstream checkpoint lag behind", + }) + executeHistogram = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: "binlog", @@ -123,6 +131,7 @@ func init() { registry.MustRegister(ddlJobsCounter) registry.MustRegister(errorCount) registry.MustRegister(checkpointTSOGauge) + registry.MustRegister(checkpointDelayGauge) registry.MustRegister(eventCounter) registry.MustRegister(executeHistogram) registry.MustRegister(binlogReachDurationHistogram) diff --git a/drainer/syncer.go b/drainer/syncer.go index 90fc55d12..10ff4c5b1 100644 --- a/drainer/syncer.go +++ b/drainer/syncer.go @@ -221,6 +221,8 @@ func (s *Syncer) handleSuccess(fakeBinlog chan *pb.Binlog, lastTS *int64) { lastSaveTS = ts eventCounter.WithLabelValues("savepoint").Add(1) } + delay := oracle.GetPhysical(time.Now()) - oracle.ExtractPhysical(uint64(ts)) + checkpointDelayGauge.Set(float64(delay) / 1e3) } } From 1f8851910ed2b5ac17d24dd54e363a2e32b7f3ad Mon Sep 17 00:00:00 2001 From: satoru Date: Mon, 20 May 2019 17:09:02 +0800 Subject: [PATCH 2/3] Use histogram instead --- drainer/metrics.go | 7 ++++--- drainer/syncer.go | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drainer/metrics.go b/drainer/metrics.go index da494c9fa..d55427d4b 100644 --- a/drainer/metrics.go +++ b/drainer/metrics.go @@ -69,12 +69,13 @@ var ( Help: "save checkpoint tso of drainer.", }) - checkpointDelayGauge = prometheus.NewGauge( - prometheus.GaugeOpts{ + checkpointDelayHistogram = prometheus.NewHistogram( + prometheus.HistogramOpts{ Namespace: "binlog", Subsystem: "drainer", Name: "checkpoint_delay_seconds", Help: "How much the downstream checkpoint lag behind", + Buckets: prometheus.ExponentialBuckets(0.00005, 2, 18), }) executeHistogram = prometheus.NewHistogram( @@ -131,7 +132,7 @@ func init() { registry.MustRegister(ddlJobsCounter) registry.MustRegister(errorCount) registry.MustRegister(checkpointTSOGauge) - registry.MustRegister(checkpointDelayGauge) + registry.MustRegister(checkpointDelayHistogram) registry.MustRegister(eventCounter) registry.MustRegister(executeHistogram) registry.MustRegister(binlogReachDurationHistogram) diff --git a/drainer/syncer.go b/drainer/syncer.go index 10ff4c5b1..2f6e7dfad 100644 --- a/drainer/syncer.go +++ b/drainer/syncer.go @@ -222,7 +222,7 @@ func (s *Syncer) handleSuccess(fakeBinlog chan *pb.Binlog, lastTS *int64) { eventCounter.WithLabelValues("savepoint").Add(1) } delay := oracle.GetPhysical(time.Now()) - oracle.ExtractPhysical(uint64(ts)) - checkpointDelayGauge.Set(float64(delay) / 1e3) + checkpointDelayHistogram.Observe(float64(delay) / 1e3) } } From a34ba812b63b9fcb0735decb1bc5f4551ac48c53 Mon Sep 17 00:00:00 2001 From: satoru Date: Thu, 23 May 2019 14:04:51 +0800 Subject: [PATCH 3/3] Use buckets suitable for drainer delay --- drainer/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drainer/metrics.go b/drainer/metrics.go index d55427d4b..971f36968 100644 --- a/drainer/metrics.go +++ b/drainer/metrics.go @@ -75,7 +75,7 @@ var ( Subsystem: "drainer", Name: "checkpoint_delay_seconds", Help: "How much the downstream checkpoint lag behind", - Buckets: prometheus.ExponentialBuckets(0.00005, 2, 18), + Buckets: prometheus.ExponentialBuckets(0.001, 2, 22), }) executeHistogram = prometheus.NewHistogram(