diff --git a/domain/BUILD.bazel b/domain/BUILD.bazel index 005c7323331fc..69566a3bd8271 100644 --- a/domain/BUILD.bazel +++ b/domain/BUILD.bazel @@ -38,6 +38,7 @@ go_library( "//domain/resourcegroup", "//errno", "//infoschema", + "//infoschema/metrics", "//infoschema/perfschema", "//keyspace", "//kv", diff --git a/domain/domain.go b/domain/domain.go index cd0bbf1cf9c29..d039313b896e2 100644 --- a/domain/domain.go +++ b/domain/domain.go @@ -48,6 +48,7 @@ import ( "github.com/pingcap/tidb/domain/resourcegroup" "github.com/pingcap/tidb/errno" "github.com/pingcap/tidb/infoschema" + infoschema_metrics "github.com/pingcap/tidb/infoschema/metrics" "github.com/pingcap/tidb/infoschema/perfschema" "github.com/pingcap/tidb/keyspace" "github.com/pingcap/tidb/kv" @@ -216,6 +217,10 @@ func (do *Domain) EtcdClient() *clientv3.Client { // 4. the changed table IDs if it is not full load // 5. an error if any func (do *Domain) loadInfoSchema(startTS uint64) (infoschema.InfoSchema, bool, int64, *transaction.RelatedSchemaChange, error) { + beginTime := time.Now() + defer func() { + infoschema_metrics.LoadSchemaDurationTotal.Observe(time.Since(beginTime).Seconds()) + }() snapshot := do.store.GetSnapshot(kv.NewVersion(startTS)) m := meta.NewSnapshotMeta(snapshot) neededSchemaVersion, err := m.GetSchemaVersionWithNonEmptyDiff() @@ -252,6 +257,7 @@ func (do *Domain) loadInfoSchema(startTS uint64) (infoschema.InfoSchema, bool, i if currentSchemaVersion != 0 && neededSchemaVersion > currentSchemaVersion && neededSchemaVersion-currentSchemaVersion < LoadSchemaDiffVersionGapThreshold { is, relatedChanges, err := do.tryLoadSchemaDiffs(m, currentSchemaVersion, neededSchemaVersion) if err == nil { + infoschema_metrics.LoadSchemaDurationLoadDiff.Observe(time.Since(startTime).Seconds()) do.infoCache.Insert(is, uint64(schemaTs)) logutil.BgLogger().Info("diff load InfoSchema success", zap.Int64("currentSchemaVersion", currentSchemaVersion), @@ -285,6 +291,7 @@ func (do *Domain) loadInfoSchema(startTS uint64) (infoschema.InfoSchema, bool, i if err != nil { return nil, false, currentSchemaVersion, nil, err } + infoschema_metrics.LoadSchemaDurationLoadAll.Observe(time.Since(startTime).Seconds()) logutil.BgLogger().Info("full load InfoSchema success", zap.Int64("currentSchemaVersion", currentSchemaVersion), zap.Int64("neededSchemaVersion", neededSchemaVersion), @@ -477,6 +484,7 @@ func (do *Domain) GetSnapshotInfoSchema(snapshotTS uint64) (infoschema.InfoSchem return is, nil } is, _, _, _, err := do.loadInfoSchema(snapshotTS) + infoschema_metrics.LoadSchemaCounterSnapshot.Inc() return is, err } @@ -578,7 +586,6 @@ func (do *Domain) Reload() error { is, hitCache, oldSchemaVersion, changes, err = do.loadInfoSchema(version) } } - metrics.LoadSchemaDuration.Observe(time.Since(startTime).Seconds()) if err != nil { metrics.LoadSchemaCounter.WithLabelValues("failed").Inc() return err diff --git a/infoschema/metrics/metrics.go b/infoschema/metrics/metrics.go index 994a28a228f44..6e21b8de198e3 100644 --- a/infoschema/metrics/metrics.go +++ b/infoschema/metrics/metrics.go @@ -28,6 +28,12 @@ var ( HitLatestCounter prometheus.Counter HitTSCounter prometheus.Counter HitVersionCounter prometheus.Counter + + LoadSchemaCounterSnapshot prometheus.Counter + + LoadSchemaDurationTotal prometheus.Observer + LoadSchemaDurationLoadDiff prometheus.Observer + LoadSchemaDurationLoadAll prometheus.Observer ) func init() { @@ -43,4 +49,10 @@ func InitMetricsVars() { HitLatestCounter = metrics.InfoCacheCounters.WithLabelValues("hit", "latest") HitTSCounter = metrics.InfoCacheCounters.WithLabelValues("hit", "ts") HitVersionCounter = metrics.InfoCacheCounters.WithLabelValues("hit", "version") + + LoadSchemaCounterSnapshot = metrics.LoadSchemaCounter.WithLabelValues("snapshot") + + LoadSchemaDurationTotal = metrics.LoadSchemaDuration.WithLabelValues("total") + LoadSchemaDurationLoadDiff = metrics.LoadSchemaDuration.WithLabelValues("load-diff") + LoadSchemaDurationLoadAll = metrics.LoadSchemaDuration.WithLabelValues("load-all") } diff --git a/metrics/domain.go b/metrics/domain.go index fe92653dcf96f..7a562710a8d87 100644 --- a/metrics/domain.go +++ b/metrics/domain.go @@ -24,7 +24,7 @@ var ( LoadSchemaCounter *prometheus.CounterVec // LoadSchemaDuration records the duration of load schema. - LoadSchemaDuration prometheus.Histogram + LoadSchemaDuration *prometheus.HistogramVec // InfoCacheCounters are the counters of get/hit. InfoCacheCounters *prometheus.CounterVec @@ -59,14 +59,14 @@ func InitDomainMetrics() { Help: "Counter of load schema", }, []string{LblType}) - LoadSchemaDuration = NewHistogram( + LoadSchemaDuration = NewHistogramVec( prometheus.HistogramOpts{ Namespace: "tidb", Subsystem: "domain", Name: "load_schema_duration_seconds", Help: "Bucketed histogram of processing time (s) in load schema.", Buckets: prometheus.ExponentialBuckets(0.001, 2, 20), // 1ms ~ 524s - }) + }, []string{LblAction}) InfoCacheCounters = NewCounterVec( prometheus.CounterOpts{ diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index 1481c4c26c554..f1b51a48048fa 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -12091,13 +12091,15 @@ "legend": { "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -12117,10 +12119,10 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tidb_domain_load_schema_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "expr": "histogram_quantile(0.99, sum(rate(tidb_domain_load_schema_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, action))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{instance}}", + "legendFormat": "{{action}}", "metric": "", "refId": "A", "step": 10 @@ -12130,7 +12132,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Load Schema Duration", + "title": "Load Schema Action Duration", "tooltip": { "msResolution": false, "shared": true, @@ -12302,13 +12304,15 @@ "legend": { "alignAsTable": true, "avg": false, - "current": false, - "max": false, + "current": true, + "max": true, "min": false, "rightSide": true, "show": true, + "sort": "current", + "sortDesc": true, "total": false, - "values": false + "values": true }, "lines": true, "linewidth": 1, @@ -12365,7 +12369,7 @@ { "format": "short", "label": null, - "logBase": 10, + "logBase": 1, "max": null, "min": null, "show": true @@ -12496,6 +12500,114 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "TiDB schema cache operations per second.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 314, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tidb_domain_infocache_counters{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (action,type)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{action}}-{{type}}", + "metric": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Schema Cache OPS", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null,