From 89ab706f91d8d7615f0e34e6de88324d92bffeb8 Mon Sep 17 00:00:00 2001 From: Allen Zhong Date: Thu, 5 Aug 2021 12:35:13 +0800 Subject: [PATCH 1/4] ignore exporters when check data_dir overlap (#1510) --- pkg/cluster/spec/validate.go | 10 +++++++++- pkg/cluster/spec/validate_test.go | 26 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/pkg/cluster/spec/validate.go b/pkg/cluster/spec/validate.go index 5e6df41f78..0679eb70f4 100644 --- a/pkg/cluster/spec/validate.go +++ b/pkg/cluster/spec/validate.go @@ -227,7 +227,7 @@ func CheckClusterDirOverlap(entries []DirEntry) error { if d1.instance.IsImported() && d2.instance.IsImported() { continue } - // overlap is alloed in the case one side is imported and the other is monitor, + // overlap is allowed in the case one side is imported and the other is monitor, // we assume that the monitor is deployed with the first instance in that host, // it implies that the monitor is imported too. if (strings.HasPrefix(d1.dirKind, "monitor") && d2.instance.IsImported()) || @@ -235,6 +235,14 @@ func CheckClusterDirOverlap(entries []DirEntry) error { continue } + // overlap is allowed in the case one side is data dir of a monitor instance, + // as the *_exporter don't need data dir, the field is only kept for compatibility + // with legacy tidb-ansible deployments. 
+ if (strings.HasPrefix(d1.dirKind, "monitor data directory")) || + (strings.HasPrefix(d2.dirKind, "monitor data directory")) { + continue + } + properties := map[string]string{ "ThisDirKind": d1.dirKind, "ThisDir": d1.dir, diff --git a/pkg/cluster/spec/validate_test.go b/pkg/cluster/spec/validate_test.go index 15846fee70..ebd6f4e2c2 100644 --- a/pkg/cluster/spec/validate_test.go +++ b/pkg/cluster/spec/validate_test.go @@ -1045,6 +1045,32 @@ tikv_servers: status_port: 32180 log_dir: "/home/tidb6wu/tidb1-data/tikv-32160-log" data_dir: "/home/tidb6wu/tidb1-data/tikv-32160" +`, + ` +monitored: + node_exporter_port: 9100 + blackbox_exporter_port: 9115 + deploy_dir: /data/deploy/monitor-9100 + data_dir: /data/deploy/monitor-9100 + log_dir: /data/deploy/monitor-9100/log +pd_servers: + - host: n0 + name: pd0 + imported: true + deploy_dir: /data/deploy + data_dir: /data/deploy/data.pd + log_dir: /data/deploy/log + - host: n1 + name: pd1 + log_dir: "/data/deploy/pd-2379/log" + data_dir: "/data/pd-2379" + deploy_dir: "/data/deploy/pd-2379" +cdc_servers: + - host: n1 + port: 8300 + deploy_dir: /data/deploy/ticdc-8300 + data_dir: /data1/ticdc-8300 + log_dir: /data/deploy/ticdc-8300/log `, } for _, s := range goodTopos { From 9a1ad59928c00b88c8844ca6aa76ad139be7e6d9 Mon Sep 17 00:00:00 2001 From: Allen Zhong Date: Thu, 5 Aug 2021 14:37:13 +0800 Subject: [PATCH 2/4] bump version 1.5.4 (#1509) --- CHANGELOG.md | 12 ++++++++++++ pkg/version/version.go | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80b886a3ce..7692af76fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ TiUP Changelog +## [1.5.4] 2021-08-05 + +### Fixes + +- Allow editing of `learner_config` field in TiFlash spec ([#1494](https://github.com/pingcap/tiup/pull/1494), [@AstroProfundis](https://github.com/AstroProfundis)) +- Fix incorrect timeout for telemetry requests ([#1500](https://github.com/pingcap/tiup/pull/1500), 
[@AstroProfundis](https://github.com/AstroProfundis)) +- Ignore `data_dir` of monitor agents when checking for directory overlaps ([#1510](https://github.com/pingcap/tiup/pull/1510), [@AstroProfundis](https://github.com/AstroProfundis)) + +### Improvements + +- Distinguish cookie names of multiple grafana instances on the same host ([#1491](https://github.com/pingcap/tiup/pull/1491), [@AstroProfundis](https://github.com/AstroProfundis)) + ## [1.5.3] 2021-07-15 ### Fixes diff --git a/pkg/version/version.go b/pkg/version/version.go index 1eeb79bf95..4a4c494a92 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -23,7 +23,7 @@ var ( // TiUPVerMinor is the minor version of TiUP TiUPVerMinor = 5 // TiUPVerPatch is the patch version of TiUP - TiUPVerPatch = 3 + TiUPVerPatch = 4 // TiUPVerName is an alternative name of the version TiUPVerName = "tiup" // GitHash is the current git commit hash From 8195da32895bbd0c7ac20766242e50206e9866df Mon Sep 17 00:00:00 2001 From: Allen Zhong Date: Tue, 10 Aug 2021 15:29:15 +0800 Subject: [PATCH 3/4] cluster: skip updating topology when reload with --skip-restart (#1513) --- pkg/cluster/manager/reload.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/cluster/manager/reload.go b/pkg/cluster/manager/reload.go index ba92170664..25ef1e65cf 100644 --- a/pkg/cluster/manager/reload.go +++ b/pkg/cluster/manager/reload.go @@ -103,7 +103,7 @@ func (m *Manager) Reload(name string, gOpt operator.Options, skipRestart, skipCo if err != nil { return err } - if topo.Type() == spec.TopoTypeTiDB { + if topo.Type() == spec.TopoTypeTiDB && !skipRestart { b.UpdateTopology( name, m.specManager.Path(name), @@ -117,11 +117,11 @@ func (m *Manager) Reload(name string, gOpt operator.Options, skipRestart, skipCo b.ParallelStep("+ Refresh monitor configs", gOpt.Force, monitorConfigTasks...) 
} - tlsCfg, err := topo.TLSConfig(m.specManager.Path(name, spec.TLSCertKeyDir)) - if err != nil { - return err - } if !skipRestart { + tlsCfg, err := topo.TLSConfig(m.specManager.Path(name, spec.TLSCertKeyDir)) + if err != nil { + return err + } b.Func("UpgradeCluster", func(ctx context.Context) error { return operator.Upgrade(ctx, topo, gOpt, tlsCfg) }) From 21446bede6a2d5a838c395eb45841fbaa02cef79 Mon Sep 17 00:00:00 2001 From: Allen Zhong Date: Tue, 10 Aug 2021 15:55:15 +0800 Subject: [PATCH 4/4] cluster: add ability to ignore monitor agent for instances (#1492) --- components/dm/command/scale_in.go | 9 ++- components/dm/spec/logic.go | 1 + components/dm/spec/topology_dm.go | 28 ++++++--- pkg/cluster/manager/builder.go | 18 ++++++ pkg/cluster/manager/cleanup.go | 9 +++ pkg/cluster/manager/deploy.go | 7 +++ pkg/cluster/manager/reload.go | 8 +++ pkg/cluster/manager/upgrade.go | 10 ++++ pkg/cluster/operation/action.go | 95 ++++++++++++++++++++++++------- pkg/cluster/operation/destroy.go | 15 +++-- pkg/cluster/spec/alertmanager.go | 6 ++ pkg/cluster/spec/cdc.go | 6 ++ pkg/cluster/spec/drainer.go | 6 ++ pkg/cluster/spec/grafana.go | 6 ++ pkg/cluster/spec/pd.go | 6 ++ pkg/cluster/spec/prometheus.go | 6 ++ pkg/cluster/spec/pump.go | 6 ++ pkg/cluster/spec/spec.go | 1 + pkg/cluster/spec/tidb.go | 6 ++ pkg/cluster/spec/tiflash.go | 6 ++ pkg/cluster/spec/tikv.go | 6 ++ pkg/cluster/spec/tispark.go | 60 +++++++++++-------- pkg/cluster/spec/validate.go | 71 ++++++++++++++++++++++- pkg/cluster/spec/validate_test.go | 52 +++++++++++++++++ tests/tiup-dm/test_cmd.sh | 6 +- 25 files changed, 390 insertions(+), 60 deletions(-) diff --git a/components/dm/command/scale_in.go b/components/dm/command/scale_in.go index 59b639df4c..f412ee925d 100644 --- a/components/dm/command/scale_in.go +++ b/components/dm/command/scale_in.go @@ -128,6 +128,13 @@ func ScaleInDMCluster( dmMasterClient = api.NewDMMasterClient(dmMasterEndpoint, 10*time.Second, nil) + noAgentHosts := set.NewStringSet() + 
topo.IterInstance(func(inst dm.Instance) { + if inst.IgnoreMonitorAgent() { + noAgentHosts.Insert(inst.GetHost()) + } + }) + // Delete member from cluster for _, component := range topo.ComponentsByStartOrder() { for _, instance := range component.Instances() { @@ -135,7 +142,7 @@ func ScaleInDMCluster( continue } - if err := operator.StopComponent(ctx, []dm.Instance{instance}, options.OptTimeout); err != nil { + if err := operator.StopComponent(ctx, []dm.Instance{instance}, noAgentHosts, options.OptTimeout); err != nil { return errors.Annotatef(err, "failed to stop %s", component.Name()) } diff --git a/components/dm/spec/logic.go b/components/dm/spec/logic.go index daee4c31a6..f54b71da70 100644 --- a/components/dm/spec/logic.go +++ b/components/dm/spec/logic.go @@ -45,6 +45,7 @@ type ( SSH() (string, int) GetMainPort() int IsImported() bool + IgnoreMonitorAgent() bool } ) diff --git a/components/dm/spec/topology_dm.go b/components/dm/spec/topology_dm.go index 2a5f0b5851..ba2527e538 100644 --- a/components/dm/spec/topology_dm.go +++ b/components/dm/spec/topology_dm.go @@ -118,10 +118,11 @@ func AllDMComponentNames() (roles []string) { // MasterSpec represents the Master topology specification in topology.yaml type MasterSpec struct { - Host string `yaml:"host"` - SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` - Imported bool `yaml:"imported,omitempty"` - Patched bool `yaml:"patched,omitempty"` + Host string `yaml:"host"` + SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` + Imported bool `yaml:"imported,omitempty"` + Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` // Use Name to get the name with a default value if it's empty. 
Name string `yaml:"name,omitempty"` Port int `yaml:"port,omitempty" default:"8261"` @@ -178,12 +179,18 @@ func (s *MasterSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *MasterSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // WorkerSpec represents the Master topology specification in topology.yaml type WorkerSpec struct { - Host string `yaml:"host"` - SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` - Imported bool `yaml:"imported,omitempty"` - Patched bool `yaml:"patched,omitempty"` + Host string `yaml:"host"` + SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` + Imported bool `yaml:"imported,omitempty"` + Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` // Use Name to get the name with a default value if it's empty. Name string `yaml:"name,omitempty"` Port int `yaml:"port,omitempty" default:"8262"` @@ -233,6 +240,11 @@ func (s *WorkerSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *WorkerSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // UnmarshalYAML sets default values when unmarshaling the topology file func (s *Specification) UnmarshalYAML(unmarshal func(interface{}) error) error { type topology Specification diff --git a/pkg/cluster/manager/builder.go b/pkg/cluster/manager/builder.go index 5c02104dd9..98d3adbc04 100644 --- a/pkg/cluster/manager/builder.go +++ b/pkg/cluster/manager/builder.go @@ -258,6 +258,7 @@ func buildScaleOutTask( } hasImported := false + noAgentHosts := set.NewStringSet() mergedTopo.IterInstance(func(inst spec.Instance) { deployDir := spec.Abs(base.User, inst.DeployDir()) @@ -278,6 +279,11 @@ func buildScaleOutTask( hasImported = true } + // add the instance to ignore list if it marks itself as ignore_exporter + if inst.IgnoreMonitorAgent() { 
+ noAgentHosts.Insert(inst.GetHost()) + } + // Refresh all configuration t := tb.InitConfig(name, base.Version, @@ -307,6 +313,7 @@ func buildScaleOutTask( m, name, uninitializedHosts, + noAgentHosts, topo.BaseTopo().GlobalOptions, topo.BaseTopo().MonitoredOptions, base.Version, @@ -370,6 +377,7 @@ func buildMonitoredDeployTask( m *Manager, name string, uniqueHosts map[string]hostInfo, // host -> ssh-port, os, arch + noAgentHosts set.StringSet, // hosts that do not deploy monitor agents globalOptions *spec.GlobalOptions, monitoredOptions *spec.MonitoredOptions, version string, @@ -386,6 +394,11 @@ func buildMonitoredDeployTask( version := m.bindVersion(comp, version) for host, info := range uniqueHosts { + // skip deploying monitoring agents if the instance is marked so + if noAgentHosts.Exist(host) { + continue + } + // populate unique comp-os-arch set key := fmt.Sprintf("%s-%s-%s", comp, info.os, info.arch) if found := uniqueCompOSArch.Exist(key); !found { @@ -493,6 +506,7 @@ func buildRefreshMonitoredConfigTasks( specManager *spec.SpecManager, name string, uniqueHosts map[string]hostInfo, // host -> ssh-port, os, arch + noAgentHosts set.StringSet, globalOptions spec.GlobalOptions, monitoredOptions *spec.MonitoredOptions, sshTimeout, exeTimeout uint64, @@ -507,6 +521,10 @@ func buildRefreshMonitoredConfigTasks( // monitoring agents for _, comp := range []string{spec.ComponentNodeExporter, spec.ComponentBlackboxExporter} { for host, info := range uniqueHosts { + if noAgentHosts.Exist(host) { + continue + } + deployDir := spec.Abs(globalOptions.User, monitoredOptions.DeployDir) // data dir would be empty for components which don't need it dataDir := monitoredOptions.DataDir diff --git a/pkg/cluster/manager/cleanup.go b/pkg/cluster/manager/cleanup.go index 2d9d2e785f..336b91beb0 100644 --- a/pkg/cluster/manager/cleanup.go +++ b/pkg/cluster/manager/cleanup.go @@ -58,6 +58,15 @@ func (m *Manager) CleanCluster(name string, gOpt operator.Options, cleanOpt oper 
retainDataNodes := set.NewStringSet(cleanOpt.RetainDataNodes...) for _, ins := range instances { + // not cleaning files of monitor agents if the instance does not have one + switch ins.ComponentName() { + case spec.ComponentNodeExporter, + spec.ComponentBlackboxExporter: + if ins.IgnoreMonitorAgent() { + continue + } + } + // Some data of instances will be retained dataRetained := retainDataRoles.Exist(ins.ComponentName()) || retainDataNodes.Exist(ins.ID()) || retainDataNodes.Exist(ins.GetHost()) diff --git a/pkg/cluster/manager/deploy.go b/pkg/cluster/manager/deploy.go index b63bf7bfc7..4c0c559443 100644 --- a/pkg/cluster/manager/deploy.go +++ b/pkg/cluster/manager/deploy.go @@ -177,6 +177,7 @@ func (m *Manager) Deploy( // Initialize environment uniqueHosts := make(map[string]hostInfo) // host -> ssh-port, os, arch + noAgentHosts := set.NewStringSet() globalOptions := base.GlobalOptions // generate CA and client cert for TLS enabled cluster @@ -212,6 +213,11 @@ func (m *Manager) Deploy( return // skip the host to avoid issues } + // add the instance to ignore list if it marks itself as ignore_exporter + if inst.IgnoreMonitorAgent() { + noAgentHosts.Insert(inst.GetHost()) + } + uniqueHosts[inst.GetHost()] = hostInfo{ ssh: inst.GetSSHPort(), os: inst.OS(), @@ -373,6 +379,7 @@ func (m *Manager) Deploy( m, name, uniqueHosts, + noAgentHosts, globalOptions, topo.GetMonitoredOptions(), clusterVersion, diff --git a/pkg/cluster/manager/reload.go b/pkg/cluster/manager/reload.go index 25ef1e65cf..779a518fa9 100644 --- a/pkg/cluster/manager/reload.go +++ b/pkg/cluster/manager/reload.go @@ -27,6 +27,7 @@ import ( operator "github.com/pingcap/tiup/pkg/cluster/operation" "github.com/pingcap/tiup/pkg/cluster/spec" "github.com/pingcap/tiup/pkg/logger/log" + "github.com/pingcap/tiup/pkg/set" "github.com/pingcap/tiup/pkg/tui" ) @@ -69,7 +70,13 @@ func (m *Manager) Reload(name string, gOpt operator.Options, skipRestart, skipCo base := metadata.GetBaseMeta() uniqueHosts := 
make(map[string]hostInfo) // host -> ssh-port, os, arch + noAgentHosts := set.NewStringSet() topo.IterInstance(func(inst spec.Instance) { + // add the instance to ignore list if it marks itself as ignore_exporter + if inst.IgnoreMonitorAgent() { + noAgentHosts.Insert(inst.GetHost()) + } + if _, found := uniqueHosts[inst.GetHost()]; !found { uniqueHosts[inst.GetHost()] = hostInfo{ ssh: inst.GetSSHPort(), @@ -84,6 +91,7 @@ func (m *Manager) Reload(name string, gOpt operator.Options, skipRestart, skipCo m.specManager, name, uniqueHosts, + noAgentHosts, *topo.BaseTopo().GlobalOptions, topo.GetMonitoredOptions(), sshTimeout, diff --git a/pkg/cluster/manager/upgrade.go b/pkg/cluster/manager/upgrade.go index fef5ac3c16..2991024739 100644 --- a/pkg/cluster/manager/upgrade.go +++ b/pkg/cluster/manager/upgrade.go @@ -76,6 +76,16 @@ func (m *Manager) Upgrade(name string, clusterVersion string, opt operator.Optio for _, comp := range topo.ComponentsByUpdateOrder() { for _, inst := range comp.Instances() { compName := inst.ComponentName() + + // ignore monitor agents for instances marked as ignore_exporter + switch compName { + case spec.ComponentNodeExporter, + spec.ComponentBlackboxExporter: + if inst.IgnoreMonitorAgent() { + continue + } + } + version := m.bindVersion(inst.ComponentName(), clusterVersion) // Download component from repository diff --git a/pkg/cluster/operation/action.go b/pkg/cluster/operation/action.go index 7929818f15..2ddd3f6a80 100644 --- a/pkg/cluster/operation/action.go +++ b/pkg/cluster/operation/action.go @@ -59,21 +59,28 @@ func Enable( components := cluster.ComponentsByStartOrder() components = FilterComponent(components, roleFilter) monitoredOptions := cluster.GetMonitoredOptions() + noAgentHosts := set.NewStringSet() instCount := map[string]int{} cluster.IterInstance(func(inst spec.Instance) { - instCount[inst.GetHost()]++ + if inst.IgnoreMonitorAgent() { + noAgentHosts.Insert(inst.GetHost()) + } else { + instCount[inst.GetHost()]++ + } }) for _, 
comp := range components { insts := FilterInstance(comp.Instances(), nodeFilter) - err := EnableComponent(ctx, insts, options, isEnable) + err := EnableComponent(ctx, insts, noAgentHosts, options, isEnable) if err != nil { return errors.Annotatef(err, "failed to enable/disable %s", comp.Name()) } for _, inst := range insts { - instCount[inst.GetHost()]-- + if !inst.IgnoreMonitorAgent() { + instCount[inst.GetHost()]-- + } } } @@ -90,7 +97,7 @@ func Enable( hosts = append(hosts, host) } - return EnableMonitored(ctx, hosts, monitoredOptions, options.OptTimeout, isEnable) + return EnableMonitored(ctx, hosts, noAgentHosts, monitoredOptions, options.OptTimeout, isEnable) } // Start the cluster. @@ -106,15 +113,24 @@ func Start( components := cluster.ComponentsByStartOrder() components = FilterComponent(components, roleFilter) monitoredOptions := cluster.GetMonitoredOptions() + noAgentHosts := set.NewStringSet() + + cluster.IterInstance(func(inst spec.Instance) { + if inst.IgnoreMonitorAgent() { + noAgentHosts.Insert(inst.GetHost()) + } + }) for _, comp := range components { insts := FilterInstance(comp.Instances(), nodeFilter) - err := StartComponent(ctx, insts, options, tlsCfg) + err := StartComponent(ctx, insts, noAgentHosts, options, tlsCfg) if err != nil { return errors.Annotatef(err, "failed to start %s", comp.Name()) } for _, inst := range insts { - uniqueHosts.Insert(inst.GetHost()) + if !inst.IgnoreMonitorAgent() { + uniqueHosts.Insert(inst.GetHost()) + } } } @@ -126,7 +142,7 @@ func Start( for host := range uniqueHosts { hosts = append(hosts, host) } - return StartMonitored(ctx, hosts, monitoredOptions, options.OptTimeout) + return StartMonitored(ctx, hosts, noAgentHosts, monitoredOptions, options.OptTimeout) } // Stop the cluster. 
@@ -141,20 +157,27 @@ func Stop( components := cluster.ComponentsByStopOrder() components = FilterComponent(components, roleFilter) monitoredOptions := cluster.GetMonitoredOptions() + noAgentHosts := set.NewStringSet() instCount := map[string]int{} cluster.IterInstance(func(inst spec.Instance) { - instCount[inst.GetHost()]++ + if inst.IgnoreMonitorAgent() { + noAgentHosts.Insert(inst.GetHost()) + } else { + instCount[inst.GetHost()]++ + } }) for _, comp := range components { insts := FilterInstance(comp.Instances(), nodeFilter) - err := StopComponent(ctx, insts, options.OptTimeout) + err := StopComponent(ctx, insts, noAgentHosts, options.OptTimeout) if err != nil && !options.Force { return errors.Annotatef(err, "failed to stop %s", comp.Name()) } for _, inst := range insts { - instCount[inst.GetHost()]-- + if !inst.IgnoreMonitorAgent() { + instCount[inst.GetHost()]-- + } } } @@ -170,7 +193,7 @@ func Stop( hosts = append(hosts, host) } - if err := StopMonitored(ctx, hosts, monitoredOptions, options.OptTimeout); err != nil && !options.Force { + if err := StopMonitored(ctx, hosts, noAgentHosts, monitoredOptions, options.OptTimeout); err != nil && !options.Force { return err } return nil @@ -223,26 +246,26 @@ func Restart( } // StartMonitored start BlackboxExporter and NodeExporter -func StartMonitored(ctx context.Context, hosts []string, options *spec.MonitoredOptions, timeout uint64) error { - return systemctlMonitor(ctx, hosts, options, "start", timeout) +func StartMonitored(ctx context.Context, hosts []string, noAgentHosts set.StringSet, options *spec.MonitoredOptions, timeout uint64) error { + return systemctlMonitor(ctx, hosts, noAgentHosts, options, "start", timeout) } // StopMonitored stop BlackboxExporter and NodeExporter -func StopMonitored(ctx context.Context, hosts []string, options *spec.MonitoredOptions, timeout uint64) error { - return systemctlMonitor(ctx, hosts, options, "stop", timeout) +func StopMonitored(ctx context.Context, hosts []string, 
noAgentHosts set.StringSet, options *spec.MonitoredOptions, timeout uint64) error { + return systemctlMonitor(ctx, hosts, noAgentHosts, options, "stop", timeout) } // EnableMonitored enable/disable monitor service in a cluster -func EnableMonitored(ctx context.Context, hosts []string, options *spec.MonitoredOptions, timeout uint64, isEnable bool) error { +func EnableMonitored(ctx context.Context, hosts []string, noAgentHosts set.StringSet, options *spec.MonitoredOptions, timeout uint64, isEnable bool) error { action := "disable" if isEnable { action = "enable" } - return systemctlMonitor(ctx, hosts, options, action, timeout) + return systemctlMonitor(ctx, hosts, noAgentHosts, options, action, timeout) } -func systemctlMonitor(ctx context.Context, hosts []string, options *spec.MonitoredOptions, action string, timeout uint64) error { +func systemctlMonitor(ctx context.Context, hosts []string, noAgentHosts set.StringSet, options *spec.MonitoredOptions, action string, timeout uint64) error { ports := monitorPortMap(options) for _, comp := range []string{spec.ComponentNodeExporter, spec.ComponentBlackboxExporter} { log.Infof("%s component %s", actionPrevMsgs[action], comp) @@ -250,6 +273,10 @@ func systemctlMonitor(ctx context.Context, hosts []string, options *spec.Monitor errg, _ := errgroup.WithContext(ctx) for _, host := range hosts { host := host + if noAgentHosts.Exist(host) { + log.Debugf("Ignored %s component %s for %s", action, comp, host) + continue + } nctx := checkpoint.NewContext(ctx) errg.Go(func() error { log.Infof("\t%s instance %s", actionPrevMsgs[action], host) @@ -388,7 +415,7 @@ func systemctl(ctx context.Context, executor ctxt.Executor, service string, acti } // EnableComponent enable/disable the instances -func EnableComponent(ctx context.Context, instances []spec.Instance, options Options, isEnable bool) error { +func EnableComponent(ctx context.Context, instances []spec.Instance, noAgentHosts set.StringSet, options Options, isEnable bool) error { 
if len(instances) == 0 { return nil } @@ -405,6 +432,16 @@ func EnableComponent(ctx context.Context, instances []spec.Instance, options Opt for _, ins := range instances { ins := ins + // skip certain instances + switch name { + case spec.ComponentNodeExporter, + spec.ComponentBlackboxExporter: + if noAgentHosts.Exist(ins.GetHost()) { + log.Debugf("Ignored enabling/disabling %s for %s:%d", name, ins.GetHost(), ins.GetPort()) + continue + } + } + // the checkpoint part of context can't be shared between goroutines // since it's used to trace the stack, so we must create a new layer // of checkpoint context every time put it into a new goroutine. @@ -422,7 +459,7 @@ func EnableComponent(ctx context.Context, instances []spec.Instance, options Opt } // StartComponent start the instances. -func StartComponent(ctx context.Context, instances []spec.Instance, options Options, tlsCfg *tls.Config) error { +func StartComponent(ctx context.Context, instances []spec.Instance, noAgentHosts set.StringSet, options Options, tlsCfg *tls.Config) error { if len(instances) == 0 { return nil } @@ -442,6 +479,14 @@ func StartComponent(ctx context.Context, instances []spec.Instance, options Opti for _, ins := range instances { ins := ins + switch name { + case spec.ComponentNodeExporter, + spec.ComponentBlackboxExporter: + if noAgentHosts.Exist(ins.GetHost()) { + log.Debugf("Ignored starting %s for %s:%d", name, ins.GetHost(), ins.GetPort()) + continue + } + } // the checkpoint part of context can't be shared between goroutines // since it's used to trace the stack, so we must create a new layer @@ -484,7 +529,7 @@ func stopInstance(ctx context.Context, ins spec.Instance, timeout uint64) error } // StopComponent stop the instances. 
-func StopComponent(ctx context.Context, instances []spec.Instance, timeout uint64) error { +func StopComponent(ctx context.Context, instances []spec.Instance, noAgentHosts set.StringSet, timeout uint64) error { if len(instances) == 0 { return nil } @@ -496,6 +541,14 @@ func StopComponent(ctx context.Context, instances []spec.Instance, timeout uint6 for _, ins := range instances { ins := ins + switch name { + case spec.ComponentNodeExporter, + spec.ComponentBlackboxExporter: + if noAgentHosts.Exist(ins.GetHost()) { + log.Debugf("Ignored stopping %s for %s:%d", name, ins.GetHost(), ins.GetPort()) + continue + } + } // the checkpoint part of context can't be shared between goroutines // since it's used to trace the stack, so we must create a new layer diff --git a/pkg/cluster/operation/destroy.go b/pkg/cluster/operation/destroy.go index b9190dc766..3e81bd86a4 100644 --- a/pkg/cluster/operation/destroy.go +++ b/pkg/cluster/operation/destroy.go @@ -88,13 +88,20 @@ func Destroy( // StopAndDestroyInstance stop and destroy the instance, // if this instance is the host's last one, and the host has monitor deployed, -// we need to destroy the monitor, either +// we need to destroy the monitor, too func StopAndDestroyInstance(ctx context.Context, cluster spec.Topology, instance spec.Instance, options Options, destroyNode bool) error { ignoreErr := options.Force compName := instance.ComponentName() + noAgentHosts := set.NewStringSet() + + cluster.IterInstance(func(inst spec.Instance) { + if inst.IgnoreMonitorAgent() { + noAgentHosts.Insert(inst.GetHost()) + } + }) // just try to stop and destroy - if err := StopComponent(ctx, []spec.Instance{instance}, options.OptTimeout); err != nil { + if err := StopComponent(ctx, []spec.Instance{instance}, noAgentHosts, options.OptTimeout); err != nil { if !ignoreErr { return errors.Annotatef(err, "failed to stop %s", compName) } @@ -111,8 +118,8 @@ func StopAndDestroyInstance(ctx context.Context, cluster spec.Topology, instance // 
monitoredOptions for dm cluster is nil monitoredOptions := cluster.GetMonitoredOptions() - if monitoredOptions != nil { - if err := StopMonitored(ctx, []string{instance.GetHost()}, monitoredOptions, options.OptTimeout); err != nil { + if monitoredOptions != nil && !instance.IgnoreMonitorAgent() { + if err := StopMonitored(ctx, []string{instance.GetHost()}, noAgentHosts, monitoredOptions, options.OptTimeout); err != nil { if !ignoreErr { return errors.Annotatef(err, "failed to stop monitor") } diff --git a/pkg/cluster/spec/alertmanager.go b/pkg/cluster/spec/alertmanager.go index cadf533474..d2e8d63f21 100644 --- a/pkg/cluster/spec/alertmanager.go +++ b/pkg/cluster/spec/alertmanager.go @@ -32,6 +32,7 @@ type AlertmanagerSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` WebPort int `yaml:"web_port" default:"9093"` ClusterPort int `yaml:"cluster_port" default:"9094"` DeployDir string `yaml:"deploy_dir,omitempty"` @@ -64,6 +65,11 @@ func (s *AlertmanagerSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *AlertmanagerSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // AlertManagerComponent represents Alertmanager component. 
type AlertManagerComponent struct{ Topology } diff --git a/pkg/cluster/spec/cdc.go b/pkg/cluster/spec/cdc.go index 2b503a771c..24577176b8 100644 --- a/pkg/cluster/spec/cdc.go +++ b/pkg/cluster/spec/cdc.go @@ -33,6 +33,7 @@ type CDCSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` Port int `yaml:"port" default:"8300"` DeployDir string `yaml:"deploy_dir,omitempty"` DataDir string `yaml:"data_dir,omitempty"` @@ -67,6 +68,11 @@ func (s *CDCSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *CDCSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // CDCComponent represents CDC component. type CDCComponent struct{ Topology *Specification } diff --git a/pkg/cluster/spec/drainer.go b/pkg/cluster/spec/drainer.go index e3ba1e51c5..2798476fb9 100644 --- a/pkg/cluster/spec/drainer.go +++ b/pkg/cluster/spec/drainer.go @@ -33,6 +33,7 @@ type DrainerSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` Port int `yaml:"port" default:"8249"` DeployDir string `yaml:"deploy_dir,omitempty"` DataDir string `yaml:"data_dir,omitempty"` @@ -66,6 +67,11 @@ func (s *DrainerSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *DrainerSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // DrainerComponent represents Drainer component. 
type DrainerComponent struct{ Topology *Specification } diff --git a/pkg/cluster/spec/grafana.go b/pkg/cluster/spec/grafana.go index 3528085077..02a284411c 100644 --- a/pkg/cluster/spec/grafana.go +++ b/pkg/cluster/spec/grafana.go @@ -35,6 +35,7 @@ type GrafanaSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` Port int `yaml:"port" default:"3000"` DeployDir string `yaml:"deploy_dir,omitempty"` ResourceControl meta.ResourceControl `yaml:"resource_control,omitempty" validate:"resource_control:editable"` @@ -68,6 +69,11 @@ func (s *GrafanaSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *GrafanaSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // GrafanaComponent represents Grafana component. type GrafanaComponent struct{ Topology } diff --git a/pkg/cluster/spec/pd.go b/pkg/cluster/spec/pd.go index cd953cd814..84a6c74800 100644 --- a/pkg/cluster/spec/pd.go +++ b/pkg/cluster/spec/pd.go @@ -39,6 +39,7 @@ type PDSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` // Use Name to get the name with a default value if it's empty. Name string `yaml:"name"` ClientPort int `yaml:"client_port" default:"2379"` @@ -96,6 +97,11 @@ func (s *PDSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *PDSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // PDComponent represents PD component. 
type PDComponent struct{ Topology *Specification } diff --git a/pkg/cluster/spec/prometheus.go b/pkg/cluster/spec/prometheus.go index f573830db1..8ce1bfc3f6 100644 --- a/pkg/cluster/spec/prometheus.go +++ b/pkg/cluster/spec/prometheus.go @@ -37,6 +37,7 @@ type PrometheusSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` Port int `yaml:"port" default:"9090"` DeployDir string `yaml:"deploy_dir,omitempty"` DataDir string `yaml:"data_dir,omitempty"` @@ -83,6 +84,11 @@ func (s *PrometheusSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *PrometheusSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // MonitorComponent represents Monitor component. type MonitorComponent struct{ Topology } diff --git a/pkg/cluster/spec/pump.go b/pkg/cluster/spec/pump.go index 9bae4a5eb6..1ba15ec634 100644 --- a/pkg/cluster/spec/pump.go +++ b/pkg/cluster/spec/pump.go @@ -32,6 +32,7 @@ type PumpSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` Port int `yaml:"port" default:"8250"` DeployDir string `yaml:"deploy_dir,omitempty"` DataDir string `yaml:"data_dir,omitempty"` @@ -64,6 +65,11 @@ func (s *PumpSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent returns if the node does not have monitor agents available +func (s *PumpSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // PumpComponent represents Pump component. 
type PumpComponent struct{ Topology *Specification } diff --git a/pkg/cluster/spec/spec.go b/pkg/cluster/spec/spec.go index b61c25ddff..60f6842e47 100644 --- a/pkg/cluster/spec/spec.go +++ b/pkg/cluster/spec/spec.go @@ -57,6 +57,7 @@ type ( SSH() (string, int) GetMainPort() int IsImported() bool + IgnoreMonitorAgent() bool } // GlobalOptions represents the global options for all groups in topology diff --git a/pkg/cluster/spec/tidb.go b/pkg/cluster/spec/tidb.go index 5235e17196..0c073432c7 100644 --- a/pkg/cluster/spec/tidb.go +++ b/pkg/cluster/spec/tidb.go @@ -34,6 +34,7 @@ type TiDBSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` Port int `yaml:"port" default:"4000"` StatusPort int `yaml:"status_port" default:"10080"` DeployDir string `yaml:"deploy_dir,omitempty"` @@ -65,6 +66,11 @@ func (s *TiDBSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent reports whether the instance is marked with ignore_exporter, i.e. the node does not have monitor agents available +func (s *TiDBSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // TiDBComponent represents TiDB component.
type TiDBComponent struct{ Topology *Specification } diff --git a/pkg/cluster/spec/tiflash.go b/pkg/cluster/spec/tiflash.go index 2914c7c826..ec22cec36f 100644 --- a/pkg/cluster/spec/tiflash.go +++ b/pkg/cluster/spec/tiflash.go @@ -42,6 +42,7 @@ type TiFlashSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` TCPPort int `yaml:"tcp_port" default:"9000"` HTTPPort int `yaml:"http_port" default:"8123"` FlashServicePort int `yaml:"flash_service_port" default:"3930"` @@ -91,6 +92,11 @@ func (s *TiFlashSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent reports whether the instance is marked with ignore_exporter, i.e. the node does not have monitor agents available +func (s *TiFlashSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // key names for storage config const ( TiFlashStorageKeyMainDirs string = "storage.main.dir" diff --git a/pkg/cluster/spec/tikv.go b/pkg/cluster/spec/tikv.go index 0df7ef933f..108a1f2bb5 100644 --- a/pkg/cluster/spec/tikv.go +++ b/pkg/cluster/spec/tikv.go @@ -50,6 +50,7 @@ type TiKVSpec struct { SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` Imported bool `yaml:"imported,omitempty"` Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` Port int `yaml:"port" default:"20160"` StatusPort int `yaml:"status_port" default:"20180"` AdvertiseStatusAddr string `yaml:"advertise_status_addr,omitempty"` @@ -111,6 +112,11 @@ func (s *TiKVSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent reports whether the instance is marked with ignore_exporter, i.e. the node does not have monitor agents available +func (s *TiKVSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // Labels returns the labels of TiKV func (s *TiKVSpec) Labels() (map[string]string, error) { lbs := make(map[string]string) diff --git a/pkg/cluster/spec/tispark.go b/pkg/cluster/spec/tispark.go index
025983d9c2..3a41b08ca8 100644 --- a/pkg/cluster/spec/tispark.go +++ b/pkg/cluster/spec/tispark.go @@ -36,19 +36,20 @@ import ( // TiSparkMasterSpec is the topology specification for TiSpark master node type TiSparkMasterSpec struct { - Host string `yaml:"host"` - ListenHost string `yaml:"listen_host,omitempty"` - SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` - Imported bool `yaml:"imported,omitempty"` - Patched bool `yaml:"patched,omitempty"` - Port int `yaml:"port" default:"7077"` - WebPort int `yaml:"web_port" default:"8080"` - DeployDir string `yaml:"deploy_dir,omitempty"` - JavaHome string `yaml:"java_home,omitempty" validate:"java_home:editable"` - SparkConfigs map[string]interface{} `yaml:"spark_config,omitempty" validate:"spark_config:ignore"` - SparkEnvs map[string]string `yaml:"spark_env,omitempty" validate:"spark_env:ignore"` - Arch string `yaml:"arch,omitempty"` - OS string `yaml:"os,omitempty"` + Host string `yaml:"host"` + ListenHost string `yaml:"listen_host,omitempty"` + SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` + Imported bool `yaml:"imported,omitempty"` + Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` + Port int `yaml:"port" default:"7077"` + WebPort int `yaml:"web_port" default:"8080"` + DeployDir string `yaml:"deploy_dir,omitempty"` + JavaHome string `yaml:"java_home,omitempty" validate:"java_home:editable"` + SparkConfigs map[string]interface{} `yaml:"spark_config,omitempty" validate:"spark_config:ignore"` + SparkEnvs map[string]string `yaml:"spark_env,omitempty" validate:"spark_env:ignore"` + Arch string `yaml:"arch,omitempty"` + OS string `yaml:"os,omitempty"` } // Role returns the component role of the instance @@ -71,19 +72,25 @@ func (s *TiSparkMasterSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent reports whether the instance is marked with ignore_exporter, i.e. the node does not have monitor agents available +func (s *TiSparkMasterSpec) IgnoreMonitorAgent() bool { + return
s.IgnoreExporter +} + // TiSparkWorkerSpec is the topology specification for TiSpark slave nodes type TiSparkWorkerSpec struct { - Host string `yaml:"host"` - ListenHost string `yaml:"listen_host,omitempty"` - SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` - Imported bool `yaml:"imported,omitempty"` - Patched bool `yaml:"patched,omitempty"` - Port int `yaml:"port" default:"7078"` - WebPort int `yaml:"web_port" default:"8081"` - DeployDir string `yaml:"deploy_dir,omitempty"` - JavaHome string `yaml:"java_home,omitempty" validate:"java_home:editable"` - Arch string `yaml:"arch,omitempty"` - OS string `yaml:"os,omitempty"` + Host string `yaml:"host"` + ListenHost string `yaml:"listen_host,omitempty"` + SSHPort int `yaml:"ssh_port,omitempty" validate:"ssh_port:editable"` + Imported bool `yaml:"imported,omitempty"` + Patched bool `yaml:"patched,omitempty"` + IgnoreExporter bool `yaml:"ignore_exporter,omitempty"` + Port int `yaml:"port" default:"7078"` + WebPort int `yaml:"web_port" default:"8081"` + DeployDir string `yaml:"deploy_dir,omitempty"` + JavaHome string `yaml:"java_home,omitempty" validate:"java_home:editable"` + Arch string `yaml:"arch,omitempty"` + OS string `yaml:"os,omitempty"` } // Role returns the component role of the instance @@ -106,6 +113,11 @@ func (s *TiSparkWorkerSpec) IsImported() bool { return s.Imported } +// IgnoreMonitorAgent reports whether the instance is marked with ignore_exporter, i.e. the node does not have monitor agents available +func (s *TiSparkWorkerSpec) IgnoreMonitorAgent() bool { + return s.IgnoreExporter +} + // TiSparkMasterComponent represents TiSpark master component. type TiSparkMasterComponent struct{ Topology *Specification } diff --git a/pkg/cluster/spec/validate.go b/pkg/cluster/spec/validate.go index 0679eb70f4..2234d1bc5a 100644 --- a/pkg/cluster/spec/validate.go +++ b/pkg/cluster/spec/validate.go @@ -272,7 +272,7 @@ Please modify the topology file and try again.
return nil } -// CheckClusterPortConflict checks cluster dir conflict +// CheckClusterPortConflict checks whether ports in topo conflict with those of existing clusters func CheckClusterPortConflict(clusterList map[string]Metadata, clusterName string, topo Topology) error { type Entry struct { clusterName string @@ -361,6 +361,7 @@ func CheckClusterPortConflict(clusterList map[string]Metadata, clusterName strin } if p1.port == p2.port { + // build the conflict info properties := map[string]string{ "ThisPort": strconv.Itoa(p1.port), "ThisComponent": p1.componentName, @@ -370,6 +371,15 @@ func CheckClusterPortConflict(clusterList map[string]Metadata, clusterName strin "ExistComponent": p2.componentName, "ExistHost": p2.instance.GetHost(), } + // if either side is a monitor agent port and either instance is marked as + // ignore_exporter, log the conflict at debug level and skip it instead of failing + if (p1.componentName == RoleMonitor || p2.componentName == RoleMonitor) && + (p1.instance.IgnoreMonitorAgent() || p2.instance.IgnoreMonitorAgent()) { + zap.L().Debug("Ignored deploy port conflict", zap.Any("info", properties)) + continue + } + + // build error message zap.L().Info("Meet deploy port conflict", zap.Any("info", properties)) return errDeployPortConflict.New("Deploy port conflicts to an existing cluster").WithProperty(tui.SuggestionFromTemplate(` The port you specified in the topology file is: @@ -958,6 +968,64 @@ func (s *Specification) validateTiFlashConfigs() error { return nil } +// validateMonitorAgent rejects a topology in which instances on the same host / IP +// carry different ignore_exporter settings; imported instances are not checked +func (s *Specification) validateMonitorAgent() error { + type ( + conflict struct { + ignore bool + cfg string + } + ) + agentStats := map[string]conflict{} + topoSpec := reflect.ValueOf(s).Elem() + topoType := reflect.TypeOf(s).Elem() + + for i := 0; i < topoSpec.NumField(); i++ { + if isSkipField(topoSpec.Field(i)) { + continue + } + + compSpecs := topoSpec.Field(i) + for index := 0; index <
compSpecs.Len(); index++ { + compSpec := reflect.Indirect(compSpecs.Index(index)) + // skip nodes imported from TiDB-Ansible + if compSpec.Addr().Interface().(InstanceSpec).IsImported() { + continue + } + + // every instance must declare a host + host := compSpec.FieldByName("Host").String() + cfg := strings.Split(topoType.Field(i).Tag.Get("yaml"), ",")[0] // yaml tag name only, options dropped + if host == "" { + return errors.Errorf("`%s` contains empty host field", cfg) + } + + // compare ignore_exporter with the previous instance seen on this host + stat := conflict{} + if j, found := findField(compSpec, "IgnoreExporter"); found { + stat.ignore = compSpec.Field(j).Bool() + stat.cfg = cfg + } + + prev, exist := agentStats[host] + if exist { + if prev.ignore != stat.ignore { + return &meta.ValidateErr{ + Type: meta.TypeMismatch, + Target: "ignore_exporter", + LHS: fmt.Sprintf("%s:%v", prev.cfg, prev.ignore), + RHS: fmt.Sprintf("%s:%v", stat.cfg, stat.ignore), + Value: host, + } + } + } + agentStats[host] = stat + } + } + return nil +} + // Validate validates the topology specification and produce error if the // specification invalid (e.g: port conflicts or directory conflicts) func (s *Specification) Validate() error { @@ -971,6 +1039,7 @@ func (s *Specification) Validate() error { s.validatePDNames, s.validateTiSparkSpec, s.validateTiFlashConfigs, + s.validateMonitorAgent, } for _, v := range validators { diff --git a/pkg/cluster/spec/validate_test.go b/pkg/cluster/spec/validate_test.go index ebd6f4e2c2..8adc9fcd92 100644 --- a/pkg/cluster/spec/validate_test.go +++ b/pkg/cluster/spec/validate_test.go @@ -411,6 +411,30 @@ tispark_masters: c.Assert(err.Error(), Equals, "component tispark is not supported in TLS enabled cluster") } +func (s *metaSuiteTopo) TestMonitorAgentValidation(c *C) { + topo := Specification{} + err := yaml.Unmarshal([]byte(` +pd_servers: + - host: 172.16.5.138 + port: 1234 + - host: 172.16.5.139 + ignore_exporter: true +`), &topo) + c.Assert(err, IsNil) + + topo = Specification{} + err = yaml.Unmarshal([]byte(` +pd_servers: + - host:
172.16.5.138 + port: 1234 +tikv_servers: + - host: 172.16.5.138 + ignore_exporter: true +`), &topo) + c.Assert(err, NotNil) + c.Assert(err.Error(), Equals, "ignore_exporter mismatch for '172.16.5.138' between 'tikv_servers:true' and 'pd_servers:false'") +} + func (s *metaSuiteTopo) TestCrossClusterPortConflicts(c *C) { topo1 := Specification{} err := yaml.Unmarshal([]byte(` @@ -430,6 +454,10 @@ pd_servers: - host: 172.16.5.138 client_port: 2234 peer_port: 2235 + - host: 172.16.5.139 + client_port: 2236 + peer_port: 2237 + ignore_exporter: true `), &topo2) c.Assert(err, IsNil) @@ -471,6 +499,30 @@ It conflicts to a port in the existing cluster: Please change to use another port or another host.`) + // monitoring agent port conflict, but the new instance is marked as ignore_exporter + topo3 = Specification{} + err = yaml.Unmarshal([]byte(` +tidb_servers: +- host: 172.16.5.138 + ignore_exporter: true +`), &topo3) + c.Assert(err, IsNil) + err = CheckClusterPortConflict(clsList, "topo", &topo3) + c.Assert(err, IsNil) + + // monitoring agent port conflict, but the existing instance is marked as ignore_exporter + topo3 = Specification{} + err = yaml.Unmarshal([]byte(` +monitored: + node_exporter_port: 9102 + blackbox_exporter_port: 9116 +tidb_servers: +- host: 172.16.5.139 +`), &topo3) + c.Assert(err, IsNil) + err = CheckClusterPortConflict(clsList, "topo", &topo3) + c.Assert(err, IsNil) + // component port conflict topo4 := Specification{} err = yaml.Unmarshal([]byte(` diff --git a/tests/tiup-dm/test_cmd.sh b/tests/tiup-dm/test_cmd.sh index 3e861e6e42..6c896f661d 100755 --- a/tests/tiup-dm/test_cmd.sh +++ b/tests/tiup-dm/test_cmd.sh @@ -17,7 +17,11 @@ mkdir -p ~/.tiup/bin && cp -f ./root.json ~/.tiup/bin/ tiup-dm --yes deploy $name $version $topo -i ~/.ssh/id_rsa # topology doesn't contains the section `monitored` will not deploy node_exporter, blackbox_exporter -! 
tiup-dm exec $name -N $ipprefix.101 --command "ls /etc/systemd/system/{node,blackbox}_exporter-*.service" +if tiup-dm exec $name -N $ipprefix.101 --command "ls /etc/systemd/system/{node,blackbox}_exporter-*.service"; then + # the listing succeeded, so exporter unit files exist on the host: fail the test + echo "monitoring agents should not be deployed for dm cluster if \"monitored\" section is not set." + exit 1 +fi tiup-dm list | grep "$name" # debug https://github.com/pingcap/tiup/issues/666