Skip to content
This repository has been archived by the owner on Nov 24, 2023. It is now read-only.

dm-master: fix DM-master can not exit when starting timeout #513

Merged
merged 3 commits into from
Mar 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions dm/master/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ const (
// startEtcd starts an embedded etcd server.
func startEtcd(etcdCfg *embed.Config,
gRPCSvr func(*grpc.Server),
httpHandles map[string]http.Handler) (*embed.Etcd, error) {
httpHandles map[string]http.Handler, startTimeout time.Duration) (*embed.Etcd, error) {
// attach extra gRPC and HTTP server
if gRPCSvr != nil {
etcdCfg.ServiceRegister = gRPCSvr
Expand All @@ -57,10 +57,17 @@ func startEtcd(etcdCfg *embed.Config,

select {
case <-e.Server.ReadyNotify():
case <-time.After(etcdStartTimeout):
case <-time.After(startTimeout):
// if fail to startup, the etcd server may be still blocking in
// https://github.com/etcd-io/etcd/blob/3cf2f69b5738fb702ba1a935590f36b52b18979b/embed/serve.go#L92
// then `e.Close` will block in
// https://github.com/etcd-io/etcd/blob/3cf2f69b5738fb702ba1a935590f36b52b18979b/embed/etcd.go#L377
// because `close(sctx.serversC)` has not been called in
// https://github.com/etcd-io/etcd/blob/3cf2f69b5738fb702ba1a935590f36b52b18979b/embed/serve.go#L200.
// so for `ReadyNotify` timeout, we choose to only call `e.Server.Stop()` now,
// and we should exit the DM-master process after returned with error from this function.
e.Server.Stop()
e.Close()
return nil, terror.ErrMasterStartEmbedEtcdFail.Generatef("start embed etcd timeout %v", etcdStartTimeout)
return nil, terror.ErrMasterStartEmbedEtcdFail.Generatef("start embed etcd timeout %v", startTimeout)
}
return e, nil
}
Expand Down
26 changes: 24 additions & 2 deletions dm/master/etcd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,28 @@ func (t *testEtcdSuite) SetUpSuite(c *check.C) {
log.InitLogger(&log.Config{})
}

func (t *testEtcdSuite) TestStartEtcdFail(c *check.C) {
cfgCluster := NewConfig()
cfgCluster.Name = "dm-master-1"
cfgCluster.DataDir = c.MkDir()
cfgCluster.MasterAddr = tempurl.Alloc()[len("http://"):]
cfgCluster.PeerUrls = tempurl.Alloc()
c.Assert(cfgCluster.adjust(), check.IsNil)

// add another non-existing member for bootstrapping.
cfgCluster.InitialCluster = fmt.Sprintf("%s=%s,%s=%s",
cfgCluster.Name, cfgCluster.AdvertisePeerUrls,
"dm-master-2", tempurl.Alloc())
c.Assert(cfgCluster.adjust(), check.IsNil)

// start an etcd cluster
cfgClusterEtcd, err := cfgCluster.genEmbedEtcdConfig()
c.Assert(err, check.IsNil)
e, err := startEtcd(cfgClusterEtcd, nil, nil, 3*time.Second)
c.Assert(terror.ErrMasterStartEmbedEtcdFail.Equal(err), check.IsTrue)
c.Assert(e, check.IsNil)
}

func (t *testEtcdSuite) TestPrepareJoinEtcd(c *check.C) {
cfgCluster := NewConfig() // used to start an etcd cluster
cfgCluster.Name = "dm-master-1"
Expand Down Expand Up @@ -103,7 +125,7 @@ func (t *testEtcdSuite) TestPrepareJoinEtcd(c *check.C) {
c.Assert(os.RemoveAll(memberDP), check.IsNil) // remove previous data

// start an etcd cluster
e1, err := startEtcd(cfgClusterEtcd, nil, nil)
e1, err := startEtcd(cfgClusterEtcd, nil, nil, etcdStartTimeout)
c.Assert(err, check.IsNil)
defer e1.Close()

Expand Down Expand Up @@ -156,7 +178,7 @@ func (t *testEtcdSuite) TestPrepareJoinEtcd(c *check.C) {
// start the joining etcd
cfgAfterEtcd, err := cfgAfter.genEmbedEtcdConfig()
c.Assert(err, check.IsNil)
e2, err := startEtcd(cfgAfterEtcd, nil, nil)
e2, err := startEtcd(cfgAfterEtcd, nil, nil, etcdStartTimeout)
c.Assert(err, check.IsNil)
defer e2.Close()

Expand Down
2 changes: 1 addition & 1 deletion dm/master/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ func (s *Server) Start(ctx context.Context) (err error) {
gRPCSvr := func(gs *grpc.Server) { pb.RegisterMasterServer(gs, s) }

// start embed etcd server, gRPC API server and HTTP (API, status and debug) server.
s.etcd, err = startEtcd(etcdCfg, gRPCSvr, userHandles)
s.etcd, err = startEtcd(etcdCfg, gRPCSvr, userHandles, etcdStartTimeout)
if err != nil {
return
}
Expand Down