diff --git a/locksmithctl/locksmithctl.go b/locksmithctl/locksmithctl.go index db60933..3eaa6ed 100644 --- a/locksmithctl/locksmithctl.go +++ b/locksmithctl/locksmithctl.go @@ -17,6 +17,7 @@ package main import ( "crypto/tls" "crypto/x509" + "errors" "flag" "fmt" "io/ioutil" @@ -25,6 +26,7 @@ import ( "os" "path" "strings" + "syscall" "text/tabwriter" "time" @@ -216,25 +218,39 @@ func getClient() (*lock.EtcdLockClient, error) { transport.TLSClientConfig = tlsconf } - cfg := client.Config{ - Endpoints: globalFlags.Endpoints, - Transport: transport, - Username: globalFlags.EtcdUsername, - Password: globalFlags.EtcdPassword, - } + // This loop is a hack to bring a kind of resilience in case of an unreachable endpoint. + // It has been shown in the CI (cl.locksmith.cluster) that the recent etcd/v2 upgrade has broken the resiliency + // of the endpoint. + // It can be safely removed once the `etcd` V3 upgrade is done. + // More details: https://github.com/kinvolk/coreos-overlay/pull/1161#issuecomment-891906580. + for _, ep := range globalFlags.Endpoints { + cfg := client.Config{ + Endpoints: []string{ep}, + Transport: transport, + Username: globalFlags.EtcdUsername, + Password: globalFlags.EtcdPassword, + } - ec, err := client.New(cfg) - if err != nil { - return nil, err - } + ec, err := client.New(cfg) + if err != nil { + return nil, fmt.Errorf("creating etcd client: %w", err) + } - kapi := client.NewKeysAPI(ec) + kapi := client.NewKeysAPI(ec) - lc, err := lock.NewEtcdLockClient(kapi, globalFlags.Group) - if err != nil { - return nil, err + lc, err := lock.NewEtcdLockClient(kapi, globalFlags.Group) + if err != nil { + if errors.Is(err, syscall.ECONNREFUSED) { + continue + } + + return nil, fmt.Errorf("creating etcd lock client: %w", err) + } + + return lc, nil } - return lc, err + + return nil, fmt.Errorf("no etcd endpoints available, tried: %s", strings.Join(globalFlags.Endpoints, ",")) } // flagsFromEnv parses all registered flags in the given flagSet,