From 5e254ba420e2a7aac4cd265ddb9a4cd5dbfd2b90 Mon Sep 17 00:00:00 2001 From: Dan Bond Date: Thu, 27 Apr 2023 18:17:41 +0000 Subject: [PATCH 1/5] backport of commit c8efed4585c30fed38222999f7f8e9be01324350 --- agent/agent.go | 80 +++++++++++++++++++++++++++++++++++++---- agent/config/builder.go | 4 +-- agent/config/config.go | 1 + agent/config/runtime.go | 12 +++++++ 4 files changed, 88 insertions(+), 9 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 55efcb455271..7af010e38ade 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -4,6 +4,7 @@ import ( "context" "crypto/tls" "encoding/json" + "errors" "fmt" "io" "net" @@ -45,12 +46,13 @@ import ( grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns" middleware "github.com/hashicorp/consul/agent/grpc-middleware" "github.com/hashicorp/consul/agent/hcp/scada" - libscada "github.com/hashicorp/consul/agent/hcp/scada" "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/proxycfg" proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue" catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog" localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local" + "github.com/hashicorp/consul/agent/rpcclient" + "github.com/hashicorp/consul/agent/rpcclient/configentry" "github.com/hashicorp/consul/agent/rpcclient/health" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/systemd" @@ -100,6 +102,9 @@ const ( // defaultQueryTime is the amount of time we block waiting for a change // if no time is specified. Previously we would wait the maxQueryTime. defaultQueryTime = 300 * time.Second + + // Name of the file to store a server's last seen timestamp. + lastSeenFile = "lastseen" ) var ( @@ -553,11 +558,11 @@ func (a *Agent) Start(ctx context.Context) error { return err } - // copy over the existing node id, this cannot be - // changed while running anyways but this prevents - // breaking some existing behavior. then overwrite - // the configuration + // Copy over the existing node id. This cannot be + // changed while running, but this prevents + // breaking some existing behavior. c.NodeID = a.config.NodeID + // Overwrite the configuration. a.config = c if err := a.tlsConfigurator.Update(a.config.TLS); err != nil { @@ -603,6 +608,12 @@ func (a *Agent) Start(ctx context.Context) error { if c.ServerMode { serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer) + // TODO: maybe this is called too early? + if err := a.checkServerLastSeen(); err != nil { + // TODO: log a bunch of times first? + return err + } + incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter( &lib.StopChannelContext{StopCh: a.shutdownCh}, serverLogger, @@ -639,7 +650,6 @@ func (a *Agent) Start(ctx context.Context) error { return fmt.Errorf("failed to start server cert manager: %w", err) } } - } else { a.externalGRPCServer = external.NewServer( a.logger.Named("grpc.external"), @@ -730,6 +740,10 @@ func (a *Agent) Start(ctx context.Context) error { }, ) + // Start writing lastseen timestamps to file in order to age on next startup. + // TODO: maybe we should do this earlier above in the "if c.ServerMode" block? + go a.persistServerLastSeen() + // Start watching for critical services to deregister, based on their // checks. go a.reapServices() @@ -1072,7 +1086,7 @@ func (a *Agent) listenHTTP() ([]apiServer, error) { MaxHeaderBytes: a.config.HTTPMaxHeaderBytes, } - if libscada.IsCapability(l.Addr()) { + if scada.IsCapability(l.Addr()) { // wrap in http2 server handler httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{}) } @@ -4508,7 +4522,59 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources { a.fillEnterpriseProxyDataSources(&sources) return sources +} + +func (a *Agent) persistServerLastSeen() { + file := filepath.Join(a.config.DataDir, lastSeenFile) + + // Create a timer with no initial tick to allow the timestamp to be written immediately. + t := time.NewTimer(0) + defer t.Stop() + + for { + select { + case <-t.C: + // Reset the timer to the larger periodic interval. + t.Reset(1 * time.Hour) + + // TODO: should we do this properly using binary encoding? + now := strconv.FormatInt(time.Now().Unix(), 10) + + if err := os.WriteFile(file, []byte(now), 0600); err != nil { + // TODO: should we exit if this has happened too many times? + a.logger.Error("failed to write lastseen timestamp: %w", err) + } + case <-a.shutdownCh: + return + } + } +} + +func (a *Agent) checkServerLastSeen() error { + file := filepath.Join(a.config.DataDir, lastSeenFile) + // Check if the lastseen timestampd file exists, and return early if it doesn't + // as this indicates the server is starting for the first time. + if _, err := os.Stat(file); errors.Is(err, os.ErrNotExist) { + return nil + } + + // Read timestamp from lastseen file. + b, err := os.ReadFile(file) + if err != nil { + return fmt.Errorf("error reading server lastseen timestamp: %w", err) + } + + // TODO: again, we probably want to do this more efficiently using binary encoding. + i, _ := strconv.Atoi(string(b)) + lastseen := time.Unix(int64(i), 0) + maxAge := time.Now().Add(-a.config.ServerRejoinAgeTTL) + + if lastseen.Before(maxAge) { + return fmt.Errorf("server has not been seen for at least %d, will not rejoin", a.config.ServerRejoinAgeTTL) + } + + return nil } func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string { diff --git a/agent/config/builder.go b/agent/config/builder.go index d6a904af5603..dfa2c49096af 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -25,8 +25,6 @@ import ( "github.com/hashicorp/memberlist" "golang.org/x/time/rate" - hcpconfig "github.com/hashicorp/consul/agent/hcp/config" - "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/connect/ca" @@ -34,6 +32,7 @@ import ( "github.com/hashicorp/consul/agent/consul/authmethod/ssoauth" consulrate "github.com/hashicorp/consul/agent/consul/rate" "github.com/hashicorp/consul/agent/dns" + hcpconfig "github.com/hashicorp/consul/agent/hcp/config" "github.com/hashicorp/consul/agent/rpc/middleware" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -1078,6 +1077,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) { ServerMode: serverMode, ServerName: stringVal(c.ServerName), ServerPort: serverPort, + ServerRejoinAgeMax: b.durationValWithDefault("server_rejoin_age_max", c.ServerRejoinAgeMax, 24*7*time.Hour), Services: services, SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin), SkipLeaveOnInt: skipLeaveOnInt, diff --git a/agent/config/config.go b/agent/config/config.go index 348dfbaddba8..5bef5efe0cf1 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -224,6 +224,7 @@ type Config struct { SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"` ServerMode *bool `mapstructure:"server" json:"server,omitempty"` ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"` + ServerRejoinAgeMax *string `mapstructure:"server_rejoin_age_max" json:"server_rejoin_age_max,omitempty"` Service *ServiceDefinition `mapstructure:"service" json:"-"` Services []ServiceDefinition `mapstructure:"services" json:"-"` SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index a4b4f6bffad8..d5159ee6e591 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -1353,6 +1353,18 @@ type RuntimeConfig struct { // hcl: ports { server = int } ServerPort int + // ServerRejoinAgeMax is used to specify the duration of time a server + // is allowed to be down/offline before a startup operation is refused. + // + // For example: if a server has been offline for 5 days, and this option + // is configured to 3 days, then any subsequent startup operation will fail + // and require an operator to manually intervene. + // + // The default is: 7 days + // + // hcl: server_rejoin_age_max = "duration" + ServerRejoinAgeMax time.Duration + // Services contains the provided service definitions: // // hcl: services = [ From dc912982294708ac1823fcd4d73e99603638ce12 Mon Sep 17 00:00:00 2001 From: Dan Bond Date: Thu, 27 Apr 2023 18:23:40 +0000 Subject: [PATCH 2/5] backport of commit 38cb8caa96f724423d6127d507d600db959a7902 --- agent/agent.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 7af010e38ade..b05ba3bd715c 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -4553,7 +4553,7 @@ func (a *Agent) persistServerLastSeen() { func (a *Agent) checkServerLastSeen() error { file := filepath.Join(a.config.DataDir, lastSeenFile) - // Check if the lastseen timestampd file exists, and return early if it doesn't + // Check if the lastseen timestamp file exists, and return early if it doesn't // as this indicates the server is starting for the first time. if _, err := os.Stat(file); errors.Is(err, os.ErrNotExist) { return nil @@ -4568,10 +4568,10 @@ func (a *Agent) checkServerLastSeen() error { // TODO: again, we probably want to do this more efficiently using binary encoding. i, _ := strconv.Atoi(string(b)) lastseen := time.Unix(int64(i), 0) - maxAge := time.Now().Add(-a.config.ServerRejoinAgeTTL) + maxAge := time.Now().Add(-a.config.ServerRejoinAgeMax) if lastseen.Before(maxAge) { - return fmt.Errorf("server has not been seen for at least %d, will not rejoin", a.config.ServerRejoinAgeTTL) + return fmt.Errorf("server has not been seen for at least %d, will not rejoin", a.config.ServerRejoinAgeMax) } return nil From 17e56b2f6dcd789b4b306f7438a969fd7d8556d1 Mon Sep 17 00:00:00 2001 From: Dan Bond Date: Thu, 27 Apr 2023 22:16:26 +0000 Subject: [PATCH 3/5] backport of commit 15e1e38b0ecbcdc5ab7194f807ed5544c26cc643 --- agent/agent.go | 33 ++++++++++++------- agent/config/runtime_test.go | 3 +- .../TestRuntimeConfig_Sanitize.golden | 3 +- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index b05ba3bd715c..81f18d8255aa 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -104,7 +104,7 @@ const ( defaultQueryTime = 300 * time.Second // Name of the file to store a server's last seen timestamp. - lastSeenFile = "lastseen" + serverLastSeenFile = "server_last_seen" ) var ( @@ -740,7 +740,7 @@ func (a *Agent) Start(ctx context.Context) error { }, ) - // Start writing lastseen timestamps to file in order to age on next startup. + // Start writing last seen timestamps to file in order to age on next startup. // TODO: maybe we should do this earlier above in the "if c.ServerMode" block? go a.persistServerLastSeen() @@ -4524,8 +4524,11 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources { return sources } +// persistServerLastSeen writes a server's last seen Unix timestamp to a file +// in the configured data directory, and then periodically updates the timestamp +// every hour. func (a *Agent) persistServerLastSeen() { - file := filepath.Join(a.config.DataDir, lastSeenFile) + file := filepath.Join(a.config.DataDir, serverLastSeenFile) // Create a timer with no initial tick to allow the timestamp to be written immediately. t := time.NewTimer(0) @@ -4542,7 +4545,7 @@ func (a *Agent) persistServerLastSeen() { if err := os.WriteFile(file, []byte(now), 0600); err != nil { // TODO: should we exit if this has happened too many times? - a.logger.Error("failed to write lastseen timestamp: %w", err) + a.logger.Error("failed to write last seen timestamp: %w", err) } case <-a.shutdownCh: return @@ -4550,28 +4553,36 @@ func (a *Agent) persistServerLastSeen() { } } +// checkServerLastSeen is a safety check for preventing old servers from rejoining a cluster. +// +// It attempts to read a server's last seen file and check the Unix timestamp against a +// confiruable max age. If the last seen file does not exist, we treat this as an initial startup +// and return no error. +// +// Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age +// of 3d, then we should prevent the server from rejoining. func (a *Agent) checkServerLastSeen() error { - file := filepath.Join(a.config.DataDir, lastSeenFile) + file := filepath.Join(a.config.DataDir, serverLastSeenFile) - // Check if the lastseen timestamp file exists, and return early if it doesn't + // Check if the last seen timestamp file exists, and return early if it doesn't // as this indicates the server is starting for the first time. if _, err := os.Stat(file); errors.Is(err, os.ErrNotExist) { return nil } - // Read timestamp from lastseen file. + // Read timestamp from last seen file. b, err := os.ReadFile(file) if err != nil { - return fmt.Errorf("error reading server lastseen timestamp: %w", err) + return fmt.Errorf("error reading server last seen timestamp: %w", err) } // TODO: again, we probably want to do this more efficiently using binary encoding. i, _ := strconv.Atoi(string(b)) - lastseen := time.Unix(int64(i), 0) + lastSeen := time.Unix(int64(i), 0) maxAge := time.Now().Add(-a.config.ServerRejoinAgeMax) - if lastseen.Before(maxAge) { - return fmt.Errorf("server has not been seen for at least %d, will not rejoin", a.config.ServerRejoinAgeMax) + if lastSeen.Before(maxAge) { + return fmt.Errorf("server has not been seen since %s, will not rejoin", lastSeen.Format(time.DateTime)) } return nil diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 28c1b4847ee5..439a797989d9 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -22,13 +22,12 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/time/rate" - hcpconfig "github.com/hashicorp/consul/agent/hcp/config" - "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/consul" consulrate "github.com/hashicorp/consul/agent/consul/rate" + hcpconfig "github.com/hashicorp/consul/agent/hcp/config" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/lib" diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index c447b3f4ebdf..971b8560ee1a 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -328,6 +328,7 @@ "ServerMode": false, "ServerName": "", "ServerPort": 0, + "ServerRejoinAgeMax": "0s", "Services": [ { "Address": "", @@ -504,4 +505,4 @@ "VersionPrerelease": "", "Watches": [], "XDSUpdateRateLimit": 0 -} \ No newline at end of file +} From 7428d58b43acc7eb5a52de3bdbd040387ea6c14f Mon Sep 17 00:00:00 2001 From: Dan Bond Date: Thu, 27 Apr 2023 22:44:35 +0000 Subject: [PATCH 4/5] backport of commit 9baff6493e12568806cebeb9ca3e88a2572510e2 --- agent/agent.go | 4 ++-- agent/config/testdata/TestRuntimeConfig_Sanitize.golden | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 81f18d8255aa..882ef0c2c66f 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -4553,10 +4553,10 @@ func (a *Agent) persistServerLastSeen() { } } -// checkServerLastSeen is a safety check for preventing old servers from rejoining a cluster. +// checkServerLastSeen is a safety check for preventing old servers from rejoining an existing cluster. // // It attempts to read a server's last seen file and check the Unix timestamp against a -// confiruable max age. If the last seen file does not exist, we treat this as an initial startup +// configurable max age. If the last seen file does not exist, we treat this as an initial startup // and return no error. // // Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index 971b8560ee1a..15ff876eaceb 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -328,7 +328,7 @@ "ServerMode": false, "ServerName": "", "ServerPort": 0, - "ServerRejoinAgeMax": "0s", + "ServerRejoinAgeMax": "168h0m0s", "Services": [ { "Address": "", From dbb64d4ebcede93651e2359ca2ef317f1eb7d4d8 Mon Sep 17 00:00:00 2001 From: Dan Bond Date: Thu, 27 Apr 2023 22:47:10 +0000 Subject: [PATCH 5/5] backport of commit f672df43dcad76cc484029e6203efd160a380184 --- .changelog/17171.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/17171.txt diff --git a/.changelog/17171.txt b/.changelog/17171.txt new file mode 100644 index 000000000000..0de1dc26adbe --- /dev/null +++ b/.changelog/17171.txt @@ -0,0 +1,3 @@ +```release-note:improvement +agent: prevent very old servers re-joining a cluster with stale data +```