diff --git a/.changelog/17171.txt b/.changelog/17171.txt new file mode 100644 index 000000000000..0de1dc26adbe --- /dev/null +++ b/.changelog/17171.txt @@ -0,0 +1,3 @@ +```release-note:improvement +agent: prevent very old servers re-joining a cluster with stale data +``` diff --git a/agent/agent.go b/agent/agent.go index 55efcb455271..882ef0c2c66f 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -4,6 +4,7 @@ import ( "context" "crypto/tls" "encoding/json" + "errors" "fmt" "io" "net" @@ -45,12 +46,13 @@ import ( grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns" middleware "github.com/hashicorp/consul/agent/grpc-middleware" "github.com/hashicorp/consul/agent/hcp/scada" - libscada "github.com/hashicorp/consul/agent/hcp/scada" "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/proxycfg" proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue" catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog" localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local" + "github.com/hashicorp/consul/agent/rpcclient" + "github.com/hashicorp/consul/agent/rpcclient/configentry" "github.com/hashicorp/consul/agent/rpcclient/health" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/systemd" @@ -100,6 +102,9 @@ const ( // defaultQueryTime is the amount of time we block waiting for a change // if no time is specified. Previously we would wait the maxQueryTime. defaultQueryTime = 300 * time.Second + + // Name of the file to store a server's last seen timestamp. + serverLastSeenFile = "server_last_seen" ) var ( @@ -553,11 +558,11 @@ func (a *Agent) Start(ctx context.Context) error { return err } - // copy over the existing node id, this cannot be - // changed while running anyways but this prevents - // breaking some existing behavior. then overwrite - // the configuration + // Copy over the existing node id. This cannot be + // changed while running, but this prevents + // breaking some existing behavior. c.NodeID = a.config.NodeID + // Overwrite the configuration. a.config = c if err := a.tlsConfigurator.Update(a.config.TLS); err != nil { @@ -603,6 +608,12 @@ func (a *Agent) Start(ctx context.Context) error { if c.ServerMode { serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer) + // TODO: maybe this is called too early? + if err := a.checkServerLastSeen(); err != nil { + // TODO: log a bunch of times first? + return err + } + incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter( &lib.StopChannelContext{StopCh: a.shutdownCh}, serverLogger, @@ -639,7 +650,6 @@ func (a *Agent) Start(ctx context.Context) error { return fmt.Errorf("failed to start server cert manager: %w", err) } } - } else { a.externalGRPCServer = external.NewServer( a.logger.Named("grpc.external"), @@ -730,6 +740,10 @@ func (a *Agent) Start(ctx context.Context) error { }, ) + // Start writing last seen timestamps to file in order to age on next startup. + // TODO: maybe we should do this earlier above in the "if c.ServerMode" block? + go a.persistServerLastSeen() + // Start watching for critical services to deregister, based on their // checks. go a.reapServices() @@ -1072,7 +1086,7 @@ func (a *Agent) listenHTTP() ([]apiServer, error) { MaxHeaderBytes: a.config.HTTPMaxHeaderBytes, } - if libscada.IsCapability(l.Addr()) { + if scada.IsCapability(l.Addr()) { // wrap in http2 server handler httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{}) } @@ -4508,7 +4522,70 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources { a.fillEnterpriseProxyDataSources(&sources) return sources +} + +// persistServerLastSeen writes a server's last seen Unix timestamp to a file +// in the configured data directory, and then periodically updates the timestamp +// every hour. +func (a *Agent) persistServerLastSeen() { + file := filepath.Join(a.config.DataDir, serverLastSeenFile) + + // Create a timer with no initial tick to allow the timestamp to be written immediately. + t := time.NewTimer(0) + defer t.Stop() + for { + select { + case <-t.C: + // Reset the timer to the larger periodic interval. + t.Reset(1 * time.Hour) + + // TODO: should we do this properly using binary encoding? + now := strconv.FormatInt(time.Now().Unix(), 10) + + if err := os.WriteFile(file, []byte(now), 0600); err != nil { + // TODO: should we exit if this has happened too many times? + a.logger.Error("failed to write last seen timestamp: %w", err) + } + case <-a.shutdownCh: + return + } + } +} + +// checkServerLastSeen is a safety check for preventing old servers from rejoining an existing cluster. +// +// It attempts to read a server's last seen file and check the Unix timestamp against a +// configurable max age. If the last seen file does not exist, we treat this as an initial startup +// and return no error. +// +// Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age +// of 3d, then we should prevent the server from rejoining. +func (a *Agent) checkServerLastSeen() error { + file := filepath.Join(a.config.DataDir, serverLastSeenFile) + + // Check if the last seen timestamp file exists, and return early if it doesn't + // as this indicates the server is starting for the first time. + if _, err := os.Stat(file); errors.Is(err, os.ErrNotExist) { + return nil + } + + // Read timestamp from last seen file. + b, err := os.ReadFile(file) + if err != nil { + return fmt.Errorf("error reading server last seen timestamp: %w", err) + } + + // TODO: again, we probably want to do this more efficiently using binary encoding. + i, _ := strconv.Atoi(string(b)) + lastSeen := time.Unix(int64(i), 0) + maxAge := time.Now().Add(-a.config.ServerRejoinAgeMax) + + if lastSeen.Before(maxAge) { + return fmt.Errorf("server has not been seen since %s, will not rejoin", lastSeen.Format(time.DateTime)) + } + + return nil } func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string { diff --git a/agent/config/builder.go b/agent/config/builder.go index d6a904af5603..dfa2c49096af 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -25,8 +25,6 @@ import ( "github.com/hashicorp/memberlist" "golang.org/x/time/rate" - hcpconfig "github.com/hashicorp/consul/agent/hcp/config" - "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/connect/ca" @@ -34,6 +32,7 @@ import ( "github.com/hashicorp/consul/agent/consul/authmethod/ssoauth" consulrate "github.com/hashicorp/consul/agent/consul/rate" "github.com/hashicorp/consul/agent/dns" + hcpconfig "github.com/hashicorp/consul/agent/hcp/config" "github.com/hashicorp/consul/agent/rpc/middleware" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -1078,6 +1077,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) { ServerMode: serverMode, ServerName: stringVal(c.ServerName), ServerPort: serverPort, + ServerRejoinAgeMax: b.durationValWithDefault("server_rejoin_age_max", c.ServerRejoinAgeMax, 24*7*time.Hour), Services: services, SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin), SkipLeaveOnInt: skipLeaveOnInt, diff --git a/agent/config/config.go b/agent/config/config.go index 348dfbaddba8..5bef5efe0cf1 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -224,6 +224,7 @@ type Config struct { SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"` ServerMode *bool `mapstructure:"server" json:"server,omitempty"` ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"` + ServerRejoinAgeMax *string `mapstructure:"server_rejoin_age_max" json:"server_rejoin_age_max,omitempty"` Service *ServiceDefinition `mapstructure:"service" json:"-"` Services []ServiceDefinition `mapstructure:"services" json:"-"` SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"` diff --git a/agent/config/runtime.go b/agent/config/runtime.go index a4b4f6bffad8..d5159ee6e591 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -1353,6 +1353,18 @@ type RuntimeConfig struct { // hcl: ports { server = int } ServerPort int + // ServerRejoinAgeMax is used to specify the duration of time a server + // is allowed to be down/offline before a startup operation is refused. + // + // For example: if a server has been offline for 5 days, and this option + // is configured to 3 days, then any subsequent startup operation will fail + // and require an operator to manually intervene. + // + // The default is: 7 days + // + // hcl: server_rejoin_age_max = "duration" + ServerRejoinAgeMax time.Duration + // Services contains the provided service definitions: // // hcl: services = [ diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 28c1b4847ee5..439a797989d9 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -22,13 +22,12 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/time/rate" - hcpconfig "github.com/hashicorp/consul/agent/hcp/config" - "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/consul" consulrate "github.com/hashicorp/consul/agent/consul/rate" + hcpconfig "github.com/hashicorp/consul/agent/hcp/config" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/lib" diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index c447b3f4ebdf..15ff876eaceb 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -328,6 +328,7 @@ "ServerMode": false, "ServerName": "", "ServerPort": 0, + "ServerRejoinAgeMax": "168h0m0s", "Services": [ { "Address": "", @@ -504,4 +505,4 @@ "VersionPrerelease": "", "Watches": [], "XDSUpdateRateLimit": 0 -} \ No newline at end of file +}