diff --git a/.changelog/14922.txt b/.changelog/14922.txt new file mode 100644 index 000000000000..6fac2606cc8c --- /dev/null +++ b/.changelog/14922.txt @@ -0,0 +1,7 @@ +```release-note:bug +client: Resolve ACL roles within client ACL cache +``` + +```release-note:bug +client: Check ACL token expiry when resolving token within ACL cache +``` diff --git a/client/acl.go b/client/acl.go index bf666fbaa056..19efb6cd083d 100644 --- a/client/acl.go +++ b/client/acl.go @@ -21,6 +21,11 @@ const ( // tokenCacheSize is the number of ACL tokens to keep cached. Tokens have a fetching cost, // so we keep the hot tokens cached to reduce the lookups. tokenCacheSize = 64 + + // roleCacheSize is the number of ACL roles to keep cached. Looking up + // roles requires an RPC call, so we keep the hot roles cached to reduce + // the number of lookups. + roleCacheSize = 64 ) // clientACLResolver holds the state required for client resolution @@ -34,6 +39,10 @@ type clientACLResolver struct { // tokenCache is used to maintain the fetched token objects tokenCache *lru.TwoQueueCache + + // roleCache is used to maintain a cache of the fetched ACL roles. Each + // entry is keyed by the role ID. + roleCache *lru.TwoQueueCache } // init is used to setup the client resolver state @@ -52,13 +61,19 @@ func (c *clientACLResolver) init() error { if err != nil { return err } + c.roleCache, err = lru.New2Q(roleCacheSize) + if err != nil { + return err + } return nil } -// cachedACLValue is used to manage ACL Token or Policy TTLs +// cachedACLValue is used to manage ACL Token, Policy, or Role cache entries +// and their TTLs. type cachedACLValue struct { Token *structs.ACLToken Policy *structs.ACLPolicy + Role *structs.ACLRole CacheTime time.Time } @@ -95,13 +110,29 @@ func (c *Client) resolveTokenAndACL(secretID string) (*acl.ACL, *structs.ACLToke return nil, nil, structs.ErrTokenNotFound } + // Give the token expiry some slight leeway in case the client and server + // clocks are skewed. + if token.IsExpired(time.Now().Add(2 * time.Second)) { + return nil, nil, structs.ErrTokenExpired + } + // Check if this is a management token if token.Type == structs.ACLManagementToken { return acl.ManagementACL, token, nil } + // Resolve the policy links within the token ACL roles. + policyNames, err := c.resolveTokenACLRoles(secretID, token.Roles) + if err != nil { + return nil, nil, err + } + + // Generate a slice of all policy names included within the token, taken + // from both the ACL roles and the direct assignments. + policyNames = append(policyNames, token.Policies...) + // Resolve the policies - policies, err := c.resolvePolicies(token.SecretID, token.Policies) + policies, err := c.resolvePolicies(token.SecretID, policyNames) if err != nil { return nil, nil, err } @@ -227,3 +258,116 @@ func (c *Client) resolvePolicies(secretID string, policies []string) ([]*structs // Return the valid policies return out, nil } + +// resolveTokenACLRoles is used to unpack an ACL roles and their policy +// assignments into a list of ACL policy names. This can then be used to +// compile an ACL object. +// +// When roles need to be looked up from state via server RPC, we may use the +// expired cache version. This can only occur if we can fully resolve the role +// via the cache. +func (c *Client) resolveTokenACLRoles(secretID string, roleLinks []*structs.ACLTokenRoleLink) ([]string, error) { + + var ( + // policyNames tracks the resolved ACL policies which are linked to the + // role. This is the output object and represents the authorisation + // this role provides token bearers. + policyNames []string + + // missingRoleIDs are the roles linked which are not found within our + // cache. These must be looked up from the server via and RPC, so we + // can correctly identify the policy links. + missingRoleIDs []string + + // expiredRoleIDs are the roles linked which have been found within our + // cache, but are expired. These must be looked up from the server via + // and RPC, so we can correctly identify the policy links. + expiredRoleIDs []string + ) + + for _, roleLink := range roleLinks { + + // Look within the cache to see if the role is already present. If we + // do not find it, add the ID to our tracking, so we look this up via + // RPC. + raw, ok := c.roleCache.Get(roleLink.ID) + if !ok { + missingRoleIDs = append(missingRoleIDs, roleLink.ID) + continue + } + + // If the cached value is expired, add the ID to our tracking, so we + // look this up via RPC. Otherwise, iterate the policy links and add + // each policy name to our return object tracking. + cached := raw.(*cachedACLValue) + if cached.Age() <= c.GetConfig().ACLRoleTTL { + for _, policyLink := range cached.Role.Policies { + policyNames = append(policyNames, policyLink.Name) + } + } else { + expiredRoleIDs = append(expiredRoleIDs, cached.Role.ID) + } + } + + // Hot-path: we were able to resolve all ACL roles via the cache and + // generate a list of linked policy names. Therefore, we can avoid making + // any RPC calls. + if len(missingRoleIDs)+len(expiredRoleIDs) == 0 { + return policyNames, nil + } + + // Created a combined list of role IDs that we need to lookup from server + // state. + roleIDsToFetch := missingRoleIDs + roleIDsToFetch = append(roleIDsToFetch, expiredRoleIDs...) + + // Generate an RPC request to detail all the ACL roles that we did not find + // or were expired within the cache. + roleByIDReq := structs.ACLRolesByIDRequest{ + ACLRoleIDs: roleIDsToFetch, + QueryOptions: structs.QueryOptions{ + Region: c.Region(), + AuthToken: secretID, + AllowStale: true, + }, + } + + var roleByIDResp structs.ACLRolesByIDResponse + + // Perform the RPC call to detail the required ACL roles. If the RPC call + // fails, and we are only updating expired cache entries, use the expired + // entries. This allows use to handle intermittent failures. + err := c.RPC(structs.ACLGetRolesByIDRPCMethod, &roleByIDReq, &roleByIDResp) + if err != nil { + if len(missingRoleIDs) == 0 { + c.logger.Warn("failed to resolve ACL roles, using expired cached value", "error", err) + for _, aclRole := range roleByIDResp.ACLRoles { + for _, rolePolicyLink := range aclRole.Policies { + policyNames = append(policyNames, rolePolicyLink.Name) + } + } + return policyNames, nil + } + return nil, err + } + + // Generate a timestamp for the cache entry. We do not need to use a + // timestamp per ACL role response integration. + now := time.Now() + + for _, aclRole := range roleByIDResp.ACLRoles { + + // Add an entry to the cache using the generated timestamp for future + // expiry calculations. Any existing, expired entry will be + // overwritten. + c.roleCache.Add(aclRole.ID, &cachedACLValue{Role: aclRole, CacheTime: now}) + + // Iterate the role policy links, extracting the name and adding this + // to our return response tracking. + for _, rolePolicyLink := range aclRole.Policies { + policyNames = append(policyNames, rolePolicyLink.Name) + } + } + + return policyNames, nil +} diff --git a/client/acl_test.go b/client/acl_test.go index b1cc0a3151ee..7a108b8db894 100644 --- a/client/acl_test.go +++ b/client/acl_test.go @@ -2,17 +2,29 @@ package client import ( "testing" + "time" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" "github.com/stretchr/testify/assert" ) +func Test_clientACLResolver_init(t *testing.T) { + resolver := &clientACLResolver{} + must.NoError(t, resolver.init()) + must.NotNil(t, resolver.aclCache) + must.NotNil(t, resolver.policyCache) + must.NotNil(t, resolver.tokenCache) + must.NotNil(t, resolver.roleCache) +} + func TestClient_ACL_resolveTokenValue(t *testing.T) { ci.Parallel(t) @@ -106,6 +118,47 @@ func TestClient_ACL_resolvePolicies(t *testing.T) { } } +func TestClient_resolveTokenACLRoles(t *testing.T) { + ci.Parallel(t) + + testServer, _, rootACLToken, testServerCleanupS1 := testACLServer(t, nil) + defer testServerCleanupS1() + testutil.WaitForLeader(t, testServer.RPC) + + testClient, cleanup := TestClient(t, func(c *config.Config) { + c.RPCHandler = testServer + c.ACLEnabled = true + }) + defer cleanup() + + // Create an ACL Role and a client token which is linked to this. + mockACLRole := mock.ACLRole() + + mockACLToken := mock.ACLToken() + mockACLToken.Policies = []string{} + mockACLToken.Roles = []*structs.ACLTokenRoleLink{{ID: mockACLRole.ID}} + + err := testServer.State().UpsertACLRoles(structs.MsgTypeTestSetup, 10, []*structs.ACLRole{mockACLRole}, true) + must.NoError(t, err) + err = testServer.State().UpsertACLTokens(structs.MsgTypeTestSetup, 20, []*structs.ACLToken{mockACLToken}) + must.NoError(t, err) + + // Resolve the ACL policies linked via the role. + resolvedRoles1, err := testClient.resolveTokenACLRoles(rootACLToken.SecretID, mockACLToken.Roles) + must.NoError(t, err) + must.Len(t, 2, resolvedRoles1) + + // Test the cache directly and check that the ACL role previously queried + // is now cached. + must.Eq(t, 1, testClient.roleCache.Len()) + must.True(t, testClient.roleCache.Contains(mockACLRole.ID)) + + // Resolve the roles again to check we get the same results. + resolvedRoles2, err := testClient.resolveTokenACLRoles(rootACLToken.SecretID, mockACLToken.Roles) + must.NoError(t, err) + must.SliceContainsAll(t, resolvedRoles1, resolvedRoles2) +} + func TestClient_ACL_ResolveToken_Disabled(t *testing.T) { ci.Parallel(t) @@ -199,4 +252,14 @@ func TestClient_ACL_ResolveSecretToken(t *testing.T) { assert.NotEmpty(t, respToken.AccessorID) } + // Create and upsert a token which has just expired. + mockExpiredToken := mock.ACLToken() + mockExpiredToken.ExpirationTime = pointer.Of(time.Now().Add(-5 * time.Minute)) + + err = s1.State().UpsertACLTokens(structs.MsgTypeTestSetup, 120, []*structs.ACLToken{mockExpiredToken}) + must.NoError(t, err) + + expiredTokenResp, err := c1.ResolveSecretToken(mockExpiredToken.SecretID) + must.Nil(t, expiredTokenResp) + must.StrContains(t, err.Error(), "ACL token expired") } diff --git a/client/config/config.go b/client/config/config.go index 4c6a9fc9fffb..8ff6e9861044 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -217,6 +217,10 @@ type Config struct { // ACLPolicyTTL is how long we cache policy values for ACLPolicyTTL time.Duration + // ACLRoleTTL is how long we cache ACL role value for within each Nomad + // client. + ACLRoleTTL time.Duration + // DisableRemoteExec disables remote exec targeting tasks on this client DisableRemoteExec bool diff --git a/command/agent/agent.go b/command/agent/agent.go index efb55b729475..0871f39434a8 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -720,6 +720,7 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { conf.ACLEnabled = agentConfig.ACL.Enabled conf.ACLTokenTTL = agentConfig.ACL.TokenTTL conf.ACLPolicyTTL = agentConfig.ACL.PolicyTTL + conf.ACLRoleTTL = agentConfig.ACL.RoleTTL // Setup networking configuration conf.CNIPath = agentConfig.Client.CNIPath diff --git a/command/agent/config.go b/command/agent/config.go index df7dc91425ee..1266ad26d0fb 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -376,6 +376,12 @@ type ACLConfig struct { PolicyTTL time.Duration PolicyTTLHCL string `hcl:"policy_ttl" json:"-"` + // RoleTTL controls how long we cache ACL roles. This controls how stale + // they can be when we are enforcing policies. Defaults to "30s". + // Reducing this impacts performance by forcing more frequent resolution. + RoleTTL time.Duration + RoleTTLHCL string `hcl:"role_ttl" json:"-"` + // ReplicationToken is used by servers to replicate tokens and policies // from the authoritative region. This must be a valid management token // within the authoritative region. @@ -1250,6 +1256,7 @@ func DefaultConfig() *Config { Enabled: false, TokenTTL: 30 * time.Second, PolicyTTL: 30 * time.Second, + RoleTTL: 30 * time.Second, }, SyslogFacility: "LOCAL0", Telemetry: &Telemetry{ @@ -1755,6 +1762,12 @@ func (a *ACLConfig) Merge(b *ACLConfig) *ACLConfig { if b.PolicyTTLHCL != "" { result.PolicyTTLHCL = b.PolicyTTLHCL } + if b.RoleTTL != 0 { + result.RoleTTL = b.RoleTTL + } + if b.RoleTTLHCL != "" { + result.RoleTTLHCL = b.RoleTTLHCL + } if b.TokenMinExpirationTTL != 0 { result.TokenMinExpirationTTL = b.TokenMinExpirationTTL } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 5004aee0e073..8bacdc836a5a 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -66,6 +66,7 @@ func ParseConfigFile(path string) (*Config, error) { {"gc_interval", &c.Client.GCInterval, &c.Client.GCIntervalHCL, nil}, {"acl.token_ttl", &c.ACL.TokenTTL, &c.ACL.TokenTTLHCL, nil}, {"acl.policy_ttl", &c.ACL.PolicyTTL, &c.ACL.PolicyTTLHCL, nil}, + {"acl.policy_ttl", &c.ACL.RoleTTL, &c.ACL.RoleTTLHCL, nil}, {"acl.token_min_expiration_ttl", &c.ACL.TokenMinExpirationTTL, &c.ACL.TokenMinExpirationTTLHCL, nil}, {"acl.token_max_expiration_ttl", &c.ACL.TokenMaxExpirationTTL, &c.ACL.TokenMaxExpirationTTLHCL, nil}, {"client.server_join.retry_interval", &c.Client.ServerJoin.RetryInterval, &c.Client.ServerJoin.RetryIntervalHCL, nil}, diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 4ba63eb61f7c..94d60cdf691a 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -155,6 +155,8 @@ var basicConfig = &Config{ TokenTTLHCL: "60s", PolicyTTL: 60 * time.Second, PolicyTTLHCL: "60s", + RoleTTLHCL: "60s", + RoleTTL: 60 * time.Second, TokenMinExpirationTTLHCL: "1h", TokenMinExpirationTTL: 1 * time.Hour, TokenMaxExpirationTTLHCL: "100h", diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 42ec370a0872..f63538ab3e3a 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -158,6 +158,7 @@ func TestConfig_Merge(t *testing.T) { Enabled: true, TokenTTL: 60 * time.Second, PolicyTTL: 60 * time.Second, + RoleTTL: 60 * time.Second, TokenMinExpirationTTL: 60 * time.Second, TokenMaxExpirationTTL: 60 * time.Second, ReplicationToken: "foo", @@ -360,6 +361,7 @@ func TestConfig_Merge(t *testing.T) { Enabled: true, TokenTTL: 20 * time.Second, PolicyTTL: 20 * time.Second, + RoleTTL: 20 * time.Second, TokenMinExpirationTTL: 20 * time.Second, TokenMaxExpirationTTL: 20 * time.Second, ReplicationToken: "foobar", diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index 1f614b83d1df..55b9977b5b66 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -163,6 +163,7 @@ acl { enabled = true token_ttl = "60s" policy_ttl = "60s" + role_ttl = "60s" token_min_expiration_ttl = "1h" token_max_expiration_ttl = "100h" replication_token = "foobar" diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index b0e2850ee0c5..990400acf2d6 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -5,6 +5,7 @@ "policy_ttl": "60s", "replication_token": "foobar", "token_ttl": "60s", + "role_ttl": "60s", "token_min_expiration_ttl": "1h", "token_max_expiration_ttl": "100h" } diff --git a/website/content/docs/configuration/acl.mdx b/website/content/docs/configuration/acl.mdx index 9cfd8abebfaa..530623ac1d7b 100644 --- a/website/content/docs/configuration/acl.mdx +++ b/website/content/docs/configuration/acl.mdx @@ -19,6 +19,7 @@ acl { enabled = true token_ttl = "30s" policy_ttl = "60s" + role_ttl = "60s" } ``` @@ -42,6 +43,12 @@ acl { the request load against servers. If a client cannot reach a server, for example because of an outage, the TTL will be ignored and the cached value used. +- `role_ttl` `(string: "30s")` - Specifies the maximum time-to-live (TTL) for + cached ACL roles. This does not affect servers, since they do not cache roles. + Setting this value lower reduces how stale a role can be, but increases the + request load against servers. If a client cannot reach a server, for example + because of an outage, the TTL will be ignored and the cached value used. + - `replication_token` `(string: "")` - Specifies the Secret ID of the ACL token to use for replicating policies and tokens. This is used by servers in non-authoritative region to mirror the policies and tokens into the local region from [authoritative_region][authoritative-region].