Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hash fields used in task service IDs #3632

Merged
merged 2 commits into from
Dec 8, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
## 0.7.1 (Unreleased)

__BACKWARDS INCOMPATIBILITIES:__
* client: The format of service IDs in Consul has changed. If you rely upon
Nomad's service IDs (*not* service names; those are stable), you will need
to update your code. [GH-3632]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extra space after period. We should say that relying on the ID is not advisable as it is liable to change

* config: Nomad no longer parses Atlas configuration stanzas. Atlas has been
deprecated since earlier this year. If you have an Atlas stanza in your
config file it will have to be removed.
Expand Down Expand Up @@ -57,6 +60,8 @@ BUG FIXES:
explicitly [GH-3520]
* cli: Fix passing Consul address via flags [GH-3504]
* cli: Fix panic when running `keyring` commands [GH-3509]
* client: Fix advertising services with tags that require URL escaping
[GH-3632]
* client: Fix a panic when restoring an allocation with a dead leader task
[GH-3502]
* client: Fix service/check updating when just interpolated variables change
Expand Down
147 changes: 77 additions & 70 deletions command/agent/consul/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ package consul

import (
"context"
"crypto/sha1"
"encoding/base32"
"fmt"
"io"
"log"
"net"
"net/url"
Expand All @@ -21,10 +24,14 @@ import (
)

const (
// nomadServicePrefix is the first prefix that scopes all Nomad registered
// services
// nomadServicePrefix is the prefix that scopes all Nomad registered
// services (both agent and task entries).
nomadServicePrefix = "_nomad"

// nomadTaskPrefix is the prefix that scopes Nomad registered services
// for tasks.
nomadTaskPrefix = nomadServicePrefix + "-task-"

// defaultRetryInterval is how quickly to retry syncing services and
// checks to Consul when an error occurs. Will backoff up to a max.
defaultRetryInterval = time.Second
Expand Down Expand Up @@ -288,8 +295,13 @@ func (c *ServiceClient) Run() {

if err := c.sync(); err != nil {
if failures == 0 {
// Log on the first failure
c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err)
} else if failures%10 == 0 {
// Log every 10th consecutive failure
c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err)
}

failures++
if !retryTimer.Stop() {
// Timer already expired, since the timer may
Expand Down Expand Up @@ -389,38 +401,31 @@ func (c *ServiceClient) sync() error {
// Not managed by Nomad, skip
continue
}

// Unknown Nomad managed service; kill
if err := c.client.ServiceDeregister(id); err != nil {
if isOldNomadService(id) {
// Don't hard-fail on old entries. See #3620
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And this is how cruft is born 😢

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think the optimal solution is soft-failing on all deregistration calls, logging only the first failure per-id, and then logging if it ever succeeds... I guess that would only take a map of IDs to track... should I add that and drop isOldNomadService entirely?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I wasn't asking for an action on this one. I think it is just unfortunate we will have to maintain this behavior for a while!

continue
}

metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
}
sdereg++
metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
}

// Track services whose ports have changed as their checks may also
// need updating
portsChanged := make(map[string]struct{}, len(c.services))

// Add Nomad services missing from Consul
for id, locals := range c.services {
if remotes, ok := consulServices[id]; ok {
// Make sure Port and Address are stable since
// PortLabel and AddressMode aren't included in the
// service ID.
if locals.Port == remotes.Port && locals.Address == remotes.Address {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice that it has become simpler 👍

// Already exists in Consul; skip
continue
if _, ok := consulServices[id]; !ok {
if err = c.client.ServiceRegister(locals); err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
}
// Port changed, reregister it and its checks
portsChanged[id] = struct{}{}
}
if err = c.client.ServiceRegister(locals); err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
sreg++
metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
}
sreg++
metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
}

// Remove Nomad checks in Consul but unknown locally
Expand All @@ -433,8 +438,14 @@ func (c *ServiceClient) sync() error {
// Service not managed by Nomad, skip
continue
}
// Unknown Nomad managed check; kill

// Unknown Nomad managed check; remove
if err := c.client.CheckDeregister(id); err != nil {
if isOldNomadService(check.ServiceID) {
// Don't hard-fail on old entries.
continue
}

metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
}
Expand All @@ -444,12 +455,11 @@ func (c *ServiceClient) sync() error {

// Add Nomad checks missing from Consul
for id, check := range c.checks {
if check, ok := consulChecks[id]; ok {
if _, changed := portsChanged[check.ServiceID]; !changed {
// Already in Consul and ports didn't change; skipping
continue
}
if _, ok := consulChecks[id]; ok {
// Already in Consul; skipping
continue
}

if err := c.client.CheckRegister(check); err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
Expand Down Expand Up @@ -751,22 +761,17 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta
continue
}

// Service exists and hasn't changed, don't re-add it later
delete(newIDs, existingID)

// Service still exists so add it to the task's registration
sreg := &ServiceRegistration{
serviceID: existingID,
checkIDs: make(map[string]struct{}, len(newSvc.Checks)),
}
taskReg.Services[existingID] = sreg

// PortLabel and AddressMode aren't included in the ID, so we
// have to compare manually.
serviceUnchanged := newSvc.PortLabel == existingSvc.PortLabel && newSvc.AddressMode == existingSvc.AddressMode
if serviceUnchanged {
// Service exists and hasn't changed, don't add it later
delete(newIDs, existingID)
}

// See what checks were updated
// See if any checks were updated
existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
for _, check := range existingSvc.Checks {
existingChecks[makeCheckID(existingID, check)] = check
Expand All @@ -779,17 +784,16 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta
// Check exists, so don't remove it
delete(existingChecks, checkID)
sreg.checkIDs[checkID] = struct{}{}
} else if serviceUnchanged {
// New check on an unchanged service; add them now
newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net)
if err != nil {
return err
}
}

for _, checkID := range newCheckIDs {
sreg.checkIDs[checkID] = struct{}{}
// New check on an unchanged service; add them now
newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net)
if err != nil {
return err
}

}
for _, checkID := range newCheckIDs {
sreg.checkIDs[checkID] = struct{}{}

}

Expand Down Expand Up @@ -999,36 +1003,27 @@ func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
//
// Agent service IDs are of the form:
//
// {nomadServicePrefix}-{ROLE}-{Service.Name}-{Service.Tags...}
// Example Server ID: _nomad-server-nomad-serf
// Example Client ID: _nomad-client-nomad-client-http
// {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
// Example Server ID: _nomad-server-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4
// Example Client ID: _nomad-client-GGNJPGL7YN7RGMVXZILMPVRZZVRSZC7L
//
func makeAgentServiceID(role string, service *structs.Service) string {
parts := make([]string, len(service.Tags)+3)
parts[0] = nomadServicePrefix
parts[1] = role
parts[2] = service.Name
copy(parts[3:], service.Tags)
return strings.Join(parts, "-")
h := sha1.New()
io.WriteString(h, service.Name)
for _, tag := range service.Tags {
io.WriteString(h, tag)
}
b32 := base32.StdEncoding.EncodeToString(h.Sum(nil))
return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, b32)
}

// makeTaskServiceID creates a unique ID for identifying a task service in
// Consul.
//
// Task service IDs are of the form:
//
// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
// Consul. All structs.Service fields are included in the ID's hash except
// Checks. This allows updates to merely compare IDs.
//
// Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
func makeTaskServiceID(allocID, taskName string, service *structs.Service) string {
parts := make([]string, len(service.Tags)+5)
parts[0] = nomadServicePrefix
parts[1] = "executor"
parts[2] = allocID
parts[3] = taskName
parts[4] = service.Name
copy(parts[5:], service.Tags)
return strings.Join(parts, "-")
return nomadTaskPrefix + service.Hash(allocID, taskName)
}

// makeCheckID creates a unique ID for a check.
Expand Down Expand Up @@ -1084,9 +1079,21 @@ func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host
}

// isNomadService returns true if the ID matches the pattern of a Nomad managed
// service. Agent services return false as independent client and server agents
// may be running on the same machine. #2827
// service (new or old formats). Agent services return false as independent
// client and server agents may be running on the same machine. #2827
func isNomadService(id string) bool {
return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
}

// isOldNomadService returns true if the ID matches an old pattern managed by
// Nomad.
//
// Pre-0.7.1 task service IDs are of the form:
//
// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
//
func isOldNomadService(id string) bool {
const prefix = nomadServicePrefix + "-executor"
return strings.HasPrefix(id, prefix)
}
Expand Down
7 changes: 6 additions & 1 deletion command/agent/consul/int_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,12 @@ func TestConsul_Integration(t *testing.T) {
{
Name: "httpd2",
PortLabel: "http",
Tags: []string{"test", "http2"},
Tags: []string{
"test",
// Use URL-unfriendly tags to test #3620
"public-test.ettaviation.com:80/ redirect=302,https://test.ettaviation.com",
"public-test.ettaviation.com:443/",
},
},
}

Expand Down
23 changes: 15 additions & 8 deletions command/agent/consul/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,8 @@ func TestConsul_ChangeTags(t *testing.T) {
}

// TestConsul_ChangePorts asserts that changing the ports on a service updates
// it in Consul. Since ports are not part of the service ID this is a slightly
// different code path than changing tags.
// it in Consul. Pre-0.7.1 ports were not part of the service ID and this was a
// slightly different code path than changing tags.
func TestConsul_ChangePorts(t *testing.T) {
ctx := setupFake()
ctx.Task.Services[0].Checks = []*structs.ServiceCheck{
Expand Down Expand Up @@ -349,8 +349,8 @@ func TestConsul_ChangePorts(t *testing.T) {
}

for k, v := range ctx.FakeConsul.services {
if k != origServiceKey {
t.Errorf("unexpected key change; was: %q -- but found %q", origServiceKey, k)
if k == origServiceKey {
t.Errorf("expected key change; still: %q", k)
}
if v.Name != ctx.Task.Services[0].Name {
t.Errorf("expected Name=%q != %q", ctx.Task.Services[0].Name, v.Name)
Expand All @@ -370,15 +370,15 @@ func TestConsul_ChangePorts(t *testing.T) {
for k, v := range ctx.FakeConsul.checks {
switch v.Name {
case "c1":
if k != origTCPKey {
t.Errorf("unexpected key change for %s from %q to %q", v.Name, origTCPKey, k)
if k == origTCPKey {
t.Errorf("expected key change for %s from %q", v.Name, origTCPKey)
}
if expected := fmt.Sprintf(":%d", xPort); v.TCP != expected {
t.Errorf("expected Port x=%v but found: %v", expected, v.TCP)
}
case "c2":
if k != origScriptKey {
t.Errorf("unexpected key change for %s from %q to %q", v.Name, origScriptKey, k)
if k == origScriptKey {
t.Errorf("expected key change for %s from %q", v.Name, origScriptKey)
}
select {
case <-ctx.execs:
Expand Down Expand Up @@ -1383,9 +1383,16 @@ func TestIsNomadService(t *testing.T) {
}{
{"_nomad-client-nomad-client-http", false},
{"_nomad-server-nomad-serf", false},

// Pre-0.7.1 style IDs still match
{"_nomad-executor-abc", true},
{"_nomad-executor", true},

// Post-0.7.1 style IDs match
{"_nomad-task-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4", true},

{"not-nomad", false},
{"_nomad", false},
}

for _, test := range tests {
Expand Down
20 changes: 15 additions & 5 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"crypto/sha1"
"crypto/sha256"
"crypto/sha512"
"encoding/base32"
"encoding/hex"
"errors"
"fmt"
Expand Down Expand Up @@ -3140,15 +3141,24 @@ func (s *Service) ValidateName(name string) error {
return nil
}

// Hash calculates the hash of the check based on it's content and the service
// which owns it
func (s *Service) Hash() string {
// Hash returns a base32 encoded hash of a Service's contents excluding checks
// as they're hashed independently.
func (s *Service) Hash(allocID, taskName string) string {
h := sha1.New()
io.WriteString(h, allocID)
io.WriteString(h, taskName)
io.WriteString(h, s.Name)
io.WriteString(h, strings.Join(s.Tags, ""))
io.WriteString(h, s.PortLabel)
io.WriteString(h, s.AddressMode)
return fmt.Sprintf("%x", h.Sum(nil))
for _, tag := range s.Tags {
io.WriteString(h, tag)
}

// Base32 is used for encoding the hash as sha1 hashes can always be
// encoded without padding, only 4 bytes larger than base64, and saves
// 8 bytes vs hex. Since these hashes are used in Consul URLs it's nice
// to have a reasonably compact URL-safe representation.
return base32.StdEncoding.EncodeToString(h.Sum(nil))
}

const (
Expand Down