From 1570972cb33d4a9141c213e331b3064acc4a253f Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Thu, 25 Jan 2018 11:30:15 -0500 Subject: [PATCH 01/24] add concept of health checks to fingerprinters and nodes fix up feedback from code review add driver info for all drivers to node --- api/nodes.go | 11 ++ client/client.go | 44 +++++- client/client_test.go | 15 ++ client/driver/docker.go | 36 ++++- client/driver/docker_test.go | 43 ++++++ client/driver/mock_driver.go | 39 +++++- client/fingerprint/fingerprint.go | 15 ++ client/fingerprint_manager.go | 141 +++++++++++++++++-- client/fingerprint_manager_test.go | 211 ++++++++++++++++++++++++++++- client/structs/structs.go | 41 ++++++ nomad/structs/structs.go | 47 +++++++ 11 files changed, 621 insertions(+), 22 deletions(-) diff --git a/api/nodes.go b/api/nodes.go index db7f25ffb39e..549eeea66639 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -4,6 +4,7 @@ import ( "fmt" "sort" "strconv" + "time" ) // Nodes is used to query node-related API endpoints @@ -96,6 +97,15 @@ func (n *Nodes) GcAlloc(allocID string, q *QueryOptions) error { return err } +// DriverInfo is used to deserialize a DriverInfo entry +type DriverInfo struct { + Attributes map[string]string + Detected bool + Healthy bool + HealthDescription string + UpdateTime time.Time +} + // Node is used to deserialize a node entry. type Node struct { ID string @@ -114,6 +124,7 @@ type Node struct { StatusDescription string StatusUpdatedAt int64 Events []*NodeEvent + Drivers map[string]*DriverInfo CreateIndex uint64 ModifyIndex uint64 } diff --git a/client/client.go b/client/client.go index 5c2211546676..909fd368b4cd 100644 --- a/client/client.go +++ b/client/client.go @@ -259,7 +259,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic } fingerprintManager := NewFingerprintManager(c.GetConfig, c.config.Node, - c.shutdownCh, c.updateNodeFromFingerprint, c.logger) + c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromHealthCheck, c.logger) // Fingerprint the node and scan for drivers if err := fingerprintManager.Run(); err != nil { @@ -894,6 +894,9 @@ func (c *Client) setupNode() error { if node.Links == nil { node.Links = make(map[string]string) } + if node.Drivers == nil { + node.Drivers = make(map[string]*structs.DriverInfo) + } if node.Meta == nil { node.Meta = make(map[string]string) } @@ -1003,9 +1006,48 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons c.config.Node.Resources.Merge(response.Resources) } + for name, new_val := range response.Drivers { + old_val := c.config.Node.Drivers[name] + if new_val.Equals(old_val) { + continue + } + if old_val == nil { + c.config.Node.Drivers[name] = new_val + } else { + c.config.Node.Drivers[name].MergeFingerprintInfo(new_val) + } + } + if nodeHasChanged { + c.updateNode() + } + + return c.config.Node +} + +func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckResponse) *structs.Node { + c.configLock.Lock() + defer c.configLock.Unlock() + + nodeHasChanged := false + + // update the node with the latest driver health information + for name, new_val := range response.Drivers { + old_val := c.config.Node.Drivers[name] + if new_val.Equals(old_val) { + continue + } + nodeHasChanged = true + if old_val == nil { + c.config.Node.Drivers[name] = new_val + } else { + c.config.Node.Drivers[name].MergeHealthCheck(new_val) + } + } + if nodeHasChanged { c.updateNode() } + return c.config.Node } diff --git a/client/client_test.go b/client/client_test.go index de5d35ee9970..821ce40d2d0f 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -148,6 +148,21 @@ func TestClient_Fingerprint_Periodic(t *testing.T) { if mockDriverStatus == "" { return false, fmt.Errorf("mock driver attribute should be set on the client") } + + // assert that the Driver information for the node is also set correctly + mockDriverInfo := node.Drivers["mock_driver"] + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver should be set as healthy") + } + if !mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver should be set as healthy") + } + if mockDriverInfo.HealthDescription == "" { + return false, fmt.Errorf("mock driver description should not be empty") + } return true, nil }, func(err error) { t.Fatalf("err: %v", err) diff --git a/client/driver/docker.go b/client/driver/docker.go index 4f6871333296..0fcfe03914f5 100644 --- a/client/driver/docker.go +++ b/client/driver/docker.go @@ -492,7 +492,6 @@ func (d *DockerDriver) Fingerprint(req *cstructs.FingerprintRequest, resp *cstru d.logger.Printf("[INFO] driver.docker: failed to initialize client: %s", err) } d.fingerprintSuccess = helper.BoolToPtr(false) - resp.RemoveAttribute(dockerDriverAttr) return nil } @@ -552,6 +551,41 @@ func (d *DockerDriver) Fingerprint(req *cstructs.FingerprintRequest, resp *cstru return nil } +// HealthChck implements the interface for the HealthCheck interface. This +// performs a health check on the docker driver, asserting whether the docker +// driver is responsive to a `docker ps` command. +func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstructs.HealthCheckResponse) error { + unhealthy := &structs.DriverInfo{ + HealthDescription: "Docker driver is available but unresponsive", + UpdateTime: time.Now(), + } + + _, _, err := d.dockerClients() + if err != nil { + d.logger.Printf("[WARN] driver.docker: docker driver is available but is unresponsive to `docker ps`") + resp.AddDriverInfo("docker", unhealthy) + return err + } + + d.logger.Printf("[TRACE] driver.docker: docker driver is available and is responsive to `docker ps`") + healthy := &structs.DriverInfo{ + Healthy: true, + HealthDescription: "Docker driver is available and responsive", + UpdateTime: time.Now(), + } + resp.AddDriverInfo("docker", healthy) + return nil +} + +// GetHealthChecks implements the interface for the HealthCheck interface. This +// sets whether the driver is eligible for periodic health checks and the +// interval at which to do them. +func (d *DockerDriver) GetHealthCheckInterval(req *cstructs.HealthCheckIntervalRequest, resp *cstructs.HealthCheckIntervalResponse) error { + resp.Eligible = true + resp.Period = 1 * time.Minute + return nil +} + // Validate is used to validate the driver configuration func (d *DockerDriver) Validate(config map[string]interface{}) error { fd := &fields.FieldData{ diff --git a/client/driver/docker_test.go b/client/driver/docker_test.go index 58e9bb813a06..3063a79aa46a 100644 --- a/client/driver/docker_test.go +++ b/client/driver/docker_test.go @@ -21,6 +21,7 @@ import ( "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver/env" + "github.com/hashicorp/nomad/client/fingerprint" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/uuid" @@ -164,6 +165,7 @@ func TestDockerDriver_Fingerprint(t *testing.T) { if !tu.IsTravis() { t.Parallel() } + ctx := testDockerDriverContexts(t, &structs.Task{Name: "foo", Driver: "docker", Resources: basicResources}) //ctx.DriverCtx.config.Options = map[string]string{"docker.cleanup.image": "false"} defer ctx.AllocDir.Destroy() @@ -227,6 +229,7 @@ func TestDockerDriver_Fingerprint_Bridge(t *testing.T) { request := &cstructs.FingerprintRequest{Config: conf, Node: conf.Node} var response cstructs.FingerprintResponse + err = dd.Fingerprint(request, &response) if err != nil { t.Fatalf("error fingerprinting docker: %v", err) @@ -251,6 +254,46 @@ func TestDockerDriver_Fingerprint_Bridge(t *testing.T) { t.Logf("docker bridge ip: %q", attributes["driver.docker.bridge_ip"]) } +func TestDockerDriver_Check_DockerHealthStatus(t *testing.T) { + if !tu.IsTravis() { + t.Parallel() + } + if !testutil.DockerIsConnected(t) { + t.Skip("requires Docker") + } + if runtime.GOOS != "linux" { + t.Skip("expect only on linux") + } + + require := require.New(t) + + // This seems fragile, so we might need to reconsider this test if it + // proves flaky + expectedAddr, err := sockaddr.GetInterfaceIP("docker0") + if err != nil { + t.Fatalf("unable to get ip for docker0: %v", err) + } + if expectedAddr == "" { + t.Fatalf("unable to get ip for docker bridge") + } + + conf := testConfig(t) + conf.Node = mock.Node() + dd := NewDockerDriver(NewDriverContext("", "", conf, conf.Node, testLogger(), nil)) + + request := &cstructs.HealthCheckRequest{} + var response cstructs.HealthCheckResponse + + dc, ok := dd.(fingerprint.HealthCheck) + require.True(ok) + err = dc.HealthCheck(request, &response) + require.Nil(err) + + driverInfo := response.Drivers["docker"] + require.NotNil(driverInfo) + require.True(driverInfo.Healthy) +} + func TestDockerDriver_StartOpen_Wait(t *testing.T) { if !tu.IsTravis() { t.Parallel() diff --git a/client/driver/mock_driver.go b/client/driver/mock_driver.go index e8a90f505d50..cab5844ccc00 100644 --- a/client/driver/mock_driver.go +++ b/client/driver/mock_driver.go @@ -30,6 +30,8 @@ const ( // to "stop" a previously functioning driver after the specified duration // (specified in seconds) for testing of periodic drivers and fingerprinters. ShutdownPeriodicDuration = "test.shutdown_periodic_duration" + + mockDriverName = "driver.mock_driver" ) // MockDriverConfig is the driver configuration for the MockDriver @@ -234,9 +236,9 @@ func (m *MockDriver) Fingerprint(req *cstructs.FingerprintRequest, resp *cstruct // current time is after the time which the node should shut down, simulate // driver failure case !m.shutdownFingerprintTime.IsZero() && time.Now().After(m.shutdownFingerprintTime): - resp.RemoveAttribute("driver.mock_driver") + resp.RemoveAttribute(mockDriverName) default: - resp.AddAttribute("driver.mock_driver", "1") + resp.AddAttribute(mockDriverName, "1") resp.Detected = true } return nil @@ -247,6 +249,39 @@ func (m *MockDriver) Periodic() (bool, time.Duration) { return true, 500 * time.Millisecond } +// HealthCheck implements the interface for HealthCheck, and indicates the current +// health status of the mock driver. +func (m *MockDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstructs.HealthCheckResponse) error { + switch { + case !m.shutdownFingerprintTime.IsZero() && time.Now().After(m.shutdownFingerprintTime): + notHealthy := &structs.DriverInfo{ + Healthy: false, + HealthDescription: "not running", + UpdateTime: time.Now(), + } + resp.AddDriverInfo("mock_driver", notHealthy) + return nil + default: + healthy := &structs.DriverInfo{ + Healthy: true, + HealthDescription: "running", + UpdateTime: time.Now(), + } + resp.AddDriverInfo("mock_driver", healthy) + return nil + } +} + +// GetHealthCheckInterval implements the interface for HealthCheck and indicates +// that mock driver should be checked periodically. Returns a boolean +// indicating if ti should be checked, and the duration at which to do this +// check. +func (m *MockDriver) GetHealthCheckInterval(req *cstructs.HealthCheckIntervalRequest, resp *cstructs.HealthCheckIntervalResponse) error { + resp.Eligible = true + resp.Period = 1 * time.Second + return nil +} + // MockDriverHandle is a driver handler which supervises a mock task type mockDriverHandle struct { ctx *ExecContext diff --git a/client/fingerprint/fingerprint.go b/client/fingerprint/fingerprint.go index 8a3477f51747..d6746a03d9d8 100644 --- a/client/fingerprint/fingerprint.go +++ b/client/fingerprint/fingerprint.go @@ -85,6 +85,21 @@ func NewFingerprint(name string, logger *log.Logger) (Fingerprint, error) { // Factory is used to instantiate a new Fingerprint type Factory func(*log.Logger) Fingerprint +// HealthCheck is used for doing periodic health checks. On a given time +// interfal, a health check will be called by the fingerprint manager of the +// node. +type HealthCheck interface { + // Check is used to update properties of the node on the status of the health + // check + HealthCheck(*cstructs.HealthCheckRequest, *cstructs.HealthCheckResponse) error + + // GetHealthCheckInterval is a mechanism for the health checker to indicate that + // it should be run periodically. The return value is a boolean indicating + // whether it should be done periodically, and the time interval at which + // this check should happen. + GetHealthCheckInterval(*cstructs.HealthCheckIntervalRequest, *cstructs.HealthCheckIntervalResponse) error +} + // Fingerprint is used for doing "fingerprinting" of the // host to automatically determine attributes, resources, // and metadata about it. Each of these is a heuristic, and diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 74b75bafde4c..0c2b3b4f9fae 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -20,10 +20,14 @@ type FingerprintManager struct { nodeLock sync.Mutex shutdownCh chan struct{} - // updateNode is a callback to the client to update the state of its + // updateNodeAttributes is a callback to the client to update the state of its // associated node - updateNode func(*cstructs.FingerprintResponse) *structs.Node - logger *log.Logger + updateNodeAttributes func(*cstructs.FingerprintResponse) *structs.Node + + // UpdateHealthCheck is a callback to the client to update the state of the + // node for resources that require a health check + updateHealthCheck func(*cstructs.HealthCheckResponse) *structs.Node + logger *log.Logger } // NewFingerprintManager is a constructor that creates and returns an instance @@ -31,19 +35,21 @@ type FingerprintManager struct { func NewFingerprintManager(getConfig func() *config.Config, node *structs.Node, shutdownCh chan struct{}, - updateNode func(*cstructs.FingerprintResponse) *structs.Node, + updateNodeAttributes func(*cstructs.FingerprintResponse) *structs.Node, + updateHealthCheck func(*cstructs.HealthCheckResponse) *structs.Node, logger *log.Logger) *FingerprintManager { return &FingerprintManager{ - getConfig: getConfig, - updateNode: updateNode, - node: node, - shutdownCh: shutdownCh, - logger: logger, + getConfig: getConfig, + updateNodeAttributes: updateNodeAttributes, + updateHealthCheck: updateHealthCheck, + node: node, + shutdownCh: shutdownCh, + logger: logger, } } -// run runs each fingerprinter individually on an ongoing basis -func (fm *FingerprintManager) run(f fingerprint.Fingerprint, period time.Duration, name string) { +// runFingerprint runs each fingerprinter individually on an ongoing basis +func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period time.Duration, name string) { fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting %s every %v", name, period) for { @@ -61,6 +67,25 @@ func (fm *FingerprintManager) run(f fingerprint.Fingerprint, period time.Duratio } } +// runHealthCheck runs each health check individually on an ongoing basis +func (fm *FingerprintManager) runHealthCheck(hc fingerprint.HealthCheck, period time.Duration, name string) { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: healthchecking %s every %v", name, period) + + for { + select { + case <-time.After(period): + err := fm.healthCheck(name, hc) + if err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %+v", name, err) + continue + } + + case <-fm.shutdownCh: + return + } + } +} + // setupDrivers is used to fingerprint the node to see if these drivers are // supported func (fm *FingerprintManager) setupDrivers(drivers []string) error { @@ -73,7 +98,7 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return err } - detected, err := fm.fingerprint(name, d) + detected, err := fm.fingerprintDriver(name, d) if err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) return err @@ -86,7 +111,19 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { p, period := d.Periodic() if p { - go fm.run(d, period, name) + go fm.runFingerprint(d, period, name) + } + + // We should only run the health check task if the driver is detected + // Note that if the driver is detected later in a periodic health check, + // this won't automateically trigger the periodic health check. + if hc, ok := d.(fingerprint.HealthCheck); ok && detected { + req := &cstructs.HealthCheckIntervalRequest{} + resp := &cstructs.HealthCheckIntervalResponse{} + hc.GetHealthCheckInterval(req, resp) + if resp.Eligible { + go fm.runHealthCheck(hc, resp.Period, name) + } } } @@ -94,6 +131,53 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return nil } +// fingerprintDriver is a temporary solution to move towards DriverInfo and +// away from annotating a node's attributes to demonstrate support for a +// particular driver. Takes the FingerprintResponse and converts it to the +// proper DriverInfo update and then sets the prefix attributes as well +func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Fingerprint) (bool, error) { + request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} + var response cstructs.FingerprintResponse + if err := f.Fingerprint(request, &response); err != nil { + return false, err + } + + // TODO This is a temporary measure, as eventually all drivers will need to + // support this. Doing this so that we can enable this iteratively and also + // in a backwards compatible way, where node attributes for drivers will + // eventually be phased out. + di := &structs.DriverInfo{ + Attributes: response.Attributes, + Detected: response.Detected, + } + response.AddDriver(name, di) + + if node := fm.updateNodeAttributes(&response); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } + + if hc, ok := f.(fingerprint.HealthCheck); ok { + fm.healthCheck(name, hc) + } else { + + resp := &cstructs.HealthCheckResponse{ + Drivers: map[string]*structs.DriverInfo{ + name: di, + }, + } + + if node := fm.updateHealthCheck(resp); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } + } + + return response.Detected, nil +} + // fingerprint does an initial fingerprint of the client. If the fingerprinter // is meant to be run continuously, a process is launched to perform this // fingerprint on an ongoing basis in the background. @@ -109,7 +193,7 @@ func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint return false, err } - if node := fm.updateNode(&response); node != nil { + if node := fm.updateNodeAttributes(&response); node != nil { fm.nodeLock.Lock() fm.node = node fm.nodeLock.Unlock() @@ -118,6 +202,23 @@ func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint return response.Detected, nil } +// healthcheck checks the health of the specified resource. +func (fm *FingerprintManager) healthCheck(name string, hc fingerprint.HealthCheck) error { + request := &cstructs.HealthCheckRequest{} + var response cstructs.HealthCheckResponse + if err := hc.HealthCheck(request, &response); err != nil { + return err + } + + if node := fm.updateHealthCheck(&response); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } + + return nil +} + // setupFingerprints is used to fingerprint the node to see if these attributes are // supported func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { @@ -143,7 +244,17 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { p, period := f.Periodic() if p { - go fm.run(f, period, name) + go fm.runFingerprint(f, period, name) + } + + if hc, ok := f.(fingerprint.HealthCheck); ok { + req := &cstructs.HealthCheckIntervalRequest{} + var resp cstructs.HealthCheckIntervalResponse + if err := hc.GetHealthCheckInterval(req, &resp); err != nil { + if resp.Eligible { + go fm.runHealthCheck(hc, resp.Period, name) + } + } } } diff --git a/client/fingerprint_manager_test.go b/client/fingerprint_manager_test.go index 30f12e0256e8..c24d9c59ed5b 100644 --- a/client/fingerprint_manager_test.go +++ b/client/fingerprint_manager_test.go @@ -6,6 +6,7 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver" + cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" @@ -20,6 +21,7 @@ func TestFingerprintManager_Run_MockDriver(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -34,6 +36,7 @@ func TestFingerprintManager_Run_MockDriver(t *testing.T) { node, make(chan struct{}), testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -51,6 +54,7 @@ func TestFingerprintManager_Run_ResourcesFingerprint(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -65,6 +69,7 @@ func TestFingerprintManager_Run_ResourcesFingerprint(t *testing.T) { node, make(chan struct{}), testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -83,6 +88,7 @@ func TestFingerprintManager_Fingerprint_Run(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -98,6 +104,7 @@ func TestFingerprintManager_Fingerprint_Run(t *testing.T) { node, make(chan struct{}), testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -105,6 +112,7 @@ func TestFingerprintManager_Fingerprint_Run(t *testing.T) { require.Nil(err) require.NotEqual("", node.Attributes["driver.raw_exec"]) + require.True(node.Drivers["raw_exec"].Detected) } func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { @@ -115,6 +123,7 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -138,13 +147,14 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) err := fm.Run() require.Nil(err) - // Ensure the mock driver is registered on the client + // Ensure the mock driver is registered and healthy on the client testutil.WaitForResult(func() (bool, error) { mockDriverStatus := node.Attributes["driver.mock_driver"] if mockDriverStatus == "" { @@ -155,7 +165,8 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { t.Fatalf("err: %v", err) }) - // Ensure that the client fingerprinter eventually removes this attribute + // Ensure that the client fingerprinter eventually removes this attribute and + // marks the driver as unhealthy testutil.WaitForResult(func() (bool, error) { mockDriverStatus := node.Attributes["driver.mock_driver"] if mockDriverStatus != "" { @@ -167,7 +178,187 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { }) } -func TestFingerprintManager_Run_InWhitelist(t *testing.T) { +// This is a temporary measure to check that a driver has both attributes on a +// node set as well as DriverInfo. +func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { + t.Parallel() + require := require.New(t) + + node := &structs.Node{ + Attributes: make(map[string]string, 0), + Drivers: make(map[string]*structs.DriverInfo, 0), + } + updateNode := func(r *cstructs.FingerprintResponse) *structs.Node { + if r.Attributes != nil { + for k, v := range r.Attributes { + node.Attributes[k] = v + } + } + return node + } + updateHealthCheck := func(resp *cstructs.HealthCheckResponse) *structs.Node { + if resp.Drivers != nil { + for k, v := range resp.Drivers { + node.Drivers[k] = v + } + } + return node + } + conf := config.DefaultConfig() + conf.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "test.shutdown_periodic_after": "true", + "test.shutdown_periodic_duration": "2", + } + getConfig := func() *config.Config { + return conf + } + + shutdownCh := make(chan struct{}) + defer (func() { + close(shutdownCh) + })() + + fm := NewFingerprintManager( + getConfig, + node, + shutdownCh, + updateNode, + updateHealthCheck, + testLogger(), + ) + + err := fm.Run() + require.Nil(err) + + // Ensure the mock driver is registered and healthy on the client + testutil.WaitForResult(func() (bool, error) { + mockDriverAttribute := node.Attributes["driver.mock_driver"] + if mockDriverAttribute == "" { + return false, fmt.Errorf("mock driver info should be set on the client attributes") + } + mockDriverInfo := node.Drivers["mock_driver"] + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver info should be set on the client") + } + if !mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver info should be healthy") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Ensure that a default driver without health checks enabled is registered and healthy on the client + testutil.WaitForResult(func() (bool, error) { + rawExecAttribute := node.Attributes["driver.raw_exec"] + if rawExecAttribute == "" { + return false, fmt.Errorf("raw exec info should be set on the client attributes") + } + rawExecInfo := node.Drivers["raw_exec"] + if rawExecInfo == nil { + return false, fmt.Errorf("raw exec driver info should be set on the client") + } + if !rawExecInfo.Detected { + return false, fmt.Errorf("raw exec driver should be detected") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} + +func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { + t.Parallel() + require := require.New(t) + + node := &structs.Node{ + Drivers: make(map[string]*structs.DriverInfo, 0), + } + updateNode := func(resp *cstructs.FingerprintResponse) *structs.Node { + for k, v := range resp.Drivers { + if node.Drivers[k] == nil { + node.Drivers[k] = v + } else { + node.Drivers[k].MergeFingerprintInfo(v) + } + } + return node + } + updateHealthCheck := func(resp *cstructs.HealthCheckResponse) *structs.Node { + for k, v := range resp.Drivers { + if node.Drivers[k] == nil { + node.Drivers[k] = v + } else { + node.Drivers[k].MergeHealthCheck(v) + } + } + return node + } + conf := config.DefaultConfig() + conf.Options = map[string]string{ + "test.shutdown_periodic_after": "true", + "test.shutdown_periodic_duration": "2", + } + getConfig := func() *config.Config { + return conf + } + + shutdownCh := make(chan struct{}) + defer (func() { + close(shutdownCh) + })() + + fm := NewFingerprintManager( + getConfig, + node, + shutdownCh, + updateNode, + updateHealthCheck, + testLogger(), + ) + + err := fm.Run() + require.Nil(err) + + // Ensure the mock driver is registered and healthy on the client + testutil.WaitForResult(func() (bool, error) { + mockDriverInfo := node.Drivers["mock_driver"] + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver info should be set on the client") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver info should be detected") + } + if !mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver info should be healthy") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Ensure that the client health check eventually removes this attribute and + // marks the driver as unhealthy + testutil.WaitForResult(func() (bool, error) { + + mockDriverInfo := node.Drivers["mock_driver"] + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver info should be set on the client") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver should be detected") + } + if mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver should not be healthy") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} + +func TestFimgerprintManager_Run_InWhitelist(t *testing.T) { t.Parallel() require := require.New(t) @@ -175,6 +366,7 @@ func TestFingerprintManager_Run_InWhitelist(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -195,6 +387,7 @@ func TestFingerprintManager_Run_InWhitelist(t *testing.T) { node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -211,6 +404,7 @@ func TestFingerprintManager_Run_InBlacklist(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -232,6 +426,7 @@ func TestFingerprintManager_Run_InBlacklist(t *testing.T) { node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -249,6 +444,7 @@ func TestFingerprintManager_Run_Combination(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -270,6 +466,7 @@ func TestFingerprintManager_Run_Combination(t *testing.T) { node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -289,6 +486,7 @@ func TestFingerprintManager_Run_WhitelistDrivers(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -312,6 +510,7 @@ func TestFingerprintManager_Run_WhitelistDrivers(t *testing.T) { node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -328,6 +527,7 @@ func TestFingerprintManager_Run_AllDriversBlacklisted(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -350,6 +550,7 @@ func TestFingerprintManager_Run_AllDriversBlacklisted(t *testing.T) { node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -368,6 +569,7 @@ func TestFingerprintManager_Run_DriversWhiteListBlacklistCombination(t *testing. Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } testConfig := config.Config{Node: node} testClient := &Client{config: &testConfig} @@ -392,6 +594,7 @@ func TestFingerprintManager_Run_DriversWhiteListBlacklistCombination(t *testing. node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) @@ -412,6 +615,7 @@ func TestFingerprintManager_Run_DriversInBlacklist(t *testing.T) { Attributes: make(map[string]string, 0), Links: make(map[string]string, 0), Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } conf := config.DefaultConfig() conf.Options = map[string]string{ @@ -433,6 +637,7 @@ func TestFingerprintManager_Run_DriversInBlacklist(t *testing.T) { node, shutdownCh, testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) diff --git a/client/structs/structs.go b/client/structs/structs.go index c038d1ff084d..d1c15ee56a9f 100644 --- a/client/structs/structs.go +++ b/client/structs/structs.go @@ -361,6 +361,19 @@ type FingerprintResponse struct { // Detected is a boolean indicating whether the fingerprinter detected // if the resource was available Detected bool + + // Drivers is a map of driver names to driver info. This allows the + // fingerprint method of each driver to set whether the driver is enabled or + // not, as well as its attributes + Drivers map[string]*structs.DriverInfo +} + +func (f *FingerprintResponse) AddDriver(name string, value *structs.DriverInfo) { + if f.Drivers == nil { + f.Drivers = make(map[string]*structs.DriverInfo, 0) + } + + f.Drivers[name] = value } // AddAttribute adds the name and value for a node attribute to the fingerprint @@ -405,3 +418,31 @@ func (f *FingerprintResponse) RemoveLink(name string) { f.Links[name] = "" } + +// HealthCheckRequest is the request type for a type that fulfils the Health +// Check interface +type HealthCheckRequest struct{} + +// HealthCheckResponse is the response type for a type that fulfills the Health +// Check interface +type HealthCheckResponse struct { + // Drivers is a map of driver names to current driver information + Drivers map[string]*structs.DriverInfo +} + +type HealthCheckIntervalRequest struct{} +type HealthCheckIntervalResponse struct { + Eligible bool + Period time.Duration +} + +// AddDriverInfo adds information about a driver to the fingerprint response. +// If the Drivers field has not yet been initialized, it does so here. +func (h *HealthCheckResponse) AddDriverInfo(name string, driverInfo *structs.DriverInfo) { + // initialize Drivers if it has not been already + if h.Drivers == nil { + h.Drivers = make(map[string]*structs.DriverInfo, 0) + } + + h.Drivers[name] = driverInfo +} diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index d77a294deb4d..3da2258d7f4c 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1163,6 +1163,50 @@ func ValidNodeStatus(status string) bool { } } +// DriverInfo is the current state of a single driver. This is updated +// regularly as driver health changes on the node. +type DriverInfo struct { + Attributes map[string]string + Detected bool + Healthy bool + HealthDescription string + UpdateTime time.Time +} + +func (di *DriverInfo) MergeHealthCheck(other *DriverInfo) { + di.Healthy = other.Healthy + di.HealthDescription = other.HealthDescription + di.UpdateTime = other.UpdateTime +} + +func (di *DriverInfo) MergeFingerprintInfo(other *DriverInfo) { + di.Detected = other.Detected + di.Attributes = other.Attributes +} + +func (di *DriverInfo) Equals(other *DriverInfo) bool { + if di == nil && other == nil { + return true + } + + if di == nil && other != nil || di != nil && other == nil { + return false + } + if !di.Detected == other.Detected { + return false + } + + if !di.Healthy == other.Healthy { + return false + } + + if strings.Compare(di.HealthDescription, other.HealthDescription) != 0 { + return false + } + + return true +} + // Node is a representation of a schedulable client node type Node struct { // ID is a unique identifier for the node. It can be constructed @@ -1241,6 +1285,9 @@ type Node struct { // retaining only MaxRetainedNodeEvents number at a time Events []*NodeEvent + // Drivers is a map of driver names to current driver information + Drivers map[string]*DriverInfo + // Raft Indexes CreateIndex uint64 ModifyIndex uint64 From a340baddf14f2e90912fd2210f85673d1bbf7438 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Thu, 22 Feb 2018 17:24:34 -0500 Subject: [PATCH 02/24] allow nomad to schedule based on the status of a client driver health check Slight updates for go style --- client/client.go | 24 +++++++-------- nomad/mock/mock.go | 1 + scheduler/feasible.go | 42 +++++++++++++++++--------- scheduler/feasible_test.go | 60 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 25 deletions(-) diff --git a/client/client.go b/client/client.go index 909fd368b4cd..eb801f0d6bef 100644 --- a/client/client.go +++ b/client/client.go @@ -1006,15 +1006,15 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons c.config.Node.Resources.Merge(response.Resources) } - for name, new_val := range response.Drivers { - old_val := c.config.Node.Drivers[name] - if new_val.Equals(old_val) { + for name, newVal := range response.Drivers { + oldVal := c.config.Node.Drivers[name] + if newVal.Equals(oldVal) { continue } - if old_val == nil { - c.config.Node.Drivers[name] = new_val + if oldVal == nil { + c.config.Node.Drivers[name] = newVal } else { - c.config.Node.Drivers[name].MergeFingerprintInfo(new_val) + c.config.Node.Drivers[name].MergeFingerprintInfo(newVal) } } if nodeHasChanged { @@ -1031,16 +1031,16 @@ func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckRespons nodeHasChanged := false // update the node with the latest driver health information - for name, new_val := range response.Drivers { - old_val := c.config.Node.Drivers[name] - if new_val.Equals(old_val) { + for name, newVal := range response.Drivers { + oldVal := c.config.Node.Drivers[name] + if newVal.Equals(oldVal) { continue } nodeHasChanged = true - if old_val == nil { - c.config.Node.Drivers[name] = new_val + if oldVal == nil { + c.config.Node.Drivers[name] = newVal } else { - c.config.Node.Drivers[name].MergeHealthCheck(new_val) + c.config.Node.Drivers[name].MergeHealthCheck(newVal) } } diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 3a8588b9cbad..2c2338dcfb5b 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -46,6 +46,7 @@ func Node() *structs.Node { }, }, }, + Drivers: make(map[string]*structs.DriverInfo), Links: map[string]string{ "consul": "foobar.dc1", }, diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 9ba83195c951..76d35138bee7 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -129,21 +129,37 @@ func (c *DriverChecker) Feasible(option *structs.Node) bool { func (c *DriverChecker) hasDrivers(option *structs.Node) bool { for driver := range c.drivers { driverStr := fmt.Sprintf("driver.%s", driver) - value, ok := option.Attributes[driverStr] - if !ok { - return false - } - enabled, err := strconv.ParseBool(value) - if err != nil { - c.ctx.Logger(). - Printf("[WARN] scheduler.DriverChecker: node %v has invalid driver setting %v: %v", - option.ID, driverStr, value) - return false - } + // TODO this is a compatibility mechanism- as of Nomad 0.8, nodes have a + // DriverInfo that corresponds with every driver. As a Nomad server might + // be on a later version than a Nomad client, we need to check for + // compatibility here to verify the client supports this. + if option.Drivers != nil { + driverInfo := option.Drivers[driverStr] + if driverInfo == nil { + c.ctx.Logger(). + Printf("[WARN] scheduler.DriverChecker: node %v has no driver info set for %v", + option.ID, driverStr) + return false + } + return driverInfo.Detected && driverInfo.Healthy + } else { + value, ok := option.Attributes[driverStr] + if !ok { + return false + } - if !enabled { - return false + enabled, err := strconv.ParseBool(value) + if err != nil { + c.ctx.Logger(). + Printf("[WARN] scheduler.DriverChecker: node %v has invalid driver setting %v: %v", + option.ID, driverStr, value) + return false + } + + if !enabled { + return false + } } } return true diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index a22f66e1e764..9ee70a5703c1 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -4,6 +4,7 @@ import ( "fmt" "reflect" "testing" + "time" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" @@ -125,6 +126,65 @@ func TestDriverChecker(t *testing.T) { } } +func TestDriverChecker_HealthChecks(t *testing.T) { + _, ctx := testContext(t) + nodes := []*structs.Node{ + mock.Node(), + mock.Node(), + mock.Node(), + } + nodes[0].Attributes["driver.foo"] = "1" + nodes[0].Drivers["driver.foo"] = &structs.DriverInfo{ + Detected: true, + Healthy: true, + HealthDescription: "running", + UpdateTime: time.Now(), + } + nodes[1].Attributes["driver.bar"] = "1" + nodes[1].Drivers["driver.bar"] = &structs.DriverInfo{ + Detected: true, + Healthy: false, + HealthDescription: "not running", + UpdateTime: time.Now(), + } + nodes[2].Attributes["driver.baz"] = "0" + nodes[2].Drivers["driver.baz"] = &structs.DriverInfo{ + Detected: false, + Healthy: false, + HealthDescription: "not running", + UpdateTime: time.Now(), + } + + testDrivers := []string{"foo", "bar", "baz"} + cases := []struct { + Node *structs.Node + Result bool + }{ + { + Node: nodes[0], + Result: true, + }, + { + Node: nodes[1], + Result: false, + }, + { + Node: nodes[2], + Result: false, + }, + } + + for i, c := range cases { + drivers := map[string]struct{}{ + testDrivers[i]: {}, + } + checker := NewDriverChecker(ctx, drivers) + if act := checker.Feasible(c.Node); act != c.Result { + t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) + } + } +} + func TestConstraintChecker(t *testing.T) { _, ctx := testContext(t) nodes := []*structs.Node{ From 9ec5a93bc1d8c009983feb8a62745595d43a5ad6 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Tue, 27 Feb 2018 14:57:10 -0500 Subject: [PATCH 03/24] fix scheduler driver name; create node structs file --- client/client.go | 13 ++++---- client/driver/docker.go | 11 +++++-- client/fingerprint_manager.go | 6 ++-- nomad/structs/node.go | 46 +++++++++++++++++++++++++++ nomad/structs/node_test.go | 60 +++++++++++++++++++++++++++++++++++ nomad/structs/structs.go | 44 ------------------------- scheduler/feasible.go | 5 +-- scheduler/feasible_test.go | 6 ++-- 8 files changed, 130 insertions(+), 61 deletions(-) create mode 100644 nomad/structs/node.go create mode 100644 nomad/structs/node_test.go diff --git a/client/client.go b/client/client.go index eb801f0d6bef..ed5f9cdd0aba 100644 --- a/client/client.go +++ b/client/client.go @@ -1008,12 +1008,9 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons for name, newVal := range response.Drivers { oldVal := c.config.Node.Drivers[name] - if newVal.Equals(oldVal) { - continue - } - if oldVal == nil { + if oldVal == nil && newVal != nil { c.config.Node.Drivers[name] = newVal - } else { + } else if newVal != nil && newVal.Detected != oldVal.Detected { c.config.Node.Drivers[name].MergeFingerprintInfo(newVal) } } @@ -1032,8 +1029,11 @@ func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckRespons // update the node with the latest driver health information for name, newVal := range response.Drivers { + if newVal == nil { + continue + } oldVal := c.config.Node.Drivers[name] - if newVal.Equals(oldVal) { + if newVal.HealthCheckEquals(oldVal) { continue } nodeHasChanged = true @@ -1675,6 +1675,7 @@ func (c *Client) watchNodeUpdates() { c.retryRegisterNode() hasChanged = false + timer.Reset(c.retryIntv(nodeUpdateRetryIntv)) case <-c.triggerNodeUpdate: if hasChanged { continue diff --git a/client/driver/docker.go b/client/driver/docker.go index 0fcfe03914f5..e937d2b4a01c 100644 --- a/client/driver/docker.go +++ b/client/driver/docker.go @@ -560,7 +560,14 @@ func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstru UpdateTime: time.Now(), } - _, _, err := d.dockerClients() + client, _, err := d.dockerClients() + if err != nil { + d.logger.Printf("[WARN] driver.docker: docker driver is available but is unresponsive to `docker ps`") + resp.AddDriverInfo("docker", unhealthy) + return err + } + + _, err = client.ListContainers(docker.ListContainersOptions{All: false}) if err != nil { d.logger.Printf("[WARN] driver.docker: docker driver is available but is unresponsive to `docker ps`") resp.AddDriverInfo("docker", unhealthy) @@ -582,7 +589,7 @@ func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstru // interval at which to do them. func (d *DockerDriver) GetHealthCheckInterval(req *cstructs.HealthCheckIntervalRequest, resp *cstructs.HealthCheckIntervalResponse) error { resp.Eligible = true - resp.Period = 1 * time.Minute + resp.Period = 3 * time.Second return nil } diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 0c2b3b4f9fae..4be82d16b2ad 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -206,9 +206,7 @@ func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint func (fm *FingerprintManager) healthCheck(name string, hc fingerprint.HealthCheck) error { request := &cstructs.HealthCheckRequest{} var response cstructs.HealthCheckResponse - if err := hc.HealthCheck(request, &response); err != nil { - return err - } + err := hc.HealthCheck(request, &response) if node := fm.updateHealthCheck(&response); node != nil { fm.nodeLock.Lock() @@ -216,7 +214,7 @@ func (fm *FingerprintManager) healthCheck(name string, hc fingerprint.HealthChec fm.nodeLock.Unlock() } - return nil + return err } // setupFingerprints is used to fingerprint the node to see if these attributes are diff --git a/nomad/structs/node.go b/nomad/structs/node.go new file mode 100644 index 000000000000..80daed172f21 --- /dev/null +++ b/nomad/structs/node.go @@ -0,0 +1,46 @@ +package structs + +import ( + "strings" + "time" +) + +// DriverInfo is the current state of a single driver. This is updated +// regularly as driver health changes on the node. +type DriverInfo struct { + Attributes map[string]string + Detected bool + Healthy bool + HealthDescription string + UpdateTime time.Time +} + +func (di *DriverInfo) MergeHealthCheck(other *DriverInfo) { + di.Healthy = other.Healthy + di.HealthDescription = other.HealthDescription + di.UpdateTime = other.UpdateTime +} + +func (di *DriverInfo) MergeFingerprintInfo(other *DriverInfo) { + di.Detected = other.Detected + di.Attributes = other.Attributes +} + +// DriverInfo determines if two driver info objects are equal..As this is used +// in the process of health checking, we only check the fields that are +// computed by the health checker. In the future, this will be merged. +func (di *DriverInfo) HealthCheckEquals(other *DriverInfo) bool { + if di == nil && other != nil || di != nil && other == nil { + return false + } + + if di.Healthy != other.Healthy { + return false + } + + if strings.Compare(di.HealthDescription, other.HealthDescription) != 0 { + return false + } + + return true +} diff --git a/nomad/structs/node_test.go b/nomad/structs/node_test.go new file mode 100644 index 000000000000..9a5ff7ae2a06 --- /dev/null +++ b/nomad/structs/node_test.go @@ -0,0 +1,60 @@ +package structs + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDriverInfoEquals(t *testing.T) { + require := require.New(t) + var driverInfoTest = []struct { + input []*DriverInfo + expected bool + errorMsg string + }{ + { + []*DriverInfo{ + &DriverInfo{ + Healthy: true, + }, + &DriverInfo{ + Healthy: false, + }, + }, + false, + "Different healthy values should not be equal.", + }, + { + []*DriverInfo{ + &DriverInfo{ + HealthDescription: "not running", + }, + &DriverInfo{ + HealthDescription: "running", + }, + }, + false, + "Different health description values should not be equal.", + }, + { + []*DriverInfo{ + &DriverInfo{ + Healthy: true, + HealthDescription: "running", + }, + &DriverInfo{ + Healthy: true, + HealthDescription: "running", + }, + }, + true, + "Different health description values should not be equal.", + }, + } + for _, testCase := range driverInfoTest { + first := testCase.input[0] + second := testCase.input[1] + require.Equal(testCase.expected, first.HealthCheckEquals(second), testCase.errorMsg) + } +} diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 3da2258d7f4c..68cf611cba30 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1163,50 +1163,6 @@ func ValidNodeStatus(status string) bool { } } -// DriverInfo is the current state of a single driver. This is updated -// regularly as driver health changes on the node. -type DriverInfo struct { - Attributes map[string]string - Detected bool - Healthy bool - HealthDescription string - UpdateTime time.Time -} - -func (di *DriverInfo) MergeHealthCheck(other *DriverInfo) { - di.Healthy = other.Healthy - di.HealthDescription = other.HealthDescription - di.UpdateTime = other.UpdateTime -} - -func (di *DriverInfo) MergeFingerprintInfo(other *DriverInfo) { - di.Detected = other.Detected - di.Attributes = other.Attributes -} - -func (di *DriverInfo) Equals(other *DriverInfo) bool { - if di == nil && other == nil { - return true - } - - if di == nil && other != nil || di != nil && other == nil { - return false - } - if !di.Detected == other.Detected { - return false - } - - if !di.Healthy == other.Healthy { - return false - } - - if strings.Compare(di.HealthDescription, other.HealthDescription) != 0 { - return false - } - - return true -} - // Node is a representation of a schedulable client node type Node struct { // ID is a unique identifier for the node. It can be constructed diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 76d35138bee7..359591c8048e 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -135,13 +135,14 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { // be on a later version than a Nomad client, we need to check for // compatibility here to verify the client supports this. if option.Drivers != nil { - driverInfo := option.Drivers[driverStr] + driverInfo := option.Drivers[driver] if driverInfo == nil { c.ctx.Logger(). Printf("[WARN] scheduler.DriverChecker: node %v has no driver info set for %v", - option.ID, driverStr) + option.ID, driver) return false } + return driverInfo.Detected && driverInfo.Healthy } else { value, ok := option.Attributes[driverStr] diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 9ee70a5703c1..4fc40d70e343 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -134,21 +134,21 @@ func TestDriverChecker_HealthChecks(t *testing.T) { mock.Node(), } nodes[0].Attributes["driver.foo"] = "1" - nodes[0].Drivers["driver.foo"] = &structs.DriverInfo{ + nodes[0].Drivers["foo"] = &structs.DriverInfo{ Detected: true, Healthy: true, HealthDescription: "running", UpdateTime: time.Now(), } nodes[1].Attributes["driver.bar"] = "1" - nodes[1].Drivers["driver.bar"] = &structs.DriverInfo{ + nodes[1].Drivers["bar"] = &structs.DriverInfo{ Detected: true, Healthy: false, HealthDescription: "not running", UpdateTime: time.Now(), } nodes[2].Attributes["driver.baz"] = "0" - nodes[2].Drivers["driver.baz"] = &structs.DriverInfo{ + nodes[2].Drivers["baz"] = &structs.DriverInfo{ Detected: false, Healthy: false, HealthDescription: "not running", From 7c34605e26bf00980deed2a73f69c51e686c0d5d Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 28 Feb 2018 12:40:54 -0500 Subject: [PATCH 04/24] fix up gofmt --- nomad/structs/node_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nomad/structs/node_test.go b/nomad/structs/node_test.go index 9a5ff7ae2a06..877b8cabb556 100644 --- a/nomad/structs/node_test.go +++ b/nomad/structs/node_test.go @@ -15,10 +15,10 @@ func TestDriverInfoEquals(t *testing.T) { }{ { []*DriverInfo{ - &DriverInfo{ + { Healthy: true, }, - &DriverInfo{ + { Healthy: false, }, }, @@ -27,10 +27,10 @@ func TestDriverInfoEquals(t *testing.T) { }, { []*DriverInfo{ - &DriverInfo{ + { HealthDescription: "not running", }, - &DriverInfo{ + { HealthDescription: "running", }, }, @@ -39,11 +39,11 @@ func TestDriverInfoEquals(t *testing.T) { }, { []*DriverInfo{ - &DriverInfo{ + { Healthy: true, HealthDescription: "running", }, - &DriverInfo{ + { Healthy: true, HealthDescription: "running", }, From 9dfb5c6b463671fc77232e4ed07a9895e515f2ae Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 28 Feb 2018 14:32:54 -0500 Subject: [PATCH 05/24] go style; update comments --- client/client.go | 2 ++ client/driver/docker.go | 2 +- client/driver/docker_test.go | 2 -- nomad/structs/node.go | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/client/client.go b/client/client.go index ed5f9cdd0aba..5d7ea1d106e0 100644 --- a/client/client.go +++ b/client/client.go @@ -1021,6 +1021,8 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons return c.config.Node } +// updateNodeFromHealthCheck receives a health check response and updates the +// node accordingly func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckResponse) *structs.Node { c.configLock.Lock() defer c.configLock.Unlock() diff --git a/client/driver/docker.go b/client/driver/docker.go index e937d2b4a01c..2812a6fbe549 100644 --- a/client/driver/docker.go +++ b/client/driver/docker.go @@ -589,7 +589,7 @@ func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstru // interval at which to do them. func (d *DockerDriver) GetHealthCheckInterval(req *cstructs.HealthCheckIntervalRequest, resp *cstructs.HealthCheckIntervalResponse) error { resp.Eligible = true - resp.Period = 3 * time.Second + resp.Period = 1 * time.Minute return nil } diff --git a/client/driver/docker_test.go b/client/driver/docker_test.go index 3063a79aa46a..316d9e1edf24 100644 --- a/client/driver/docker_test.go +++ b/client/driver/docker_test.go @@ -267,8 +267,6 @@ func TestDockerDriver_Check_DockerHealthStatus(t *testing.T) { require := require.New(t) - // This seems fragile, so we might need to reconsider this test if it - // proves flaky expectedAddr, err := sockaddr.GetInterfaceIP("docker0") if err != nil { t.Fatalf("unable to get ip for docker0: %v", err) diff --git a/nomad/structs/node.go b/nomad/structs/node.go index 80daed172f21..7910e1ba7d91 100644 --- a/nomad/structs/node.go +++ b/nomad/structs/node.go @@ -15,12 +15,16 @@ type DriverInfo struct { UpdateTime time.Time } +// MergeHealthCheck merges information from a health check for a drier into a +// node's driver info func (di *DriverInfo) MergeHealthCheck(other *DriverInfo) { di.Healthy = other.Healthy di.HealthDescription = other.HealthDescription di.UpdateTime = other.UpdateTime } +// MergeFingerprint merges information from fingerprinting a node for a driver +// into a node's driver info for that driver. func (di *DriverInfo) MergeFingerprintInfo(other *DriverInfo) { di.Detected = other.Detected di.Attributes = other.Attributes From 70bebd1eedf8c6a3a153597a8e51454056c3ba58 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Thu, 1 Mar 2018 10:25:16 -0500 Subject: [PATCH 06/24] fix up scheduler mocks --- nomad/mock/mock.go | 1 - scheduler/feasible_test.go | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 2c2338dcfb5b..3a8588b9cbad 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -46,7 +46,6 @@ func Node() *structs.Node { }, }, }, - Drivers: make(map[string]*structs.DriverInfo), Links: map[string]string{ "consul": "foobar.dc1", }, diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 4fc40d70e343..7f4357d8f2fa 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -133,6 +133,9 @@ func TestDriverChecker_HealthChecks(t *testing.T) { mock.Node(), mock.Node(), } + for _, e := range nodes { + e.Drivers = make(map[string]*structs.DriverInfo) + } nodes[0].Attributes["driver.foo"] = "1" nodes[0].Drivers["foo"] = &structs.DriverInfo{ Detected: true, From fd25db9e1dd687e71bac92ee9f2705b90e2968b2 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Thu, 1 Mar 2018 17:25:34 -0500 Subject: [PATCH 07/24] updating comments; locking concurrent node access --- client/client.go | 1 - client/driver/mock_driver.go | 2 +- client/fingerprint_manager.go | 7 ++++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/client/client.go b/client/client.go index 5d7ea1d106e0..3662cf6b2cfc 100644 --- a/client/client.go +++ b/client/client.go @@ -1677,7 +1677,6 @@ func (c *Client) watchNodeUpdates() { c.retryRegisterNode() hasChanged = false - timer.Reset(c.retryIntv(nodeUpdateRetryIntv)) case <-c.triggerNodeUpdate: if hasChanged { continue diff --git a/client/driver/mock_driver.go b/client/driver/mock_driver.go index cab5844ccc00..09a86f72deda 100644 --- a/client/driver/mock_driver.go +++ b/client/driver/mock_driver.go @@ -274,7 +274,7 @@ func (m *MockDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstruct // GetHealthCheckInterval implements the interface for HealthCheck and indicates // that mock driver should be checked periodically. Returns a boolean -// indicating if ti should be checked, and the duration at which to do this +// indicating if it should be checked, and the duration at which to do this // check. func (m *MockDriver) GetHealthCheckInterval(req *cstructs.HealthCheckIntervalRequest, resp *cstructs.HealthCheckIntervalResponse) error { resp.Eligible = true diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 4be82d16b2ad..db9eba8a638b 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -138,7 +138,12 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Fingerprint) (bool, error) { request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} var response cstructs.FingerprintResponse - if err := f.Fingerprint(request, &response); err != nil { + + fm.nodeLock.Lock() + err := f.Fingerprint(request, &response) + fm.nodeLock.Unlock() + + if err != nil { return false, err } From 865b7e0aceff542acf1f6b72e0f6b996b48d12c6 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Fri, 2 Mar 2018 20:21:13 -0500 Subject: [PATCH 08/24] fix up racy tests --- client/client_test.go | 75 ++++++------ client/fingerprint_manager.go | 4 +- client/fingerprint_manager_test.go | 178 ++++++++++++++++------------- 3 files changed, 142 insertions(+), 115 deletions(-) diff --git a/client/client_test.go b/client/client_test.go index 821ce40d2d0f..2ec869320a7d 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -142,42 +142,49 @@ func TestClient_Fingerprint_Periodic(t *testing.T) { node := c1.config.Node mockDriverName := "driver.mock_driver" - // Ensure the mock driver is registered on the client - testutil.WaitForResult(func() (bool, error) { - mockDriverStatus := node.Attributes[mockDriverName] - if mockDriverStatus == "" { - return false, fmt.Errorf("mock driver attribute should be set on the client") - } + { + // Ensure the mock driver is registered on the client + testutil.WaitForResult(func() (bool, error) { + c1.configLock.Lock() + mockDriverStatus := node.Attributes[mockDriverName] + mockDriverInfo := node.Drivers["mock_driver"] + c1.configLock.Unlock() + if mockDriverStatus == "" { + return false, fmt.Errorf("mock driver attribute should be set on the client") + } - // assert that the Driver information for the node is also set correctly - mockDriverInfo := node.Drivers["mock_driver"] - if mockDriverInfo == nil { - return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers") - } - if !mockDriverInfo.Detected { - return false, fmt.Errorf("mock driver should be set as healthy") - } - if !mockDriverInfo.Healthy { - return false, fmt.Errorf("mock driver should be set as healthy") - } - if mockDriverInfo.HealthDescription == "" { - return false, fmt.Errorf("mock driver description should not be empty") - } - return true, nil - }, func(err error) { - t.Fatalf("err: %v", err) - }) + // assert that the Driver information for the node is also set correctly + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver should be set as healthy") + } + if !mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver should be set as healthy") + } + if mockDriverInfo.HealthDescription == "" { + return false, fmt.Errorf("mock driver description should not be empty") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + } - // Ensure that the client fingerprinter eventually removes this attribute - testutil.WaitForResult(func() (bool, error) { - mockDriverStatus := node.Attributes[mockDriverName] - if mockDriverStatus != "" { - return false, fmt.Errorf("mock driver attribute should not be set on the client") - } - return true, nil - }, func(err error) { - t.Fatalf("err: %v", err) - }) + { + testutil.WaitForResult(func() (bool, error) { + c1.configLock.Lock() + mockDriverStatus := node.Attributes[mockDriverName] + c1.configLock.Unlock() + if mockDriverStatus != "" { + return false, fmt.Errorf("mock driver attribute should not be set on the client") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + } } // TestClient_MixedTLS asserts that when a server is running with TLS enabled diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index db9eba8a638b..2db0704e61e0 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -136,10 +136,10 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { // particular driver. Takes the FingerprintResponse and converts it to the // proper DriverInfo update and then sets the prefix attributes as well func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Fingerprint) (bool, error) { - request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} var response cstructs.FingerprintResponse fm.nodeLock.Lock() + request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} err := f.Fingerprint(request, &response) fm.nodeLock.Unlock() @@ -187,10 +187,10 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge // is meant to be run continuously, a process is launched to perform this // fingerprint on an ongoing basis in the background. func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint) (bool, error) { - request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} var response cstructs.FingerprintResponse fm.nodeLock.Lock() + request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} err := f.Fingerprint(request, &response) fm.nodeLock.Unlock() diff --git a/client/fingerprint_manager_test.go b/client/fingerprint_manager_test.go index c24d9c59ed5b..7a0615f6a645 100644 --- a/client/fingerprint_manager_test.go +++ b/client/fingerprint_manager_test.go @@ -154,28 +154,35 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { err := fm.Run() require.Nil(err) - // Ensure the mock driver is registered and healthy on the client - testutil.WaitForResult(func() (bool, error) { - mockDriverStatus := node.Attributes["driver.mock_driver"] - if mockDriverStatus == "" { - return false, fmt.Errorf("mock driver attribute should be set on the client") - } - return true, nil - }, func(err error) { - t.Fatalf("err: %v", err) - }) - + { + // Ensure the mock driver is registered and healthy on the client + testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + mockDriverStatus := node.Attributes["driver.mock_driver"] + fm.nodeLock.Unlock() + if mockDriverStatus == "" { + return false, fmt.Errorf("mock driver attribute should be set on the client") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + } // Ensure that the client fingerprinter eventually removes this attribute and // marks the driver as unhealthy - testutil.WaitForResult(func() (bool, error) { - mockDriverStatus := node.Attributes["driver.mock_driver"] - if mockDriverStatus != "" { - return false, fmt.Errorf("mock driver attribute should not be set on the client") - } - return true, nil - }, func(err error) { - t.Fatalf("err: %v", err) - }) + { + testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + mockDriverStatus := node.Attributes["driver.mock_driver"] + fm.nodeLock.Unlock() + if mockDriverStatus != "" { + return false, fmt.Errorf("mock driver attribute should not be set on the client") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + } } // This is a temporary measure to check that a driver has both attributes on a @@ -273,32 +280,18 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { require := require.New(t) node := &structs.Node{ - Drivers: make(map[string]*structs.DriverInfo, 0), - } - updateNode := func(resp *cstructs.FingerprintResponse) *structs.Node { - for k, v := range resp.Drivers { - if node.Drivers[k] == nil { - node.Drivers[k] = v - } else { - node.Drivers[k].MergeFingerprintInfo(v) - } - } - return node - } - updateHealthCheck := func(resp *cstructs.HealthCheckResponse) *structs.Node { - for k, v := range resp.Drivers { - if node.Drivers[k] == nil { - node.Drivers[k] = v - } else { - node.Drivers[k].MergeHealthCheck(v) - } - } - return node + Attributes: make(map[string]string, 0), + Links: make(map[string]string, 0), + Resources: &structs.Resources{}, + Drivers: make(map[string]*structs.DriverInfo, 0), } + testConfig := config.Config{Node: node} + testClient := &Client{config: &testConfig} + conf := config.DefaultConfig() conf.Options = map[string]string{ "test.shutdown_periodic_after": "true", - "test.shutdown_periodic_duration": "2", + "test.shutdown_periodic_duration": "3", } getConfig := func() *config.Config { return conf @@ -313,49 +306,76 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { getConfig, node, shutdownCh, - updateNode, - updateHealthCheck, + testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, testLogger(), ) err := fm.Run() require.Nil(err) - // Ensure the mock driver is registered and healthy on the client - testutil.WaitForResult(func() (bool, error) { - mockDriverInfo := node.Drivers["mock_driver"] - if mockDriverInfo == nil { - return false, fmt.Errorf("mock driver info should be set on the client") - } - if !mockDriverInfo.Detected { - return false, fmt.Errorf("mock driver info should be detected") - } - if !mockDriverInfo.Healthy { - return false, fmt.Errorf("mock driver info should be healthy") - } - return true, nil - }, func(err error) { - t.Fatalf("err: %v", err) - }) - - // Ensure that the client health check eventually removes this attribute and - // marks the driver as unhealthy - testutil.WaitForResult(func() (bool, error) { - - mockDriverInfo := node.Drivers["mock_driver"] - if mockDriverInfo == nil { - return false, fmt.Errorf("mock driver info should be set on the client") - } - if !mockDriverInfo.Detected { - return false, fmt.Errorf("mock driver should be detected") - } - if mockDriverInfo.Healthy { - return false, fmt.Errorf("mock driver should not be healthy") - } - return true, nil - }, func(err error) { - t.Fatalf("err: %v", err) - }) + { + // Ensure the mock driver is registered and healthy on the client + testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + mockDriverInfo := node.Drivers["mock_driver"] + fm.nodeLock.Unlock() + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver info should be set on the client") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver info should be detected") + } + if !mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver info should be healthy") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + } + { + // Ensure that the client health check eventually removes this attribute and + // marks the driver as unhealthy + testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + mockDriverInfo := node.Drivers["mock_driver"] + fm.nodeLock.Unlock() + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver info should be set on the client") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver info should be detected") + } + if !mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver info should be healthy") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + } + { + // Ensure that the client health check eventually removes this attribute and + // marks the driver as unhealthy + testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + mockDriverInfo := node.Drivers["mock_driver"] + fm.nodeLock.Unlock() + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver info should be set on the client") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver should be detected") + } + if mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver should not be healthy") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + } } func TestFimgerprintManager_Run_InWhitelist(t *testing.T) { From 240fee48486ff4c90b28e44c84ce8177fa78cc69 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Tue, 6 Mar 2018 16:03:24 -0500 Subject: [PATCH 09/24] fix up codereview feedback --- client/driver/docker.go | 6 +++--- client/fingerprint_manager.go | 13 ++++++++++--- nomad/structs/node.go | 3 +-- scheduler/feasible.go | 8 ++++---- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/client/driver/docker.go b/client/driver/docker.go index 2812a6fbe549..c2fc516ce87d 100644 --- a/client/driver/docker.go +++ b/client/driver/docker.go @@ -551,7 +551,7 @@ func (d *DockerDriver) Fingerprint(req *cstructs.FingerprintRequest, resp *cstru return nil } -// HealthChck implements the interface for the HealthCheck interface. This +// HealthCheck implements the interface for the HealthCheck interface. This // performs a health check on the docker driver, asserting whether the docker // driver is responsive to a `docker ps` command. func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstructs.HealthCheckResponse) error { @@ -562,14 +562,14 @@ func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstru client, _, err := d.dockerClients() if err != nil { - d.logger.Printf("[WARN] driver.docker: docker driver is available but is unresponsive to `docker ps`") + d.logger.Printf("[WARN] driver.docker: failed to retrieve Docker client in the process of a docker health check: %v", err) resp.AddDriverInfo("docker", unhealthy) return err } _, err = client.ListContainers(docker.ListContainersOptions{All: false}) if err != nil { - d.logger.Printf("[WARN] driver.docker: docker driver is available but is unresponsive to `docker ps`") + d.logger.Printf("[WARN] driver.docker: failed to list Docker containers in the process of a docker health check: %v", err) resp.AddDriverInfo("docker", unhealthy) return err } diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 2db0704e61e0..f73bdc9c7422 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -2,6 +2,7 @@ package client import ( "log" + "strings" "sync" "time" @@ -24,7 +25,7 @@ type FingerprintManager struct { // associated node updateNodeAttributes func(*cstructs.FingerprintResponse) *structs.Node - // UpdateHealthCheck is a callback to the client to update the state of the + // updateHealthCheck is a callback to the client to update the state of the // node for resources that require a health check updateHealthCheck func(*cstructs.HealthCheckResponse) *structs.Node logger *log.Logger @@ -76,7 +77,7 @@ func (fm *FingerprintManager) runHealthCheck(hc fingerprint.HealthCheck, period case <-time.After(period): err := fm.healthCheck(name, hc) if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %+v", name, err) + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) continue } @@ -151,8 +152,14 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge // support this. Doing this so that we can enable this iteratively and also // in a backwards compatible way, where node attributes for drivers will // eventually be phased out. + strippedAttributes := make(map[string]string, 0) + for k, v := range response.Attributes { + copy := k + strings.Replace(copy, "driver.", "", 1) + strippedAttributes[k] = v + } di := &structs.DriverInfo{ - Attributes: response.Attributes, + Attributes: strippedAttributes, Detected: response.Detected, } response.AddDriver(name, di) diff --git a/nomad/structs/node.go b/nomad/structs/node.go index 7910e1ba7d91..d8b30bf8fe07 100644 --- a/nomad/structs/node.go +++ b/nomad/structs/node.go @@ -1,7 +1,6 @@ package structs import ( - "strings" "time" ) @@ -42,7 +41,7 @@ func (di *DriverInfo) HealthCheckEquals(other *DriverInfo) bool { return false } - if strings.Compare(di.HealthDescription, other.HealthDescription) != 0 { + if di.HealthDescription == other.HealthDescription { return false } diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 359591c8048e..35582b4845c0 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -130,10 +130,10 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { for driver := range c.drivers { driverStr := fmt.Sprintf("driver.%s", driver) - // TODO this is a compatibility mechanism- as of Nomad 0.8, nodes have a - // DriverInfo that corresponds with every driver. As a Nomad server might - // be on a later version than a Nomad client, we need to check for - // compatibility here to verify the client supports this. + // COMPAT: Remove in 0.X: As of Nomad 0.8, nodes have a DriverInfo that + // corresponds with every driver. As a Nomad server might be on a later + // version than a Nomad client, we need to check for compatibility here + // to verify the client supports this. if option.Drivers != nil { driverInfo := option.Drivers[driver] if driverInfo == nil { From 521fbd4f97251a60059c937281e60c21f0108ded Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 7 Mar 2018 10:31:47 -0500 Subject: [PATCH 10/24] refresh driver information for non-health checking drivers periodically --- client/fingerprint_manager.go | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index f73bdc9c7422..c2541d435e4d 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -110,21 +110,26 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { availDrivers = append(availDrivers, name) } - p, period := d.Periodic() - if p { - go fm.runFingerprint(d, period, name) - } - // We should only run the health check task if the driver is detected // Note that if the driver is detected later in a periodic health check, // this won't automateically trigger the periodic health check. if hc, ok := d.(fingerprint.HealthCheck); ok && detected { + p, period := d.Periodic() + if p { + go fm.runFingerprint(d, period, name) + } + req := &cstructs.HealthCheckIntervalRequest{} resp := &cstructs.HealthCheckIntervalResponse{} hc.GetHealthCheckInterval(req, resp) if resp.Eligible { go fm.runHealthCheck(hc, resp.Period, name) } + } else { + p, period := d.Periodic() + if p { + go fm.runFingerprintDriver(d, period, name) + } } } @@ -132,6 +137,24 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return nil } +func (fm *FingerprintManager) runFingerprintDriver(f fingerprint.Fingerprint, period time.Duration, name string) { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting driver %s every %v", name, period) + + for { + select { + case <-time.After(period): + _, err := fm.fingerprintDriver(name, f) + if err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) + continue + } + + case <-fm.shutdownCh: + return + } + } +} + // fingerprintDriver is a temporary solution to move towards DriverInfo and // away from annotating a node's attributes to demonstrate support for a // particular driver. Takes the FingerprintResponse and converts it to the From 8a0ed4e5ffd5f0bdac15c2fff4ebfecb9c8137d8 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 7 Mar 2018 11:58:14 -0500 Subject: [PATCH 11/24] improve tests --- client/client.go | 4 ++++ client/client_test.go | 16 +++++++++++++++- client/fingerprint_manager.go | 26 ++++++++++++++++++++------ client/fingerprint_manager_test.go | 19 +++++++++++++++++++ 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/client/client.go b/client/client.go index 3662cf6b2cfc..4019941aeb19 100644 --- a/client/client.go +++ b/client/client.go @@ -1036,6 +1036,10 @@ func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckRespons } oldVal := c.config.Node.Drivers[name] if newVal.HealthCheckEquals(oldVal) { + + // make sure we accurately reflect the last time a health check has been + // performed for the driver. + oldVal.UpdateTime = newVal.UpdateTime continue } nodeHasChanged = true diff --git a/client/client_test.go b/client/client_test.go index 2ec869320a7d..c8a145c69d9c 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -158,7 +158,7 @@ func TestClient_Fingerprint_Periodic(t *testing.T) { return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers") } if !mockDriverInfo.Detected { - return false, fmt.Errorf("mock driver should be set as healthy") + return false, fmt.Errorf("mock driver should be set as detected") } if !mockDriverInfo.Healthy { return false, fmt.Errorf("mock driver should be set as healthy") @@ -176,10 +176,24 @@ func TestClient_Fingerprint_Periodic(t *testing.T) { testutil.WaitForResult(func() (bool, error) { c1.configLock.Lock() mockDriverStatus := node.Attributes[mockDriverName] + mockDriverInfo := node.Drivers["mock_driver"] c1.configLock.Unlock() if mockDriverStatus != "" { return false, fmt.Errorf("mock driver attribute should not be set on the client") } + // assert that the Driver information for the node is also set correctly + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers") + } + if !mockDriverInfo.Detected { + return false, fmt.Errorf("mock driver should be set as detected") + } + if mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver should be set as healthy") + } + if mockDriverInfo.HealthDescription == "" { + return false, fmt.Errorf("mock driver description should not be empty") + } return true, nil }, func(err error) { t.Fatalf("err: %v", err) diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index c2541d435e4d..817a86bd1ae6 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -99,6 +99,10 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return err } + if hc, ok := d.(fingerprint.HealthCheck); ok { + fm.healthCheck(name, hc) + } + detected, err := fm.fingerprintDriver(name, d) if err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) @@ -171,7 +175,9 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge return false, err } - // TODO This is a temporary measure, as eventually all drivers will need to + // COMPAT: Remove in 0.9: As of Nomad 0.8 there is a temporary measure to + // update all driver attributes to its corresponding driver info object, + // as eventually all drivers will need to // support this. Doing this so that we can enable this iteratively and also // in a backwards compatible way, where node attributes for drivers will // eventually be phased out. @@ -193,13 +199,21 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge fm.nodeLock.Unlock() } - if hc, ok := f.(fingerprint.HealthCheck); ok { - fm.healthCheck(name, hc) - } else { - + // COMPAT: Remove in 0.9: As of Nomad 0.8, there is a driver info for every + // driver. As a compatibility mechanism, we proactively set this driver info + // for drivers which do not yet support health checks. + if _, ok := f.(fingerprint.HealthCheck); !ok { resp := &cstructs.HealthCheckResponse{ Drivers: map[string]*structs.DriverInfo{ - name: di, + name: &structs.DriverInfo{ + // This achieves backwards compatibility- before, we only relied on the + // status of the fingerprint method to determine whether a driver was + // enabled. For drivers without health checks yet enabled, we should always + // set this to true, and then once we enable health checks, this can + // dynamically update this value. + Healthy: true, + UpdateTime: time.Now(), + }, }, } diff --git a/client/fingerprint_manager_test.go b/client/fingerprint_manager_test.go index 7a0615f6a645..b3727287172d 100644 --- a/client/fingerprint_manager_test.go +++ b/client/fingerprint_manager_test.go @@ -113,6 +113,7 @@ func TestFingerprintManager_Fingerprint_Run(t *testing.T) { require.NotEqual("", node.Attributes["driver.raw_exec"]) require.True(node.Drivers["raw_exec"].Detected) + require.True(node.Drivers["raw_exec"].Healthy) } func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { @@ -273,6 +274,24 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { }, func(err error) { t.Fatalf("err: %v", err) }) + + // Ensure the mock driver is de-registered when it becomes unhealthy + testutil.WaitForResult(func() (bool, error) { + mockDriverAttribute := node.Attributes["driver.mock_driver"] + if mockDriverAttribute == "" { + return false, fmt.Errorf("mock driver info should be set on the client attributes") + } + mockDriverInfo := node.Drivers["mock_driver"] + if mockDriverInfo == nil { + return false, fmt.Errorf("mock driver info should be set on the client") + } + if !mockDriverInfo.Healthy { + return false, fmt.Errorf("mock driver info should not be healthy") + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) } func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { From 3ad03d9498d0f622a4acef02c98dccb6fe6f6997 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 7 Mar 2018 10:34:38 -0800 Subject: [PATCH 12/24] notes from walk through --- client/client.go | 37 +++++++++++++++++++++++++++++++++++ client/fingerprint_manager.go | 2 ++ 2 files changed, 39 insertions(+) diff --git a/client/client.go b/client/client.go index 4019941aeb19..7a82824aaefe 100644 --- a/client/client.go +++ b/client/client.go @@ -1006,6 +1006,14 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons c.config.Node.Resources.Merge(response.Resources) } + // XXX -------------- + // All drivers only expose DriverInfo + // This becomes a separate setter. So we have: + // * updateNodeFromFingerprinters + // * updateNodeFromDriver(fingerprint, health *DriverInfo) + /// TODO is anything updating the Node.Attributes based on updating + // driverinfos + for name, newVal := range response.Drivers { oldVal := c.config.Node.Drivers[name] if oldVal == nil && newVal != nil { @@ -1021,6 +1029,35 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons return c.config.Node } +/* Model for converging the periodic fingerprinting and health checking: + +func watcher(driver Driver) { + p := driver.Periodic() + healthChecker driver.HealthChecker() + hDur := healthChecker.Periodic() + + var t1, t2 time.Timer + + if healthChecker { + t2 = time.NewTimer(hDur) + } + + for { + select { + case shutdown: + case periodicFingerprint <- t1.C: + // Do stuff + t1.Reset(its duration) + case healthCheck <- t2.C: // Nil: never fire + // Do stuff + newHealth := driver.HealthCheck() + updateNodeFromDriver(nil, newHealth) + t2.Reset(its duration) + } + } +} +*/ + // updateNodeFromHealthCheck receives a health check response and updates the // node accordingly func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckResponse) *structs.Node { diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 817a86bd1ae6..6a8618f96a86 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -68,6 +68,7 @@ func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period t } } +// XXX Do we need this for now // runHealthCheck runs each health check individually on an ongoing basis func (fm *FingerprintManager) runHealthCheck(hc fingerprint.HealthCheck, period time.Duration, name string) { fm.logger.Printf("[DEBUG] client.fingerprint_manager: healthchecking %s every %v", name, period) @@ -294,6 +295,7 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { go fm.runFingerprint(f, period, name) } + // XXX This code path doesn't exist right now if hc, ok := f.(fingerprint.HealthCheck); ok { req := &cstructs.HealthCheckIntervalRequest{} var resp cstructs.HealthCheckIntervalResponse From 8aefd294e1282ed52c1fa9c84e0ce16f628ed286 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Fri, 9 Mar 2018 12:28:01 -0500 Subject: [PATCH 13/24] Code review feedback --- client/client.go | 76 ++++++++++--- client/fingerprint_manager.go | 171 +++++++++++++---------------- client/fingerprint_manager_test.go | 41 +++---- client/structs/structs.go | 13 --- 4 files changed, 155 insertions(+), 146 deletions(-) diff --git a/client/client.go b/client/client.go index 7a82824aaefe..6de239d3899b 100644 --- a/client/client.go +++ b/client/client.go @@ -259,7 +259,8 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic } fingerprintManager := NewFingerprintManager(c.GetConfig, c.config.Node, - c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromHealthCheck, c.logger) + c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromHealthCheck, + c.updateNodeFromDriver, c.logger) // Fingerprint the node and scan for drivers if err := fingerprintManager.Run(); err != nil { @@ -1006,23 +1007,66 @@ func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintRespons c.config.Node.Resources.Merge(response.Resources) } - // XXX -------------- - // All drivers only expose DriverInfo - // This becomes a separate setter. So we have: - // * updateNodeFromFingerprinters - // * updateNodeFromDriver(fingerprint, health *DriverInfo) - /// TODO is anything updating the Node.Attributes based on updating - // driverinfos + if nodeHasChanged { + c.updateNode() + } - for name, newVal := range response.Drivers { - oldVal := c.config.Node.Drivers[name] - if oldVal == nil && newVal != nil { - c.config.Node.Drivers[name] = newVal - } else if newVal != nil && newVal.Detected != oldVal.Detected { - c.config.Node.Drivers[name].MergeFingerprintInfo(newVal) + return c.config.Node +} + +// updateNodeFromDriver receives either a fingerprint of the driver or its +// health and merges this into a single DriverInfo object +func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs.DriverInfo) *structs.Node { + c.configLock.Lock() + defer c.configLock.Unlock() + + var hasChanged bool + + if fingerprint != nil { + if c.config.Node.Drivers[name] == nil { + hasChanged = true + c.config.Node.Drivers[name] = fingerprint + } else { + if c.config.Node.Drivers[name].Detected != fingerprint.Detected { + hasChanged = true + c.config.Node.Drivers[name].Detected = fingerprint.Detected + } + + for attrName, newVal := range fingerprint.Attributes { + oldVal := c.config.Node.Drivers[name].Attributes[attrName] + if oldVal == newVal { + continue + } + + hasChanged = true + if newVal == "" { + delete(c.config.Node.Attributes, attrName) + } else { + c.config.Node.Attributes[attrName] = newVal + } + } } } - if nodeHasChanged { + + if health != nil { + if c.config.Node.Drivers[name] == nil { + hasChanged = true + c.config.Node.Drivers[name] = health + } else { + oldVal := c.config.Node.Drivers[name] + if !health.HealthCheckEquals(oldVal) { + hasChanged = true + c.config.Node.Drivers[name].MergeHealthCheck(health) + } else { + // make sure we accurately reflect the last time a health check has been + // performed for the driver. + oldVal.UpdateTime = health.UpdateTime + } + } + } + + if hasChanged { + c.config.Node.Drivers[name].UpdateTime = time.Now() c.updateNode() } @@ -1060,6 +1104,7 @@ func watcher(driver Driver) { // updateNodeFromHealthCheck receives a health check response and updates the // node accordingly +// TODO is this even needed? func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckResponse) *structs.Node { c.configLock.Lock() defer c.configLock.Unlock() @@ -1083,7 +1128,6 @@ func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckRespons if oldVal == nil { c.config.Node.Drivers[name] = newVal } else { - c.config.Node.Drivers[name].MergeHealthCheck(newVal) } } diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 6a8618f96a86..9f257c3757e1 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -28,7 +28,9 @@ type FingerprintManager struct { // updateHealthCheck is a callback to the client to update the state of the // node for resources that require a health check updateHealthCheck func(*cstructs.HealthCheckResponse) *structs.Node - logger *log.Logger + + updateNodeFromDriver func(string, *structs.DriverInfo, *structs.DriverInfo) *structs.Node + logger *log.Logger } // NewFingerprintManager is a constructor that creates and returns an instance @@ -38,11 +40,13 @@ func NewFingerprintManager(getConfig func() *config.Config, shutdownCh chan struct{}, updateNodeAttributes func(*cstructs.FingerprintResponse) *structs.Node, updateHealthCheck func(*cstructs.HealthCheckResponse) *structs.Node, + updateNodeFromDriver func(string, *structs.DriverInfo, *structs.DriverInfo) *structs.Node, logger *log.Logger) *FingerprintManager { return &FingerprintManager{ getConfig: getConfig, updateNodeAttributes: updateNodeAttributes, updateHealthCheck: updateHealthCheck, + updateNodeFromDriver: updateNodeFromDriver, node: node, shutdownCh: shutdownCh, logger: logger, @@ -53,9 +57,14 @@ func NewFingerprintManager(getConfig func() *config.Config, func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period time.Duration, name string) { fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting %s every %v", name, period) + timer := time.NewTimer(period) + defer timer.Stop() + for { select { - case <-time.After(period): + case <-timer.C: + timer.Reset(period) + _, err := fm.fingerprint(name, f) if err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for %v failed: %+v", name, err) @@ -68,26 +77,6 @@ func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period t } } -// XXX Do we need this for now -// runHealthCheck runs each health check individually on an ongoing basis -func (fm *FingerprintManager) runHealthCheck(hc fingerprint.HealthCheck, period time.Duration, name string) { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: healthchecking %s every %v", name, period) - - for { - select { - case <-time.After(period): - err := fm.healthCheck(name, hc) - if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) - continue - } - - case <-fm.shutdownCh: - return - } - } -} - // setupDrivers is used to fingerprint the node to see if these drivers are // supported func (fm *FingerprintManager) setupDrivers(drivers []string) error { @@ -100,41 +89,33 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return err } - if hc, ok := d.(fingerprint.HealthCheck); ok { - fm.healthCheck(name, hc) - } - detected, err := fm.fingerprintDriver(name, d) if err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) return err } - // log the fingerprinters which have been applied - if detected { - availDrivers = append(availDrivers, name) + if hc, ok := d.(fingerprint.HealthCheck); ok { + fm.runHealthCheck(name, hc) + } else { + // for drivers which are not of type health check, update them to have their + // health status match the status of whether they are detected or not. + healthInfo := &structs.DriverInfo{ + Healthy: detected, + UpdateTime: time.Now(), + } + if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } } - // We should only run the health check task if the driver is detected - // Note that if the driver is detected later in a periodic health check, - // this won't automateically trigger the periodic health check. - if hc, ok := d.(fingerprint.HealthCheck); ok && detected { - p, period := d.Periodic() - if p { - go fm.runFingerprint(d, period, name) - } + go fm.watchDriver(d, name) - req := &cstructs.HealthCheckIntervalRequest{} - resp := &cstructs.HealthCheckIntervalResponse{} - hc.GetHealthCheckInterval(req, resp) - if resp.Eligible { - go fm.runHealthCheck(hc, resp.Period, name) - } - } else { - p, period := d.Periodic() - if p { - go fm.runFingerprintDriver(d, period, name) - } + // log the fingerprinters which have been applied + if detected { + availDrivers = append(availDrivers, name) } } @@ -142,20 +123,47 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return nil } -func (fm *FingerprintManager) runFingerprintDriver(f fingerprint.Fingerprint, period time.Duration, name string) { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting driver %s every %v", name, period) +// watchDrivers facilitates the different periods between fingerprint and +// health checking a driver +func (fm *FingerprintManager) watchDriver(d driver.Driver, name string) { + _, fingerprintPeriod := d.Periodic() + fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting driver %s every %v", name, fingerprintPeriod) + + var healthCheckPeriod time.Duration + hc, isHealthCheck := d.(fingerprint.HealthCheck) + if isHealthCheck { + req := &cstructs.HealthCheckIntervalRequest{} + var resp cstructs.HealthCheckIntervalResponse + hc.GetHealthCheckInterval(req, &resp) + if resp.Eligible { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking driver %s every %v", name, healthCheckPeriod) + healthCheckPeriod = resp.Period + } + } + + t1 := time.NewTimer(fingerprintPeriod) + defer t1.Stop() + t2 := time.NewTimer(healthCheckPeriod) + defer t2.Stop() for { select { - case <-time.After(period): - _, err := fm.fingerprintDriver(name, f) + case <-fm.shutdownCh: + return + case <-t1.C: + t1.Reset(fingerprintPeriod) + _, err := fm.fingerprintDriver(name, d) if err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) - continue } - - case <-fm.shutdownCh: - return + case <-t2.C: + if isHealthCheck { + t2.Reset(healthCheckPeriod) + err := fm.runHealthCheck(name, hc) + if err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) + } + } } } } @@ -176,6 +184,12 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge return false, err } + if node := fm.updateNodeAttributes(&response); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } + // COMPAT: Remove in 0.9: As of Nomad 0.8 there is a temporary measure to // update all driver attributes to its corresponding driver info object, // as eventually all drivers will need to @@ -188,43 +202,17 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge strings.Replace(copy, "driver.", "", 1) strippedAttributes[k] = v } + di := &structs.DriverInfo{ Attributes: strippedAttributes, Detected: response.Detected, } - response.AddDriver(name, di) - - if node := fm.updateNodeAttributes(&response); node != nil { + if node := fm.updateNodeFromDriver(name, di, nil); node != nil { fm.nodeLock.Lock() fm.node = node fm.nodeLock.Unlock() } - // COMPAT: Remove in 0.9: As of Nomad 0.8, there is a driver info for every - // driver. As a compatibility mechanism, we proactively set this driver info - // for drivers which do not yet support health checks. - if _, ok := f.(fingerprint.HealthCheck); !ok { - resp := &cstructs.HealthCheckResponse{ - Drivers: map[string]*structs.DriverInfo{ - name: &structs.DriverInfo{ - // This achieves backwards compatibility- before, we only relied on the - // status of the fingerprint method to determine whether a driver was - // enabled. For drivers without health checks yet enabled, we should always - // set this to true, and then once we enable health checks, this can - // dynamically update this value. - Healthy: true, - UpdateTime: time.Now(), - }, - }, - } - - if node := fm.updateHealthCheck(resp); node != nil { - fm.nodeLock.Lock() - fm.node = node - fm.nodeLock.Unlock() - } - } - return response.Detected, nil } @@ -253,12 +241,12 @@ func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint } // healthcheck checks the health of the specified resource. -func (fm *FingerprintManager) healthCheck(name string, hc fingerprint.HealthCheck) error { +func (fm *FingerprintManager) runHealthCheck(name string, hc fingerprint.HealthCheck) error { request := &cstructs.HealthCheckRequest{} var response cstructs.HealthCheckResponse err := hc.HealthCheck(request, &response) - if node := fm.updateHealthCheck(&response); node != nil { + if node := fm.updateNodeFromDriver(name, nil, response.Drivers[name]); node != nil { fm.nodeLock.Lock() fm.node = node fm.nodeLock.Unlock() @@ -294,17 +282,6 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { if p { go fm.runFingerprint(f, period, name) } - - // XXX This code path doesn't exist right now - if hc, ok := f.(fingerprint.HealthCheck); ok { - req := &cstructs.HealthCheckIntervalRequest{} - var resp cstructs.HealthCheckIntervalResponse - if err := hc.GetHealthCheckInterval(req, &resp); err != nil { - if resp.Eligible { - go fm.runHealthCheck(hc, resp.Period, name) - } - } - } } fm.logger.Printf("[DEBUG] client.fingerprint_manager: detected fingerprints %v", appliedFingerprints) diff --git a/client/fingerprint_manager_test.go b/client/fingerprint_manager_test.go index b3727287172d..28b4c3186c0b 100644 --- a/client/fingerprint_manager_test.go +++ b/client/fingerprint_manager_test.go @@ -6,7 +6,6 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver" - cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" @@ -37,6 +36,7 @@ func TestFingerprintManager_Run_MockDriver(t *testing.T) { make(chan struct{}), testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -70,6 +70,7 @@ func TestFingerprintManager_Run_ResourcesFingerprint(t *testing.T) { make(chan struct{}), testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -105,6 +106,7 @@ func TestFingerprintManager_Fingerprint_Run(t *testing.T) { make(chan struct{}), testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -149,6 +151,7 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -194,24 +197,13 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { node := &structs.Node{ Attributes: make(map[string]string, 0), + Links: make(map[string]string, 0), + Resources: &structs.Resources{}, Drivers: make(map[string]*structs.DriverInfo, 0), } - updateNode := func(r *cstructs.FingerprintResponse) *structs.Node { - if r.Attributes != nil { - for k, v := range r.Attributes { - node.Attributes[k] = v - } - } - return node - } - updateHealthCheck := func(resp *cstructs.HealthCheckResponse) *structs.Node { - if resp.Drivers != nil { - for k, v := range resp.Drivers { - node.Drivers[k] = v - } - } - return node - } + testConfig := config.Config{Node: node} + testClient := &Client{config: &testConfig} + conf := config.DefaultConfig() conf.Options = map[string]string{ "driver.raw_exec.enable": "1", @@ -231,8 +223,9 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { getConfig, node, shutdownCh, - updateNode, - updateHealthCheck, + testClient.updateNodeFromFingerprint, + testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -327,6 +320,7 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -384,7 +378,7 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { if mockDriverInfo == nil { return false, fmt.Errorf("mock driver info should be set on the client") } - if !mockDriverInfo.Detected { + if mockDriverInfo.Detected { return false, fmt.Errorf("mock driver should be detected") } if mockDriverInfo.Healthy { @@ -427,6 +421,7 @@ func TestFimgerprintManager_Run_InWhitelist(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -466,6 +461,7 @@ func TestFingerprintManager_Run_InBlacklist(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -506,6 +502,7 @@ func TestFingerprintManager_Run_Combination(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -550,6 +547,7 @@ func TestFingerprintManager_Run_WhitelistDrivers(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -590,6 +588,7 @@ func TestFingerprintManager_Run_AllDriversBlacklisted(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -634,6 +633,7 @@ func TestFingerprintManager_Run_DriversWhiteListBlacklistCombination(t *testing. shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) @@ -677,6 +677,7 @@ func TestFingerprintManager_Run_DriversInBlacklist(t *testing.T) { shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, + testClient.updateNodeFromDriver, testLogger(), ) diff --git a/client/structs/structs.go b/client/structs/structs.go index d1c15ee56a9f..af28b55ca6c6 100644 --- a/client/structs/structs.go +++ b/client/structs/structs.go @@ -361,19 +361,6 @@ type FingerprintResponse struct { // Detected is a boolean indicating whether the fingerprinter detected // if the resource was available Detected bool - - // Drivers is a map of driver names to driver info. This allows the - // fingerprint method of each driver to set whether the driver is enabled or - // not, as well as its attributes - Drivers map[string]*structs.DriverInfo -} - -func (f *FingerprintResponse) AddDriver(name string, value *structs.DriverInfo) { - if f.Drivers == nil { - f.Drivers = make(map[string]*structs.DriverInfo, 0) - } - - f.Drivers[name] = value } // AddAttribute adds the name and value for a node attribute to the fingerprint From 115d54cb19e59fafbf029e88e94fa33729fc1a4d Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 14 Mar 2018 13:10:41 -0400 Subject: [PATCH 14/24] fix up health check logic comparison; add node events to client driver checks --- client/client.go | 11 + client/client_test.go | 6 +- client/fingerprint_manager_test.go | 586 ++++++++++++++--------------- nomad/structs/node.go | 2 +- 4 files changed, 308 insertions(+), 297 deletions(-) diff --git a/client/client.go b/client/client.go index 6de239d3899b..374ec514ae82 100644 --- a/client/client.go +++ b/client/client.go @@ -1057,6 +1057,17 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. if !health.HealthCheckEquals(oldVal) { hasChanged = true c.config.Node.Drivers[name].MergeHealthCheck(health) + + events := []*structs.NodeEvent{ + { + Subsystem: "Driver", + Message: fmt.Sprintf("Driver status changed: %s", health.HealthDescription), + Timestamp: time.Now().Unix(), + }, + } + if err := c.submitNodeEvents(events); err != nil { + c.logger.Printf("[ERR] nomad.client Error when submitting node event: %v", err) + } } else { // make sure we accurately reflect the last time a health check has been // performed for the driver. diff --git a/client/client_test.go b/client/client_test.go index c8a145c69d9c..5e7967441b58 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -146,9 +146,9 @@ func TestClient_Fingerprint_Periodic(t *testing.T) { // Ensure the mock driver is registered on the client testutil.WaitForResult(func() (bool, error) { c1.configLock.Lock() + defer c1.configLock.Unlock() mockDriverStatus := node.Attributes[mockDriverName] mockDriverInfo := node.Drivers["mock_driver"] - c1.configLock.Unlock() if mockDriverStatus == "" { return false, fmt.Errorf("mock driver attribute should be set on the client") } @@ -175,9 +175,9 @@ func TestClient_Fingerprint_Periodic(t *testing.T) { { testutil.WaitForResult(func() (bool, error) { c1.configLock.Lock() + defer c1.configLock.Unlock() mockDriverStatus := node.Attributes[mockDriverName] mockDriverInfo := node.Drivers["mock_driver"] - c1.configLock.Unlock() if mockDriverStatus != "" { return false, fmt.Errorf("mock driver attribute should not be set on the client") } @@ -185,7 +185,7 @@ func TestClient_Fingerprint_Periodic(t *testing.T) { if mockDriverInfo == nil { return false, fmt.Errorf("mock driver is nil when it should be set on node Drivers") } - if !mockDriverInfo.Detected { + if mockDriverInfo.Detected { return false, fmt.Errorf("mock driver should be set as detected") } if mockDriverInfo.Healthy { diff --git a/client/fingerprint_manager_test.go b/client/fingerprint_manager_test.go index 28b4c3186c0b..f7008cd7193b 100644 --- a/client/fingerprint_manager_test.go +++ b/client/fingerprint_manager_test.go @@ -2,11 +2,12 @@ package client import ( "fmt" + "log" + "os" "testing" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/driver" - "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -16,24 +17,23 @@ func TestFingerprintManager_Run_MockDriver(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - conf := config.DefaultConfig() + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) - getConfig := func() *config.Config { - return conf - } + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + }) + + testClient.logger = log.New(os.Stderr, "", log.LstdFlags) + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - make(chan struct{}), + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, @@ -42,6 +42,9 @@ func TestFingerprintManager_Run_MockDriver(t *testing.T) { err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.NotEqual("", node.Attributes["driver.mock_driver"]) } @@ -50,32 +53,34 @@ func TestFingerprintManager_Run_ResourcesFingerprint(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) - conf := config.DefaultConfig() - getConfig := func() *config.Config { - return conf - } + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + }) + + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - make(chan struct{}), + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.NotEqual(0, node.Resources.CPU) require.NotEqual(0, node.Resources.MemoryMB) require.NotZero(node.Resources.DiskMB) @@ -85,34 +90,34 @@ func TestFingerprintManager_Fingerprint_Run(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) - conf := config.DefaultConfig() - conf.Options = map[string]string{"driver.raw_exec.enable": "true"} - getConfig := func() *config.Config { - return conf - } + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + }) + + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - make(chan struct{}), + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + node := testClient.config.Node + require.NotEqual("", node.Attributes["driver.raw_exec"]) require.True(node.Drivers["raw_exec"].Detected) require.True(node.Drivers["raw_exec"].Healthy) @@ -122,37 +127,32 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - - conf := config.DefaultConfig() - conf.Options = map[string]string{ - "test.shutdown_periodic_after": "true", - "test.shutdown_periodic_duration": "3", - } - getConfig := func() *config.Config { - return conf - } + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "test.shutdown_periodic_after": "true", + "test.shutdown_periodic_duration": "2", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() @@ -162,8 +162,10 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { // Ensure the mock driver is registered and healthy on the client testutil.WaitForResult(func() (bool, error) { fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + mockDriverStatus := node.Attributes["driver.mock_driver"] - fm.nodeLock.Unlock() if mockDriverStatus == "" { return false, fmt.Errorf("mock driver attribute should be set on the client") } @@ -177,8 +179,10 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { { testutil.WaitForResult(func() (bool, error) { fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + mockDriverStatus := node.Attributes["driver.mock_driver"] - fm.nodeLock.Unlock() if mockDriverStatus != "" { return false, fmt.Errorf("mock driver attribute should not be set on the client") } @@ -195,38 +199,32 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - - conf := config.DefaultConfig() - conf.Options = map[string]string{ - "driver.raw_exec.enable": "1", - "test.shutdown_periodic_after": "true", - "test.shutdown_periodic_duration": "2", - } - getConfig := func() *config.Config { - return conf - } + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "test.shutdown_periodic_after": "true", + "test.shutdown_periodic_duration": "2", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() @@ -234,6 +232,10 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { // Ensure the mock driver is registered and healthy on the client testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + mockDriverAttribute := node.Attributes["driver.mock_driver"] if mockDriverAttribute == "" { return false, fmt.Errorf("mock driver info should be set on the client attributes") @@ -252,6 +254,10 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { // Ensure that a default driver without health checks enabled is registered and healthy on the client testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + rawExecAttribute := node.Attributes["driver.raw_exec"] if rawExecAttribute == "" { return false, fmt.Errorf("raw exec info should be set on the client attributes") @@ -270,6 +276,10 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { // Ensure the mock driver is de-registered when it becomes unhealthy testutil.WaitForResult(func() (bool, error) { + fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + mockDriverAttribute := node.Attributes["driver.mock_driver"] if mockDriverAttribute == "" { return false, fmt.Errorf("mock driver info should be set on the client attributes") @@ -291,37 +301,32 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - - conf := config.DefaultConfig() - conf.Options = map[string]string{ - "test.shutdown_periodic_after": "true", - "test.shutdown_periodic_duration": "3", - } - getConfig := func() *config.Config { - return conf - } + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "test.shutdown_periodic_after": "true", + "test.shutdown_periodic_duration": "2", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() @@ -331,8 +336,10 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { // Ensure the mock driver is registered and healthy on the client testutil.WaitForResult(func() (bool, error) { fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + mockDriverInfo := node.Drivers["mock_driver"] - fm.nodeLock.Unlock() if mockDriverInfo == nil { return false, fmt.Errorf("mock driver info should be set on the client") } @@ -352,8 +359,10 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { // marks the driver as unhealthy testutil.WaitForResult(func() (bool, error) { fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + mockDriverInfo := node.Drivers["mock_driver"] - fm.nodeLock.Unlock() if mockDriverInfo == nil { return false, fmt.Errorf("mock driver info should be set on the client") } @@ -373,8 +382,10 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { // marks the driver as unhealthy testutil.WaitForResult(func() (bool, error) { fm.nodeLock.Lock() + node := fm.node + defer fm.nodeLock.Unlock() + mockDriverInfo := node.Drivers["mock_driver"] - fm.nodeLock.Unlock() if mockDriverInfo == nil { return false, fmt.Errorf("mock driver info should be set on the client") } @@ -395,38 +406,39 @@ func TestFimgerprintManager_Run_InWhitelist(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - - conf := config.DefaultConfig() - conf.Options = map[string]string{"fingerprint.whitelist": " arch,cpu,memory,network,storage,foo,bar "} - getConfig := func() *config.Config { - return conf - } + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "test.shutdown_periodic_after": "true", + "test.shutdown_periodic_duration": "2", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.NotEqual(node.Attributes["cpu.frequency"], "") } @@ -434,39 +446,38 @@ func TestFingerprintManager_Run_InBlacklist(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - - conf := config.DefaultConfig() - conf.Options = map[string]string{"fingerprint.whitelist": " arch,memory,foo,bar "} - conf.Options = map[string]string{"fingerprint.blacklist": " cpu "} - getConfig := func() *config.Config { - return conf - } + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "fingerprint.whitelist": " arch,memory,foo,bar ", + "fingerprint.blacklist": " cpu ", + } + }) + + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.Equal(node.Attributes["cpu.frequency"], "") require.NotEqual(node.Attributes["memory.totalbytes"], "") } @@ -475,39 +486,38 @@ func TestFingerprintManager_Run_Combination(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - - conf := config.DefaultConfig() - conf.Options = map[string]string{"fingerprint.whitelist": " arch,cpu,memory,foo,bar "} - conf.Options = map[string]string{"fingerprint.blacklist": " memory,nomad "} - getConfig := func() *config.Config { - return conf - } + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "fingerprint.whitelist": " arch,cpu,memory,foo,bar ", + "fingerprint.blacklist": " memory,nomad ", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.NotEqual(node.Attributes["cpu.frequency"], "") require.NotEqual(node.Attributes["cpu.arch"], "") require.Equal(node.Attributes["memory.totalbytes"], "") @@ -518,41 +528,37 @@ func TestFingerprintManager_Run_WhitelistDrivers(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) - conf := config.DefaultConfig() - conf.Options = map[string]string{ - "driver.raw_exec.enable": "1", - "driver.whitelist": " raw_exec , foo ", - } - getConfig := func() *config.Config { - return conf - } + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "driver.whitelist": " raw_exec , foo ", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node require.NotEqual(node.Attributes["driver.raw_exec"], "") } @@ -560,40 +566,37 @@ func TestFingerprintManager_Run_AllDriversBlacklisted(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) - conf := config.DefaultConfig() - conf.Options = map[string]string{ - "driver.whitelist": " foo,bar,baz ", - } - getConfig := func() *config.Config { - return conf - } + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.whitelist": " foo,bar,baz ", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.Equal(node.Attributes["driver.raw_exec"], "") require.Equal(node.Attributes["driver.exec"], "") require.Equal(node.Attributes["driver.docker"], "") @@ -603,42 +606,39 @@ func TestFingerprintManager_Run_DriversWhiteListBlacklistCombination(t *testing. t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - testConfig := config.Config{Node: node} - testClient := &Client{config: &testConfig} - - conf := config.DefaultConfig() - conf.Options = map[string]string{ - "driver.raw_exec.enable": "1", - "driver.whitelist": " raw_exec,exec,foo,bar,baz ", - "driver.blacklist": " exec,foo,bar,baz ", - } - getConfig := func() *config.Config { - return conf - } + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "driver.whitelist": " raw_exec,exec,foo,bar,baz ", + "driver.blacklist": " exec,foo,bar,baz ", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( - getConfig, - node, - shutdownCh, + testClient.GetConfig, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.NotEqual(node.Attributes["driver.raw_exec"], "") require.Equal(node.Attributes["driver.exec"], "") require.Equal(node.Attributes["foo"], "") @@ -650,39 +650,39 @@ func TestFingerprintManager_Run_DriversInBlacklist(t *testing.T) { t.Parallel() require := require.New(t) - node := &structs.Node{ - Attributes: make(map[string]string, 0), - Links: make(map[string]string, 0), - Resources: &structs.Resources{}, - Drivers: make(map[string]*structs.DriverInfo, 0), - } - conf := config.DefaultConfig() - conf.Options = map[string]string{ - "driver.raw_exec.enable": "1", - "driver.whitelist": " raw_exec,foo,bar,baz ", - "driver.blacklist": " exec,foo,bar,baz ", - } - conf.Node = node - - testClient := &Client{config: conf} + s1, serverAddr := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + + testClient := TestClient(t, func(c *config.Config) { + c.RPCHandler = s1 + c.Servers = []string{serverAddr} + c.Options = map[string]string{ + "driver.raw_exec.enable": "1", + "driver.whitelist": " raw_exec,foo,bar,baz ", + "driver.blacklist": " exec,foo,bar,baz ", + } + }) - shutdownCh := make(chan struct{}) - defer (func() { - close(shutdownCh) - })() + testClient.logger = testLogger() + defer testClient.Shutdown() + waitTilNodeReady(testClient, t) fm := NewFingerprintManager( testClient.GetConfig, - node, - shutdownCh, + testClient.config.Node, + testClient.shutdownCh, testClient.updateNodeFromFingerprint, testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, - testLogger(), + testClient.logger, ) err := fm.Run() require.Nil(err) + + node := testClient.config.Node + require.NotEqual(node.Attributes["driver.raw_exec"], "") require.Equal(node.Attributes["driver.exec"], "") } diff --git a/nomad/structs/node.go b/nomad/structs/node.go index d8b30bf8fe07..f0d4ae5d6f84 100644 --- a/nomad/structs/node.go +++ b/nomad/structs/node.go @@ -41,7 +41,7 @@ func (di *DriverInfo) HealthCheckEquals(other *DriverInfo) bool { return false } - if di.HealthDescription == other.HealthDescription { + if di.HealthDescription != other.HealthDescription { return false } From 8597da0c5d45f718c0777e4c47099fc7970b470d Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Thu, 15 Mar 2018 17:45:00 -0400 Subject: [PATCH 15/24] simplify logic bump log level --- client/client.go | 12 +++++++----- client/fingerprint_manager.go | 2 +- scheduler/feasible.go | 4 +--- scheduler/feasible_test.go | 8 +++++--- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/client/client.go b/client/client.go index 374ec514ae82..496a6dc3a68e 100644 --- a/client/client.go +++ b/client/client.go @@ -1024,9 +1024,11 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. if fingerprint != nil { if c.config.Node.Drivers[name] == nil { + // if the driver info has not yet been set, do that here hasChanged = true c.config.Node.Drivers[name] = fingerprint } else { + // the driver info has already been set, fix it up if c.config.Node.Drivers[name].Detected != fingerprint.Detected { hasChanged = true c.config.Node.Drivers[name].Detected = fingerprint.Detected @@ -1054,7 +1056,11 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. c.config.Node.Drivers[name] = health } else { oldVal := c.config.Node.Drivers[name] - if !health.HealthCheckEquals(oldVal) { + if health.HealthCheckEquals(oldVal) { + // make sure we accurately reflect the last time a health check has been + // performed for the driver. + oldVal.UpdateTime = health.UpdateTime + } else { hasChanged = true c.config.Node.Drivers[name].MergeHealthCheck(health) @@ -1068,10 +1074,6 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. if err := c.submitNodeEvents(events); err != nil { c.logger.Printf("[ERR] nomad.client Error when submitting node event: %v", err) } - } else { - // make sure we accurately reflect the last time a health check has been - // performed for the driver. - oldVal.UpdateTime = health.UpdateTime } } } diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 9f257c3757e1..5f1207a5214c 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -264,7 +264,7 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { f, err := fingerprint.NewFingerprint(name, fm.logger) if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) + fm.logger.Printf("[ERR] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) return err } diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 35582b4845c0..f6d862cb60c8 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -158,9 +158,7 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { return false } - if !enabled { - return false - } + return enabled } } return true diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 7f4357d8f2fa..087945c6415c 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -9,6 +9,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) func TestStaticIterator_Reset(t *testing.T) { @@ -127,7 +128,9 @@ func TestDriverChecker(t *testing.T) { } func TestDriverChecker_HealthChecks(t *testing.T) { + require := require.New(t) _, ctx := testContext(t) + nodes := []*structs.Node{ mock.Node(), mock.Node(), @@ -182,9 +185,8 @@ func TestDriverChecker_HealthChecks(t *testing.T) { testDrivers[i]: {}, } checker := NewDriverChecker(ctx, drivers) - if act := checker.Feasible(c.Node); act != c.Result { - t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) - } + act := checker.Feasible(c.Node) + require.Equal(act, c.Result) } } From 7b2ed01472fc88ab973cbeff565f142c443de95d Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Thu, 15 Mar 2018 17:56:00 -0400 Subject: [PATCH 16/24] remove unused function --- client/client.go | 69 +----------------------------- client/fingerprint_manager.go | 6 --- client/fingerprint_manager_test.go | 13 ------ 3 files changed, 2 insertions(+), 86 deletions(-) diff --git a/client/client.go b/client/client.go index 496a6dc3a68e..b2189ed62485 100644 --- a/client/client.go +++ b/client/client.go @@ -259,8 +259,8 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic } fingerprintManager := NewFingerprintManager(c.GetConfig, c.config.Node, - c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromHealthCheck, - c.updateNodeFromDriver, c.logger) + c.shutdownCh, c.updateNodeFromFingerprint, c.updateNodeFromDriver, + c.logger) // Fingerprint the node and scan for drivers if err := fingerprintManager.Run(); err != nil { @@ -1086,71 +1086,6 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. return c.config.Node } -/* Model for converging the periodic fingerprinting and health checking: - -func watcher(driver Driver) { - p := driver.Periodic() - healthChecker driver.HealthChecker() - hDur := healthChecker.Periodic() - - var t1, t2 time.Timer - - if healthChecker { - t2 = time.NewTimer(hDur) - } - - for { - select { - case shutdown: - case periodicFingerprint <- t1.C: - // Do stuff - t1.Reset(its duration) - case healthCheck <- t2.C: // Nil: never fire - // Do stuff - newHealth := driver.HealthCheck() - updateNodeFromDriver(nil, newHealth) - t2.Reset(its duration) - } - } -} -*/ - -// updateNodeFromHealthCheck receives a health check response and updates the -// node accordingly -// TODO is this even needed? -func (c *Client) updateNodeFromHealthCheck(response *cstructs.HealthCheckResponse) *structs.Node { - c.configLock.Lock() - defer c.configLock.Unlock() - - nodeHasChanged := false - - // update the node with the latest driver health information - for name, newVal := range response.Drivers { - if newVal == nil { - continue - } - oldVal := c.config.Node.Drivers[name] - if newVal.HealthCheckEquals(oldVal) { - - // make sure we accurately reflect the last time a health check has been - // performed for the driver. - oldVal.UpdateTime = newVal.UpdateTime - continue - } - nodeHasChanged = true - if oldVal == nil { - c.config.Node.Drivers[name] = newVal - } else { - } - } - - if nodeHasChanged { - c.updateNode() - } - - return c.config.Node -} - // resourcesAreEqual is a temporary function to compare whether resources are // equal. We can use this until we change fingerprinters to set pointers on a // return type. diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 5f1207a5214c..d3fccb265f82 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -25,10 +25,6 @@ type FingerprintManager struct { // associated node updateNodeAttributes func(*cstructs.FingerprintResponse) *structs.Node - // updateHealthCheck is a callback to the client to update the state of the - // node for resources that require a health check - updateHealthCheck func(*cstructs.HealthCheckResponse) *structs.Node - updateNodeFromDriver func(string, *structs.DriverInfo, *structs.DriverInfo) *structs.Node logger *log.Logger } @@ -39,13 +35,11 @@ func NewFingerprintManager(getConfig func() *config.Config, node *structs.Node, shutdownCh chan struct{}, updateNodeAttributes func(*cstructs.FingerprintResponse) *structs.Node, - updateHealthCheck func(*cstructs.HealthCheckResponse) *structs.Node, updateNodeFromDriver func(string, *structs.DriverInfo, *structs.DriverInfo) *structs.Node, logger *log.Logger) *FingerprintManager { return &FingerprintManager{ getConfig: getConfig, updateNodeAttributes: updateNodeAttributes, - updateHealthCheck: updateHealthCheck, updateNodeFromDriver: updateNodeFromDriver, node: node, shutdownCh: shutdownCh, diff --git a/client/fingerprint_manager_test.go b/client/fingerprint_manager_test.go index f7008cd7193b..82cb2186f8c1 100644 --- a/client/fingerprint_manager_test.go +++ b/client/fingerprint_manager_test.go @@ -35,7 +35,6 @@ func TestFingerprintManager_Run_MockDriver(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testLogger(), ) @@ -71,7 +70,6 @@ func TestFingerprintManager_Run_ResourcesFingerprint(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -108,7 +106,6 @@ func TestFingerprintManager_Fingerprint_Run(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -150,7 +147,6 @@ func TestFingerprintManager_Fingerprint_Periodic(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -222,7 +218,6 @@ func TestFingerprintManager_HealthCheck_Driver(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -324,7 +319,6 @@ func TestFingerprintManager_HealthCheck_Periodic(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -429,7 +423,6 @@ func TestFimgerprintManager_Run_InWhitelist(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -468,7 +461,6 @@ func TestFingerprintManager_Run_InBlacklist(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -508,7 +500,6 @@ func TestFingerprintManager_Run_Combination(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -550,7 +541,6 @@ func TestFingerprintManager_Run_WhitelistDrivers(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -587,7 +577,6 @@ func TestFingerprintManager_Run_AllDriversBlacklisted(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -629,7 +618,6 @@ func TestFingerprintManager_Run_DriversWhiteListBlacklistCombination(t *testing. testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) @@ -673,7 +661,6 @@ func TestFingerprintManager_Run_DriversInBlacklist(t *testing.T) { testClient.config.Node, testClient.shutdownCh, testClient.updateNodeFromFingerprint, - testClient.updateNodeFromHealthCheck, testClient.updateNodeFromDriver, testClient.logger, ) From 06a306e460042e30d6d001d545a7a6d44367efaf Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Mon, 19 Mar 2018 08:06:09 -0400 Subject: [PATCH 17/24] improve comments; update watchDriver --- client/client.go | 8 +-- client/driver/docker.go | 2 +- client/fingerprint_manager.go | 104 ++++++++++++++++++++-------------- scheduler/feasible.go | 2 +- 4 files changed, 67 insertions(+), 49 deletions(-) diff --git a/client/client.go b/client/client.go index b2189ed62485..00bed1fd6ad4 100644 --- a/client/client.go +++ b/client/client.go @@ -1024,11 +1024,11 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. if fingerprint != nil { if c.config.Node.Drivers[name] == nil { - // if the driver info has not yet been set, do that here + // If the driver info has not yet been set, do that here hasChanged = true c.config.Node.Drivers[name] = fingerprint } else { - // the driver info has already been set, fix it up + // The driver info has already been set, fix it up if c.config.Node.Drivers[name].Detected != fingerprint.Detected { hasChanged = true c.config.Node.Drivers[name].Detected = fingerprint.Detected @@ -1057,7 +1057,7 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. } else { oldVal := c.config.Node.Drivers[name] if health.HealthCheckEquals(oldVal) { - // make sure we accurately reflect the last time a health check has been + // Make sure we accurately reflect the last time a health check has been // performed for the driver. oldVal.UpdateTime = health.UpdateTime } else { @@ -1067,7 +1067,7 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. events := []*structs.NodeEvent{ { Subsystem: "Driver", - Message: fmt.Sprintf("Driver status changed: %s", health.HealthDescription), + Message: health.HealthDescription, Timestamp: time.Now().Unix(), }, } diff --git a/client/driver/docker.go b/client/driver/docker.go index c2fc516ce87d..c70c00df6d18 100644 --- a/client/driver/docker.go +++ b/client/driver/docker.go @@ -569,7 +569,7 @@ func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstru _, err = client.ListContainers(docker.ListContainersOptions{All: false}) if err != nil { - d.logger.Printf("[WARN] driver.docker: failed to list Docker containers in the process of a docker health check: %v", err) + d.logger.Printf("[WARN] driver.docker: failed to list Docker containers in the process of a Docker health check: %v", err) resp.AddDriverInfo("docker", unhealthy) return err } diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index d3fccb265f82..2ae9047420f0 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -25,6 +25,8 @@ type FingerprintManager struct { // associated node updateNodeAttributes func(*cstructs.FingerprintResponse) *structs.Node + // updateNodeFromDriver is a callback to the client to update the state of a + // specific driver for the node updateNodeFromDriver func(string, *structs.DriverInfo, *structs.DriverInfo) *structs.Node logger *log.Logger } @@ -85,29 +87,26 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { detected, err := fm.fingerprintDriver(name, d) if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) + fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting driver %v failed: %+v", name, err) return err } - if hc, ok := d.(fingerprint.HealthCheck); ok { - fm.runHealthCheck(name, hc) - } else { - // for drivers which are not of type health check, update them to have their - // health status match the status of whether they are detected or not. - healthInfo := &structs.DriverInfo{ - Healthy: detected, - UpdateTime: time.Now(), - } - if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { - fm.nodeLock.Lock() - fm.node = node - fm.nodeLock.Unlock() - } + // For all drivers upon initialization, create a driver info which matches + // its fingerprint status. Later, for drivers that have the health check + // interface implemented, a periodic health check will be run + healthInfo := &structs.DriverInfo{ + Healthy: detected, + UpdateTime: time.Now(), + } + if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() } go fm.watchDriver(d, name) - // log the fingerprinters which have been applied + // Log the fingerprinters which have been applied if detected { availDrivers = append(availDrivers, name) } @@ -120,45 +119,61 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { // watchDrivers facilitates the different periods between fingerprint and // health checking a driver func (fm *FingerprintManager) watchDriver(d driver.Driver, name string) { - _, fingerprintPeriod := d.Periodic() + isPeriodic, fingerprintPeriod := d.Periodic() + if !isPeriodic { + return + } + fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting driver %s every %v", name, fingerprintPeriod) - var healthCheckPeriod time.Duration + fingerprintTimer := time.NewTicker(fingerprintPeriod) + defer fingerprintTimer.Stop() + hc, isHealthCheck := d.(fingerprint.HealthCheck) if isHealthCheck { + // For types that implement the health check interface, run both the + // fingerprint and health check periodic functions and update the node req := &cstructs.HealthCheckIntervalRequest{} var resp cstructs.HealthCheckIntervalResponse - hc.GetHealthCheckInterval(req, &resp) - if resp.Eligible { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking driver %s every %v", name, healthCheckPeriod) - healthCheckPeriod = resp.Period + err := hc.GetHealthCheckInterval(req, &resp) + if err != nil { + fm.logger.Printf("[ERR] client.fingerprint_manager: error when getting health check interval: %v", err) } - } - t1 := time.NewTimer(fingerprintPeriod) - defer t1.Stop() - t2 := time.NewTimer(healthCheckPeriod) - defer t2.Stop() + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking driver %s every %v", name, resp.Period) + healthCheckTimer := time.NewTicker(resp.Period) + defer healthCheckTimer.Stop() - for { - select { - case <-fm.shutdownCh: - return - case <-t1.C: - t1.Reset(fingerprintPeriod) - _, err := fm.fingerprintDriver(name, d) - if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) - } - case <-t2.C: - if isHealthCheck { - t2.Reset(healthCheckPeriod) + for { + select { + case <-fm.shutdownCh: + return + case <-fingerprintTimer.C: + _, err := fm.fingerprintDriver(name, d) + if err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) + } + case <-healthCheckTimer.C: err := fm.runHealthCheck(name, hc) if err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) } } } + } else { + // For types that do not have a health check capacity, run only the + // periodic fingerprint method + for { + select { + case <-fm.shutdownCh: + return + case <-fingerprintTimer.C: + _, err := fm.fingerprintDriver(name, d) + if err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) + } + } + } } } @@ -239,6 +254,9 @@ func (fm *FingerprintManager) runHealthCheck(name string, hc fingerprint.HealthC request := &cstructs.HealthCheckRequest{} var response cstructs.HealthCheckResponse err := hc.HealthCheck(request, &response) + if err != nil { + return err + } if node := fm.updateNodeFromDriver(name, nil, response.Drivers[name]); node != nil { fm.nodeLock.Lock() @@ -246,7 +264,7 @@ func (fm *FingerprintManager) runHealthCheck(name string, hc fingerprint.HealthC fm.nodeLock.Unlock() } - return err + return nil } // setupFingerprints is used to fingerprint the node to see if these attributes are @@ -287,7 +305,7 @@ func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { // those which require periotic checking, it starts a periodic process for // each. func (fp *FingerprintManager) Run() error { - // first, set up all fingerprints + // First, set up all fingerprints cfg := fp.getConfig() whitelistFingerprints := cfg.ReadStringListToMap("fingerprint.whitelist") whitelistFingerprintsEnabled := len(whitelistFingerprints) > 0 @@ -320,7 +338,7 @@ func (fp *FingerprintManager) Run() error { fp.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprint modules skipped due to white/blacklist: %v", skippedFingerprints) } - // next, set up drivers + // Next, set up drivers // Build the white/blacklists of drivers. whitelistDrivers := cfg.ReadStringListToMap("driver.whitelist") whitelistDriversEnabled := len(whitelistDrivers) > 0 diff --git a/scheduler/feasible.go b/scheduler/feasible.go index f6d862cb60c8..b8bfc758b1a0 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -130,7 +130,7 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { for driver := range c.drivers { driverStr := fmt.Sprintf("driver.%s", driver) - // COMPAT: Remove in 0.X: As of Nomad 0.8, nodes have a DriverInfo that + // COMPAT: Remove in 0.10: As of Nomad 0.8, nodes have a DriverInfo that // corresponds with every driver. As a Nomad server might be on a later // version than a Nomad client, we need to check for compatibility here // to verify the client supports this. From cba0a4d441cdc3bb137d7cf342a6718a23463fb0 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Mon, 19 Mar 2018 12:50:18 -0400 Subject: [PATCH 18/24] function rename and re-arrange functions in fingerprint_manager --- client/fingerprint_manager.go | 298 +++++++++++++++++----------------- 1 file changed, 149 insertions(+), 149 deletions(-) diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 2ae9047420f0..351c04d533f9 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -49,28 +49,110 @@ func NewFingerprintManager(getConfig func() *config.Config, } } -// runFingerprint runs each fingerprinter individually on an ongoing basis -func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period time.Duration, name string) { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting %s every %v", name, period) +// Run starts the process of fingerprinting the node. It does an initial pass, +// identifying whitelisted and blacklisted fingerprints/drivers. Then, for +// those which require periotic checking, it starts a periodic process for +// each. +func (fp *FingerprintManager) Run() error { + // First, set up all fingerprints + cfg := fp.getConfig() + whitelistFingerprints := cfg.ReadStringListToMap("fingerprint.whitelist") + whitelistFingerprintsEnabled := len(whitelistFingerprints) > 0 + blacklistFingerprints := cfg.ReadStringListToMap("fingerprint.blacklist") - timer := time.NewTimer(period) - defer timer.Stop() + fp.logger.Printf("[DEBUG] client.fingerprint_manager: built-in fingerprints: %v", fingerprint.BuiltinFingerprints()) - for { - select { - case <-timer.C: - timer.Reset(period) + var availableFingerprints []string + var skippedFingerprints []string + for _, name := range fingerprint.BuiltinFingerprints() { + // Skip modules that are not in the whitelist if it is enabled. + if _, ok := whitelistFingerprints[name]; whitelistFingerprintsEnabled && !ok { + skippedFingerprints = append(skippedFingerprints, name) + continue + } + // Skip modules that are in the blacklist + if _, ok := blacklistFingerprints[name]; ok { + skippedFingerprints = append(skippedFingerprints, name) + continue + } - _, err := fm.fingerprint(name, f) - if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for %v failed: %+v", name, err) - continue - } + availableFingerprints = append(availableFingerprints, name) + } - case <-fm.shutdownCh: - return + if err := fp.setupFingerprinters(availableFingerprints); err != nil { + return err + } + + if len(skippedFingerprints) != 0 { + fp.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprint modules skipped due to white/blacklist: %v", skippedFingerprints) + } + + // Next, set up drivers + // Build the white/blacklists of drivers. + whitelistDrivers := cfg.ReadStringListToMap("driver.whitelist") + whitelistDriversEnabled := len(whitelistDrivers) > 0 + blacklistDrivers := cfg.ReadStringListToMap("driver.blacklist") + + var availDrivers []string + var skippedDrivers []string + + for name := range driver.BuiltinDrivers { + // Skip fingerprinting drivers that are not in the whitelist if it is + // enabled. + if _, ok := whitelistDrivers[name]; whitelistDriversEnabled && !ok { + skippedDrivers = append(skippedDrivers, name) + continue + } + // Skip fingerprinting drivers that are in the blacklist + if _, ok := blacklistDrivers[name]; ok { + skippedDrivers = append(skippedDrivers, name) + continue + } + + availDrivers = append(availDrivers, name) + } + + if err := fp.setupDrivers(availDrivers); err != nil { + return err + } + + if len(skippedDrivers) > 0 { + fp.logger.Printf("[DEBUG] client.fingerprint_manager: drivers skipped due to white/blacklist: %v", skippedDrivers) + } + return nil +} + +// setupFingerprints is used to fingerprint the node to see if these attributes are +// supported +func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { + var appliedFingerprints []string + + for _, name := range fingerprints { + f, err := fingerprint.NewFingerprint(name, fm.logger) + + if err != nil { + fm.logger.Printf("[ERR] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) + return err + } + + detected, err := fm.fingerprint(name, f) + if err != nil { + return err + } + + // log the fingerprinters which have been applied + if detected { + appliedFingerprints = append(appliedFingerprints, name) + } + + p, period := f.Periodic() + if p { + go fm.runFingerprint(f, period, name) } } + + fm.logger.Printf("[DEBUG] client.fingerprint_manager: detected fingerprints %v", appliedFingerprints) + return nil } // setupDrivers is used to fingerprint the node to see if these drivers are @@ -116,6 +198,54 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return nil } +// runFingerprint runs each fingerprinter individually on an ongoing basis +func (fm *FingerprintManager) runFingerprint(f fingerprint.Fingerprint, period time.Duration, name string) { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting %s every %v", name, period) + + timer := time.NewTimer(period) + defer timer.Stop() + + for { + select { + case <-timer.C: + timer.Reset(period) + + _, err := fm.fingerprint(name, f) + if err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for %v failed: %+v", name, err) + continue + } + + case <-fm.shutdownCh: + return + } + } +} + +// fingerprint does an initial fingerprint of the client. If the fingerprinter +// is meant to be run continuously, a process is launched to perform this +// fingerprint on an ongoing basis in the background. +func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint) (bool, error) { + var response cstructs.FingerprintResponse + + fm.nodeLock.Lock() + request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} + err := f.Fingerprint(request, &response) + fm.nodeLock.Unlock() + + if err != nil { + return false, err + } + + if node := fm.updateNodeAttributes(&response); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } + + return response.Detected, nil +} + // watchDrivers facilitates the different periods between fingerprint and // health checking a driver func (fm *FingerprintManager) watchDriver(d driver.Driver, name string) { @@ -154,7 +284,7 @@ func (fm *FingerprintManager) watchDriver(d driver.Driver, name string) { fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) } case <-healthCheckTimer.C: - err := fm.runHealthCheck(name, hc) + err := fm.runDriverHealthCheck(name, hc) if err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) } @@ -225,32 +355,8 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge return response.Detected, nil } -// fingerprint does an initial fingerprint of the client. If the fingerprinter -// is meant to be run continuously, a process is launched to perform this -// fingerprint on an ongoing basis in the background. -func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint) (bool, error) { - var response cstructs.FingerprintResponse - - fm.nodeLock.Lock() - request := &cstructs.FingerprintRequest{Config: fm.getConfig(), Node: fm.node} - err := f.Fingerprint(request, &response) - fm.nodeLock.Unlock() - - if err != nil { - return false, err - } - - if node := fm.updateNodeAttributes(&response); node != nil { - fm.nodeLock.Lock() - fm.node = node - fm.nodeLock.Unlock() - } - - return response.Detected, nil -} - -// healthcheck checks the health of the specified resource. -func (fm *FingerprintManager) runHealthCheck(name string, hc fingerprint.HealthCheck) error { +// runDriverHealthCheck checks the health of the specified resource. +func (fm *FingerprintManager) runDriverHealthCheck(name string, hc fingerprint.HealthCheck) error { request := &cstructs.HealthCheckRequest{} var response cstructs.HealthCheckResponse err := hc.HealthCheck(request, &response) @@ -266,109 +372,3 @@ func (fm *FingerprintManager) runHealthCheck(name string, hc fingerprint.HealthC return nil } - -// setupFingerprints is used to fingerprint the node to see if these attributes are -// supported -func (fm *FingerprintManager) setupFingerprinters(fingerprints []string) error { - var appliedFingerprints []string - - for _, name := range fingerprints { - f, err := fingerprint.NewFingerprint(name, fm.logger) - - if err != nil { - fm.logger.Printf("[ERR] client.fingerprint_manager: fingerprinting for %v failed: %+v", name, err) - return err - } - - detected, err := fm.fingerprint(name, f) - if err != nil { - return err - } - - // log the fingerprinters which have been applied - if detected { - appliedFingerprints = append(appliedFingerprints, name) - } - - p, period := f.Periodic() - if p { - go fm.runFingerprint(f, period, name) - } - } - - fm.logger.Printf("[DEBUG] client.fingerprint_manager: detected fingerprints %v", appliedFingerprints) - return nil -} - -// Run starts the process of fingerprinting the node. It does an initial pass, -// identifying whitelisted and blacklisted fingerprints/drivers. Then, for -// those which require periotic checking, it starts a periodic process for -// each. -func (fp *FingerprintManager) Run() error { - // First, set up all fingerprints - cfg := fp.getConfig() - whitelistFingerprints := cfg.ReadStringListToMap("fingerprint.whitelist") - whitelistFingerprintsEnabled := len(whitelistFingerprints) > 0 - blacklistFingerprints := cfg.ReadStringListToMap("fingerprint.blacklist") - - fp.logger.Printf("[DEBUG] client.fingerprint_manager: built-in fingerprints: %v", fingerprint.BuiltinFingerprints()) - - var availableFingerprints []string - var skippedFingerprints []string - for _, name := range fingerprint.BuiltinFingerprints() { - // Skip modules that are not in the whitelist if it is enabled. - if _, ok := whitelistFingerprints[name]; whitelistFingerprintsEnabled && !ok { - skippedFingerprints = append(skippedFingerprints, name) - continue - } - // Skip modules that are in the blacklist - if _, ok := blacklistFingerprints[name]; ok { - skippedFingerprints = append(skippedFingerprints, name) - continue - } - - availableFingerprints = append(availableFingerprints, name) - } - - if err := fp.setupFingerprinters(availableFingerprints); err != nil { - return err - } - - if len(skippedFingerprints) != 0 { - fp.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprint modules skipped due to white/blacklist: %v", skippedFingerprints) - } - - // Next, set up drivers - // Build the white/blacklists of drivers. - whitelistDrivers := cfg.ReadStringListToMap("driver.whitelist") - whitelistDriversEnabled := len(whitelistDrivers) > 0 - blacklistDrivers := cfg.ReadStringListToMap("driver.blacklist") - - var availDrivers []string - var skippedDrivers []string - - for name := range driver.BuiltinDrivers { - // Skip fingerprinting drivers that are not in the whitelist if it is - // enabled. - if _, ok := whitelistDrivers[name]; whitelistDriversEnabled && !ok { - skippedDrivers = append(skippedDrivers, name) - continue - } - // Skip fingerprinting drivers that are in the blacklist - if _, ok := blacklistDrivers[name]; ok { - skippedDrivers = append(skippedDrivers, name) - continue - } - - availDrivers = append(availDrivers, name) - } - - if err := fp.setupDrivers(availDrivers); err != nil { - return err - } - - if len(skippedDrivers) > 0 { - fp.logger.Printf("[DEBUG] client.fingerprint_manager: drivers skipped due to white/blacklist: %v", skippedDrivers) - } - return nil -} From 9c143301de1008a16961782273b0566288e3351a Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Tue, 20 Mar 2018 13:25:07 -0400 Subject: [PATCH 19/24] fix issue when updating node events --- client/client.go | 18 +++++++++-------- client/fingerprint_manager.go | 38 +++++++++++++++++++++-------------- nomad/structs/node.go | 4 ++-- nomad/structs/node_test.go | 8 +++++--- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/client/client.go b/client/client.go index 00bed1fd6ad4..f1022f444285 100644 --- a/client/client.go +++ b/client/client.go @@ -1062,18 +1062,20 @@ func (c *Client) updateNodeFromDriver(name string, fingerprint, health *structs. oldVal.UpdateTime = health.UpdateTime } else { hasChanged = true - c.config.Node.Drivers[name].MergeHealthCheck(health) - events := []*structs.NodeEvent{ - { + // Only emit an event if the health status has changed, not if we are + // simply updating a node on startup + if health.Healthy != oldVal.Healthy && oldVal.HealthDescription != "" { + event := &structs.NodeEvent{ Subsystem: "Driver", Message: health.HealthDescription, Timestamp: time.Now().Unix(), - }, - } - if err := c.submitNodeEvents(events); err != nil { - c.logger.Printf("[ERR] nomad.client Error when submitting node event: %v", err) + } + c.triggerNodeEvent(event) } + + // Update the node with the latest information + c.config.Node.Drivers[name].MergeHealthCheck(health) } } } @@ -1267,7 +1269,7 @@ func (c *Client) watchNodeEvents() { timer.Reset(c.retryIntv(nodeUpdateRetryIntv)) } else { // Reset the events since we successfully sent them. - batchEvents = nil + batchEvents = []*structs.NodeEvent{} } case <-c.shutdownCh: return diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 351c04d533f9..00f2aed480c6 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -173,17 +173,25 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return err } - // For all drivers upon initialization, create a driver info which matches - // its fingerprint status. Later, for drivers that have the health check - // interface implemented, a periodic health check will be run - healthInfo := &structs.DriverInfo{ - Healthy: detected, - UpdateTime: time.Now(), - } - if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { - fm.nodeLock.Lock() - fm.node = node - fm.nodeLock.Unlock() + hc, isHealthCheck := d.(fingerprint.HealthCheck) + if isHealthCheck { + err := fm.runDriverHealthCheck(name, hc) + if err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) + } + } else { + // For all drivers without health checking enabled , create a driver info which matches + // its fingerprint status. Later, for drivers that have the health check + // interface implemented, a periodic health check will be run + healthInfo := &structs.DriverInfo{ + Healthy: detected, + UpdateTime: time.Now(), + } + if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } } go fm.watchDriver(d, name) @@ -360,15 +368,15 @@ func (fm *FingerprintManager) runDriverHealthCheck(name string, hc fingerprint.H request := &cstructs.HealthCheckRequest{} var response cstructs.HealthCheckResponse err := hc.HealthCheck(request, &response) - if err != nil { - return err - } + // Update the status of the node irregardless if there was an error- in the + // case of periodic health checks, an error will occur if a health check + // fails if node := fm.updateNodeFromDriver(name, nil, response.Drivers[name]); node != nil { fm.nodeLock.Lock() fm.node = node fm.nodeLock.Unlock() } - return nil + return err } diff --git a/nomad/structs/node.go b/nomad/structs/node.go index f0d4ae5d6f84..a4eb91e719c6 100644 --- a/nomad/structs/node.go +++ b/nomad/structs/node.go @@ -33,8 +33,8 @@ func (di *DriverInfo) MergeFingerprintInfo(other *DriverInfo) { // in the process of health checking, we only check the fields that are // computed by the health checker. In the future, this will be merged. func (di *DriverInfo) HealthCheckEquals(other *DriverInfo) bool { - if di == nil && other != nil || di != nil && other == nil { - return false + if di == nil && other == nil { + return true } if di.Healthy != other.Healthy { diff --git a/nomad/structs/node_test.go b/nomad/structs/node_test.go index 877b8cabb556..aee21accb01b 100644 --- a/nomad/structs/node_test.go +++ b/nomad/structs/node_test.go @@ -40,16 +40,18 @@ func TestDriverInfoEquals(t *testing.T) { { []*DriverInfo{ { + Detected: false, Healthy: true, - HealthDescription: "running", + HealthDescription: "This driver is ok", }, { + Detected: true, Healthy: true, - HealthDescription: "running", + HealthDescription: "This driver is ok", }, }, true, - "Different health description values should not be equal.", + "Same health check should be equal", }, } for _, testCase := range driverInfoTest { From ffe9292e243c62b83ad3515e800f83464f17bf67 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 20 Mar 2018 14:56:49 -0700 Subject: [PATCH 20/24] Only run health check if driver is detected --- client/fingerprint_manager.go | 108 +++++++++++++++++----------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 00f2aed480c6..d8b94cac022a 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -173,16 +173,11 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return err } - hc, isHealthCheck := d.(fingerprint.HealthCheck) - if isHealthCheck { - err := fm.runDriverHealthCheck(name, hc) - if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) - } - } else { - // For all drivers without health checking enabled , create a driver info which matches - // its fingerprint status. Later, for drivers that have the health check - // interface implemented, a periodic health check will be run + // For all drivers without health checking enabled , create a driver + // info which matches its fingerprint status. Later, for drivers that + // have the health check interface implemented, a periodic health check + // will be run + if _, isHealthCheck := d.(fingerprint.HealthCheck); !isHealthCheck { healthInfo := &structs.DriverInfo{ Healthy: detected, UpdateTime: time.Now(), @@ -194,6 +189,8 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { } } + // Start a periodic watcher to detect changes to a drivers health and + // attributes. go fm.watchDriver(d, name) // Log the fingerprinters which have been applied @@ -257,58 +254,59 @@ func (fm *FingerprintManager) fingerprint(name string, f fingerprint.Fingerprint // watchDrivers facilitates the different periods between fingerprint and // health checking a driver func (fm *FingerprintManager) watchDriver(d driver.Driver, name string) { + var fingerprintTicker, healthTicker <-chan time.Time + + // Determine whether the fingerprinter is periodic and health checking isPeriodic, fingerprintPeriod := d.Periodic() - if !isPeriodic { + hc, isHealthCheck := d.(fingerprint.HealthCheck) + + // Nothing to do since the state of this driver will never change + if !isPeriodic && !isHealthCheck { return } - fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting driver %s every %v", name, fingerprintPeriod) - - fingerprintTimer := time.NewTicker(fingerprintPeriod) - defer fingerprintTimer.Stop() - - hc, isHealthCheck := d.(fingerprint.HealthCheck) + // Setup the required tickers + if isPeriodic { + ticker := time.NewTicker(fingerprintPeriod) + fingerprintTicker = ticker.C + defer ticker.Stop() + fm.logger.Printf("[DEBUG] client.fingerprint_manager: fingerprinting driver %s every %v", name, fingerprintPeriod) + } if isHealthCheck { - // For types that implement the health check interface, run both the - // fingerprint and health check periodic functions and update the node + // Determine the interval at which to health check req := &cstructs.HealthCheckIntervalRequest{} var resp cstructs.HealthCheckIntervalResponse - err := hc.GetHealthCheckInterval(req, &resp) - if err != nil { - fm.logger.Printf("[ERR] client.fingerprint_manager: error when getting health check interval: %v", err) + + if err := hc.GetHealthCheckInterval(req, &resp); err != nil { + fm.logger.Printf("[ERR] client.fingerprint_manager: error getting health check interval for driver %s: %v", name, err) + } else if resp.Eligible { + ticker := time.NewTicker(resp.Period) + healthTicker = ticker.C + defer ticker.Stop() + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking driver %s every %v", name, resp.Period) } + } - fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking driver %s every %v", name, resp.Period) - healthCheckTimer := time.NewTicker(resp.Period) - defer healthCheckTimer.Stop() - - for { - select { - case <-fm.shutdownCh: - return - case <-fingerprintTimer.C: - _, err := fm.fingerprintDriver(name, d) - if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) - } - case <-healthCheckTimer.C: - err := fm.runDriverHealthCheck(name, hc) - if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) - } + for { + select { + case <-fm.shutdownCh: + return + case <-fingerprintTicker: + if _, err := fm.fingerprintDriver(name, d); err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) } - } - } else { - // For types that do not have a health check capacity, run only the - // periodic fingerprint method - for { - select { - case <-fm.shutdownCh: - return - case <-fingerprintTimer.C: - _, err := fm.fingerprintDriver(name, d) - if err != nil { - fm.logger.Printf("[DEBUG] client.fingerprint_manager: periodic fingerprinting for driver %v failed: %+v", name, err) + case <-healthTicker: + // Determine if we should run the health check + fm.nodeLock.Lock() + driver, detected := fm.node.Drivers[name] + if detected && driver != nil { + detected = driver.Detected + } + fm.nodeLock.Unlock() + + if detected { + if err := fm.runDriverHealthCheck(name, hc); err != nil { + fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) } } } @@ -367,7 +365,9 @@ func (fm *FingerprintManager) fingerprintDriver(name string, f fingerprint.Finge func (fm *FingerprintManager) runDriverHealthCheck(name string, hc fingerprint.HealthCheck) error { request := &cstructs.HealthCheckRequest{} var response cstructs.HealthCheckResponse - err := hc.HealthCheck(request, &response) + if err := hc.HealthCheck(request, &response); err != nil { + return err + } // Update the status of the node irregardless if there was an error- in the // case of periodic health checks, an error will occur if a health check @@ -378,5 +378,5 @@ func (fm *FingerprintManager) runDriverHealthCheck(name string, hc fingerprint.H fm.nodeLock.Unlock() } - return err + return nil } From b59bea98b063ffdfbea8871bb0d7ae87a1405c0b Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 20 Mar 2018 15:05:43 -0700 Subject: [PATCH 21/24] Docker driver doesn't return errors but injects into the DriverInfo --- client/driver/docker.go | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/client/driver/docker.go b/client/driver/docker.go index c70c00df6d18..e5ffb1afa80f 100644 --- a/client/driver/docker.go +++ b/client/driver/docker.go @@ -555,32 +555,30 @@ func (d *DockerDriver) Fingerprint(req *cstructs.FingerprintRequest, resp *cstru // performs a health check on the docker driver, asserting whether the docker // driver is responsive to a `docker ps` command. func (d *DockerDriver) HealthCheck(req *cstructs.HealthCheckRequest, resp *cstructs.HealthCheckResponse) error { - unhealthy := &structs.DriverInfo{ - HealthDescription: "Docker driver is available but unresponsive", - UpdateTime: time.Now(), + dinfo := &structs.DriverInfo{ + UpdateTime: time.Now(), } client, _, err := d.dockerClients() if err != nil { d.logger.Printf("[WARN] driver.docker: failed to retrieve Docker client in the process of a docker health check: %v", err) - resp.AddDriverInfo("docker", unhealthy) - return err + dinfo.HealthDescription = fmt.Sprintf("Failed retrieving Docker client: %v", err) + resp.AddDriverInfo("docker", dinfo) + return nil } _, err = client.ListContainers(docker.ListContainersOptions{All: false}) if err != nil { d.logger.Printf("[WARN] driver.docker: failed to list Docker containers in the process of a Docker health check: %v", err) - resp.AddDriverInfo("docker", unhealthy) - return err + dinfo.HealthDescription = fmt.Sprintf("Failed to list Docker containers: %v", err) + resp.AddDriverInfo("docker", dinfo) + return nil } d.logger.Printf("[TRACE] driver.docker: docker driver is available and is responsive to `docker ps`") - healthy := &structs.DriverInfo{ - Healthy: true, - HealthDescription: "Docker driver is available and responsive", - UpdateTime: time.Now(), - } - resp.AddDriverInfo("docker", healthy) + dinfo.Healthy = true + dinfo.HealthDescription = "Docker driver is available and responsive" + resp.AddDriverInfo("docker", dinfo) return nil } From 127b2c6ef7c999354013c1a4598a64318aea3e3c Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 21 Mar 2018 13:05:37 -0400 Subject: [PATCH 22/24] set driver to unhealthy once if it cannot be detected in periodic check --- client/fingerprint_manager.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index d8b94cac022a..066a809f066f 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -1,6 +1,7 @@ package client import ( + "fmt" "log" "strings" "sync" @@ -308,6 +309,19 @@ func (fm *FingerprintManager) watchDriver(d driver.Driver, name string) { if err := fm.runDriverHealthCheck(name, hc); err != nil { fm.logger.Printf("[DEBUG] client.fingerprint_manager: health checking for %v failed: %v", name, err) } + } else { + // If the driver is undetected, change the health status to unhealthy + // only once. + healthInfo := &structs.DriverInfo{ + Healthy: false, + HealthDescription: fmt.Sprintf("Driver %s is not detected", name), + UpdateTime: time.Now(), + } + if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() + } } } } From eb3a53efa28e7bd8890c438c33678e3d2c175c97 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 21 Mar 2018 14:33:10 -0400 Subject: [PATCH 23/24] always set initial health status for every driver --- client/fingerprint_manager.go | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/client/fingerprint_manager.go b/client/fingerprint_manager.go index 066a809f066f..f1452aeb5547 100644 --- a/client/fingerprint_manager.go +++ b/client/fingerprint_manager.go @@ -174,20 +174,17 @@ func (fm *FingerprintManager) setupDrivers(drivers []string) error { return err } - // For all drivers without health checking enabled , create a driver - // info which matches its fingerprint status. Later, for drivers that - // have the health check interface implemented, a periodic health check - // will be run - if _, isHealthCheck := d.(fingerprint.HealthCheck); !isHealthCheck { - healthInfo := &structs.DriverInfo{ - Healthy: detected, - UpdateTime: time.Now(), - } - if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { - fm.nodeLock.Lock() - fm.node = node - fm.nodeLock.Unlock() - } + // Set the initial health check status to be the driver detected status. + // Later, the periodic health checker will update this value for drivers + // where health checks are enabled. + healthInfo := &structs.DriverInfo{ + Healthy: detected, + UpdateTime: time.Now(), + } + if node := fm.updateNodeFromDriver(name, nil, healthInfo); node != nil { + fm.nodeLock.Lock() + fm.node = node + fm.nodeLock.Unlock() } // Start a periodic watcher to detect changes to a drivers health and From 89ffc9602d543afe2a9be00824f92492fb23b747 Mon Sep 17 00:00:00 2001 From: Chelsea Holland Komlo Date: Wed, 21 Mar 2018 15:32:40 -0400 Subject: [PATCH 24/24] fix up scheduling test --- scheduler/feasible.go | 31 ++++++++++++++++--------------- scheduler/feasible_test.go | 2 +- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index b8bfc758b1a0..4b1a995710bb 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -134,8 +134,7 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { // corresponds with every driver. As a Nomad server might be on a later // version than a Nomad client, we need to check for compatibility here // to verify the client supports this. - if option.Drivers != nil { - driverInfo := option.Drivers[driver] + if driverInfo, ok := option.Drivers[driver]; ok { if driverInfo == nil { c.ctx.Logger(). Printf("[WARN] scheduler.DriverChecker: node %v has no driver info set for %v", @@ -144,21 +143,23 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { } return driverInfo.Detected && driverInfo.Healthy - } else { - value, ok := option.Attributes[driverStr] - if !ok { - return false - } + } - enabled, err := strconv.ParseBool(value) - if err != nil { - c.ctx.Logger(). - Printf("[WARN] scheduler.DriverChecker: node %v has invalid driver setting %v: %v", - option.ID, driverStr, value) - return false - } + value, ok := option.Attributes[driverStr] + if !ok { + return false + } - return enabled + enabled, err := strconv.ParseBool(value) + if err != nil { + c.ctx.Logger(). + Printf("[WARN] scheduler.DriverChecker: node %v has invalid driver setting %v: %v", + option.ID, driverStr, value) + return false + } + + if !enabled { + return false } } return true diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 087945c6415c..7b2129e1c78d 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -127,7 +127,7 @@ func TestDriverChecker(t *testing.T) { } } -func TestDriverChecker_HealthChecks(t *testing.T) { +func Test_HealthChecks(t *testing.T) { require := require.New(t) _, ctx := testContext(t)