From c06891956d11f07c7bada82898cd014372932b36 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 19 Apr 2018 11:11:33 -0700 Subject: [PATCH 1/3] rkt: create parent cgroup to enable stats Having the Nomad executor create parent cgroups that rkt is launched within allows the stats collection code used for the exec driver to Just Work. The only downside is that now the Nomad executor's resource utilization counts against the cgroups resource limits just as it does for the exec driver. --- client/driver/rkt.go | 17 ++++++++++++----- client/driver/rkt_test.go | 1 - 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/client/driver/rkt.go b/client/driver/rkt.go index 0a990dc839d4..1ca82ebb3d5f 100644 --- a/client/driver/rkt.go +++ b/client/driver/rkt.go @@ -194,7 +194,10 @@ func rktGetDriverNetwork(uuid string, driverConfigPortMap map[string]string, log return nil, lastErr } - // This is a successful landing. + // This is a successful landing; log if its not the first attempt. + if try > 1 { + logger.Printf("[DEBUG] driver.rkt: retrieved network info for pod UUID %s on attempt %d", uuid, try) + } return &cstructs.DriverNetwork{ PortMap: portmap, IP: status.Networks[0].IP.String(), @@ -211,7 +214,7 @@ func rktGetDriverNetwork(uuid string, driverConfigPortMap map[string]string, log } waitTime := getJitteredNetworkRetryTime() - logger.Printf("[DEBUG] driver.rkt: getting network info for pod UUID %s failed attempt %d: %v. Sleeping for %v", uuid, try, lastErr, waitTime) + logger.Printf("[DEBUG] driver.rkt: failed getting network info for pod UUID %s attempt %d: %v. Sleeping for %v", uuid, try, lastErr, waitTime) time.Sleep(waitTime) } return nil, fmt.Errorf("timed out, last error: %v", lastErr) @@ -665,9 +668,13 @@ func (d *RktDriver) Start(ctx *ExecContext, task *structs.Task) (*StartResponse, return nil, fmt.Errorf("failed to set executor context: %v", err) } + // Enable ResourceLimits to place the executor in a parent cgroup of + // the rkt container. This allows stats collection via the executor to + // work just like it does for exec. execCmd := &executor.ExecCommand{ - Cmd: absPath, - Args: runArgs, + Cmd: absPath, + Args: runArgs, + ResourceLimits: true, } ps, err := execIntf.LaunchCmd(execCmd) if err != nil { @@ -818,7 +825,7 @@ func (h *rktHandle) Kill() error { } func (h *rktHandle) Stats() (*cstructs.TaskResourceUsage, error) { - return nil, DriverStatsNotImplemented + return h.executor.Stats() } func (h *rktHandle) run() { diff --git a/client/driver/rkt_test.go b/client/driver/rkt_test.go index 914c704ed7a3..b7727eddcb57 100644 --- a/client/driver/rkt_test.go +++ b/client/driver/rkt_test.go @@ -94,7 +94,6 @@ func TestRktDriver_Start_DNS(t *testing.T) { } ctestutils.RktCompatible(t) - // TODO: use test server to load from a fixture task := &structs.Task{ Name: "etcd", Driver: "rkt", From a3dba1db78baaf6e5a4a1201efbbd2d25a686659 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 20 Apr 2018 14:38:03 -0700 Subject: [PATCH 2/3] rkt: test Stats() and always run tests Remove the NOMAD_TEST_RKT flag as a guard for rkt tests. Still require Linux, root, and rkt to be installed. Only check for rkt installation once in hopes of speeding up rkt tests a bit. --- GNUmakefile | 3 +- client/driver/exec_test.go | 2 +- client/driver/rkt_test.go | 160 ++++++++++-------- client/testutil/driver_compatible.go | 19 ++- e2e/rescheduling/server_side_restarts_test.go | 1 - 5 files changed, 111 insertions(+), 74 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index e221a89da080..048951b0586d 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -243,8 +243,7 @@ test: ## Run the Nomad test suite and/or the Nomad UI test suite .PHONY: test-nomad test-nomad: dev ## Run Nomad test suites @echo "==> Running Nomad test suites:" - @NOMAD_TEST_RKT=1 \ - go test $(if $(VERBOSE),-v) \ + @go test $(if $(VERBOSE),-v) \ -cover \ -timeout=900s \ -tags="$(if $(HAS_LXC),lxc)" ./... $(if $(VERBOSE), >test.log ; echo $$? > exit-code) diff --git a/client/driver/exec_test.go b/client/driver/exec_test.go index d854c56a0974..3fab5c8cf5c4 100644 --- a/client/driver/exec_test.go +++ b/client/driver/exec_test.go @@ -331,7 +331,7 @@ func TestExecDriver_HandlerExec(t *testing.T) { handle := resp.Handle // Exec a command that should work and dump the environment - out, code, err := handle.Exec(context.Background(), "/bin/sh", []string{"-c", "env | grep NOMAD"}) + out, code, err := handle.Exec(context.Background(), "/bin/sh", []string{"-c", "env | grep ^NOMAD"}) if err != nil { t.Fatalf("error exec'ing stat: %v", err) } diff --git a/client/driver/rkt_test.go b/client/driver/rkt_test.go index b7727eddcb57..3aae9ff57c47 100644 --- a/client/driver/rkt_test.go +++ b/client/driver/rkt_test.go @@ -22,14 +22,12 @@ import ( "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" - ctestutils "github.com/hashicorp/nomad/client/testutil" + ctestutil "github.com/hashicorp/nomad/client/testutil" ) func TestRktVersionRegex(t *testing.T) { + ctestutil.RktCompatible(t) t.Parallel() - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("NOMAD_TEST_RKT unset, skipping") - } inputRkt := "rkt version 0.8.1" inputAppc := "appc version 1.2.0" @@ -47,12 +45,9 @@ func TestRktVersionRegex(t *testing.T) { // The fingerprinter test should always pass, even if rkt is not installed. func TestRktDriver_Fingerprint(t *testing.T) { + ctestutil.RktCompatible(t) t.Parallel() - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) ctx := testDriverContexts(t, &structs.Task{Name: "foo", Driver: "rkt"}) d := NewRktDriver(ctx.DriverCtx) node := &structs.Node{ @@ -86,14 +81,11 @@ func TestRktDriver_Fingerprint(t *testing.T) { } func TestRktDriver_Start_DNS(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -139,14 +131,11 @@ func TestRktDriver_Start_DNS(t *testing.T) { } func TestRktDriver_Start_Wait(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -213,14 +202,11 @@ func TestRktDriver_Start_Wait(t *testing.T) { } func TestRktDriver_Start_Wait_Skip_Trust(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -269,14 +255,10 @@ func TestRktDriver_Start_Wait_Skip_Trust(t *testing.T) { } func TestRktDriver_Start_Wait_AllocDir(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - - ctestutils.RktCompatible(t) exp := []byte{'w', 'i', 'n'} file := "output.txt" @@ -346,13 +328,10 @@ func TestRktDriver_Start_Wait_AllocDir(t *testing.T) { // TestRktDriver_UserGroup asserts tasks may override the user and group of the // rkt image. func TestRktDriver_UserGroup(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) require := assert.New(t) task := &structs.Task{ @@ -407,13 +386,11 @@ func TestRktDriver_UserGroup(t *testing.T) { } func TestRktTrustPrefix(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) + task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -451,8 +428,9 @@ func TestRktTrustPrefix(t *testing.T) { } func TestRktTaskValidate(t *testing.T) { + ctestutil.RktCompatible(t) t.Parallel() - ctestutils.RktCompatible(t) + task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -475,16 +453,12 @@ func TestRktTaskValidate(t *testing.T) { } } -// TODO: Port Mapping test should be ran with proper ACI image and test the port access. -func TestRktDriver_PortsMapping(t *testing.T) { +func TestRktDriver_PortMapping(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -524,6 +498,7 @@ func TestRktDriver_PortsMapping(t *testing.T) { if err != nil { t.Fatalf("err: %v", err) } + defer resp.Handle.Kill() if resp.Network == nil { t.Fatalf("Expected driver to set a DriverNetwork, but it did not!") } @@ -548,14 +523,11 @@ func TestRktDriver_PortsMapping(t *testing.T) { // TestRktDriver_PortsMapping_Host asserts that port_map isn't required when // host networking is used. func TestRktDriver_PortsMapping_Host(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -590,6 +562,7 @@ func TestRktDriver_PortsMapping_Host(t *testing.T) { if err != nil { t.Fatalf("err: %v", err) } + defer resp.Handle.Kill() if resp.Network != nil { t.Fatalf("No network should be returned with --net=host but found: %#v", resp.Network) } @@ -612,14 +585,11 @@ func TestRktDriver_PortsMapping_Host(t *testing.T) { } func TestRktDriver_HandlerExec(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") - } - ctestutils.RktCompatible(t) task := &structs.Task{ Name: "etcd", Driver: "rkt", @@ -649,24 +619,28 @@ func TestRktDriver_HandlerExec(t *testing.T) { if err != nil { t.Fatalf("err: %v", err) } - - // Give the pod a second to start - time.Sleep(time.Second) + defer resp.Handle.Kill() // Exec a command that should work - out, code, err := resp.Handle.Exec(context.TODO(), "/etcd", []string{"--version"}) - if err != nil { - t.Fatalf("error exec'ing etcd --version: %v", err) - } - if code != 0 { - t.Fatalf("expected `etcd --version` to succeed but exit code was: %d\n%s", code, string(out)) - } - if expected := []byte("etcd version "); !bytes.HasPrefix(out, expected) { - t.Fatalf("expected output to start with %q but found:\n%q", expected, out) - } + testutil.WaitForResult(func() (bool, error) { + out, code, err := resp.Handle.Exec(context.TODO(), "/etcd", []string{"--version"}) + if err != nil { + return false, fmt.Errorf("error exec'ing etcd --version: %v", err) + } + if code != 0 { + return false, fmt.Errorf("expected `etcd --version` to succeed but exit code was: %d\n%s", code, string(out)) + } + if expected := []byte("etcd version "); !bytes.HasPrefix(out, expected) { + return false, fmt.Errorf("expected output to start with %q but found:\n%q", expected, out) + } + + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) // Exec a command that should fail - out, code, err = resp.Handle.Exec(context.TODO(), "/etcd", []string{"--kaljdshf"}) + out, code, err := resp.Handle.Exec(context.TODO(), "/etcd", []string{"--kaljdshf"}) if err != nil { t.Fatalf("error exec'ing bad command: %v", err) } @@ -682,15 +656,69 @@ func TestRktDriver_HandlerExec(t *testing.T) { } } -func TestRktDriver_Remove_Error(t *testing.T) { +func TestRktDriver_Stats(t *testing.T) { + ctestutil.RktCompatible(t) if !testutil.IsTravis() { t.Parallel() } - if os.Getenv("NOMAD_TEST_RKT") == "" { - t.Skip("skipping rkt tests") + + task := &structs.Task{ + Name: "etcd", + Driver: "rkt", + Config: map[string]interface{}{ + "trust_prefix": "coreos.com/etcd", + "image": "coreos.com/etcd:v2.0.4", + "command": "/etcd", + }, + LogConfig: &structs.LogConfig{ + MaxFiles: 10, + MaxFileSizeMB: 10, + }, + Resources: &structs.Resources{ + MemoryMB: 128, + CPU: 100, + }, } - ctestutils.RktCompatible(t) + ctx := testDriverContexts(t, task) + defer ctx.AllocDir.Destroy() + d := NewRktDriver(ctx.DriverCtx) + + if _, err := d.Prestart(ctx.ExecCtx, task); err != nil { + t.Fatalf("error in prestart: %v", err) + } + resp, err := d.Start(ctx.ExecCtx, task) + if err != nil { + t.Fatalf("err: %v", err) + } + defer resp.Handle.Kill() + + testutil.WaitForResult(func() (bool, error) { + stats, err := resp.Handle.Stats() + if err != nil { + return false, err + } + if stats == nil || stats.ResourceUsage == nil { + return false, fmt.Errorf("stats is nil") + } + if stats.ResourceUsage.CpuStats.TotalTicks == 0 { + return false, fmt.Errorf("cpu ticks unset") + } + if stats.ResourceUsage.MemoryStats.RSS == 0 { + return false, fmt.Errorf("rss stats unset") + } + return true, nil + }, func(err error) { + t.Fatalf("error: %v", err) + }) + +} + +func TestRktDriver_Remove_Error(t *testing.T) { + ctestutil.RktCompatible(t) + if !testutil.IsTravis() { + t.Parallel() + } // Removing a nonexistent pod should return an error if err := rktRemove("00000000-0000-0000-0000-000000000000"); err == nil { diff --git a/client/testutil/driver_compatible.go b/client/testutil/driver_compatible.go index 97f994c444db..7d928459a533 100644 --- a/client/testutil/driver_compatible.go +++ b/client/testutil/driver_compatible.go @@ -3,6 +3,7 @@ package testutil import ( "os/exec" "runtime" + "sync" "syscall" "testing" ) @@ -38,13 +39,23 @@ func QemuCompatible(t *testing.T) { } } +var rktExists bool +var rktOnce sync.Once + func RktCompatible(t *testing.T) { - if runtime.GOOS == "windows" || syscall.Geteuid() != 0 { - t.Skip("Must be root on non-windows environments to run test") + if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { + t.Skip("Must be root on Linux to run test") } + // else see if rkt exists - _, err := exec.Command("rkt", "version").CombinedOutput() - if err != nil { + rktOnce.Do(func() { + _, err := exec.Command("rkt", "version").CombinedOutput() + if err == nil { + rktExists = true + } + }) + + if !rktExists { t.Skip("Must have rkt installed for rkt specific tests to run") } } diff --git a/e2e/rescheduling/server_side_restarts_test.go b/e2e/rescheduling/server_side_restarts_test.go index fb260b0b8649..9f0f402df4af 100644 --- a/e2e/rescheduling/server_side_restarts_test.go +++ b/e2e/rescheduling/server_side_restarts_test.go @@ -6,7 +6,6 @@ import ( "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/jobspec" - _ "github.com/hashicorp/nomad/jobspec" . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" From a98ec5a8012cea1e6911de6bad57e854aae521be Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 24 Apr 2018 14:10:10 -0700 Subject: [PATCH 3/3] docs: add changelog entry for rkt stats #4188 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a71978ec71fd..199ebe5cc438 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ IMPROVEMENTS: * client: Create new process group on process startup. [[GH-3572](https://github.com/hashicorp/nomad/issues/3572)] + * driver/rkt: Enable stats collection for rkt tasks [[GH-4188](https://github.com/hashicorp/nomad/pull/4188)] BUG FIXES: * driver/exec: Create process group for Windows process and send Ctrl-Break signal on Shutdown [[GH-2117](https://github.com/hashicorp/nomad/issues/2117)]