From 6bc3b758d853347b4389b873d5e5054145d96eec Mon Sep 17 00:00:00 2001 From: Jonathan Boulle Date: Fri, 13 Jun 2014 16:50:52 -0700 Subject: [PATCH] fix(tests): plethora of fixes to functional tests - remove update-ca-certificates from node start, dramatically reducing container spin-up time - removing time.Sleep when spinning up a node; instead, poll until it looks like the container is fully booted - cleaning up network interfaces more thoroughly between tests, preventing a race where one test failure will cause a cascading failure in subsequent tests - copies /etc/nsswitch.conf into container to ensure users are available during startup (in the non-container world this is handled by initrd) --- functional/README.md | 8 ++- functional/cluster_test.go | 11 +--- functional/platform/nspawn.go | 121 ++++++++++++++++++++++++---------- functional/util/util.go | 1 - 4 files changed, 93 insertions(+), 48 deletions(-) diff --git a/functional/README.md b/functional/README.md index 11d26fa18..da20f47a2 100644 --- a/functional/README.md +++ b/functional/README.md @@ -47,12 +47,14 @@ ssh-add functional/fixtures/id_rsa export GOPATH="$(pwd)/gopath" export FLEET_BIN="$(pwd)/bin/fleet" export FLEETCTL_BIN="$(pwd)/bin/fleetctl" -sudo -E env PATH=$PATH go test github.com/coreos/fleet/functional +sudo -E env PATH=$PATH go test github.com/coreos/fleet/functional -v ``` If the tests are aborted partway through, it's currently possible for them to leave residual state as a result of the systemd-nspawn operations. Manual cleanup can be performed as follows: ``` -sudo rm -fr /run/systemd/system/*smoke* -machinectl --no-legend | cut -d ' ' -f1 | sudo xargs machinectl terminate +machinectl --no-legend | cut -d ' ' -f1 | sudo xargs -r machinectl terminate +sudo pkill -9 systemd-nspawn +sudo rm -fr /run/systemd/system/*smoke* /tmp/smoke sudo systemctl daemon-reload +ip link show fleet0 >/dev/null && sudo ip link del fleet0 ``` diff --git a/functional/cluster_test.go b/functional/cluster_test.go index 296788686..27093ae0a 100644 --- a/functional/cluster_test.go +++ b/functional/cluster_test.go @@ -140,18 +140,9 @@ func TestDynamicClusterMemberReboot(t *testing.T) { t.Fatal(err) } - found := false for _, unit := range []string{"conflict.0.service", "conflict.1.service", "conflict.2.service"} { if oldStates[unit].Machine != newStates[unit].Machine { - if found { - t.Fatal("More than one unit migrated") - } else { - found = true - } + t.Fatalf("Unit %s migrated unexpectedly", unit) } } - - if !found { - t.Fatal("Could not find migrated unit") - } } diff --git a/functional/platform/nspawn.go b/functional/platform/nspawn.go index 0daee076b..2dfb9437b 100644 --- a/functional/platform/nspawn.go +++ b/functional/platform/nspawn.go @@ -32,9 +32,14 @@ func init() { } } +type member struct { + ip string + pid int +} + type nspawnCluster struct { name string - members map[string]string + members map[string]*member } func (nc *nspawnCluster) keyspace() string { @@ -156,7 +161,7 @@ func (nc *nspawnCluster) Members() []string { } func (nc *nspawnCluster) MemberCommand(member string, args ...string) (string, error) { - ip := nc.members[member] + ip := nc.members[member].ip baseArgs := []string{"-o", "UserKnownHostsFile=/dev/null", "-o", "StrictHostKeyChecking=no", fmt.Sprintf("core@%s", ip)} args = append(baseArgs, args...) log.Printf("ssh %s", strings.Join(args, " ")) @@ -172,8 +177,8 @@ func (nc *nspawnCluster) findUsableIP() (string, error) { ip: for octet := base; octet < 256; octet++ { ip := fmt.Sprintf("172.17.1.%d", octet) - for _, memberIP := range nc.members { - if ip == memberIP { + for _, member := range nc.members { + if ip == member.ip { continue ip } } @@ -188,30 +193,57 @@ func (nc *nspawnCluster) CreateMember(name string, cfg MachineConfig) (err error if err != nil { return err } - nc.members[name] = ip + nc.members[name] = &member{ip: ip} basedir := path.Join(os.TempDir(), nc.name) fsdir := path.Join(basedir, name, "fs") cmds := []string{ + // set up directory for fleet service fmt.Sprintf("mkdir -p %s/etc/systemd/system", fsdir), - fmt.Sprintf("cp /etc/os-release %s/etc", fsdir), + + // update-ca-certificates takes an inordinate amount of time, so simply mask it for now + // until https://github.com/coreos/coreos-overlay/pull/681 is integrated + fmt.Sprintf("ln -s /dev/null %s/etc/systemd/system/update-ca-certificates.service", fsdir), + + // since we're in a container we lack initrd bootstrapping magic + // https://github.com/coreos/bootengine/blob/master/dracut/80setup-root/pre-pivot-setup-root.sh#L28 + // so until this is fixed, manually copy nsswitch.conf so that systemd-tmpfiles.service can access users it needs + // https://github.com/coreos/init/pull/111/ + fmt.Sprintf("cp /etc/nsswitch.conf %s/etc", fsdir), + + // minimum requirements for running systemd/coreos in a container fmt.Sprintf("mkdir -p %s/usr", fsdir), + fmt.Sprintf("cp /etc/os-release %s/etc", fsdir), + fmt.Sprintf("ln -s /proc/self/mounts %s/etc/mtab", fsdir), fmt.Sprintf("ln -s usr/lib64 %s/lib64", fsdir), fmt.Sprintf("ln -s lib64 %s/lib", fsdir), fmt.Sprintf("ln -s usr/bin %s/bin", fsdir), fmt.Sprintf("ln -s usr/sbin %s/sbin", fsdir), fmt.Sprintf("mkdir -p %s/home/core", fsdir), fmt.Sprintf("chown core:core %s/home/core", fsdir), + + // set up directory for machine-id (see below) + fmt.Sprintf("mkdir -p %s/var/lib/dbus", fsdir), } for _, cmd := range cmds { - _, _, err = run(cmd) + var stderr, stdout string + stdout, stderr, err = run(cmd) if err != nil { - log.Printf("Command '%s' failed: %v", cmd, err) + log.Printf("Command '%s' failed:\nstdout:: %s\nstderr: %s\nerr: %v", cmd, stdout, stderr, err) return } } + // Write machine-id manually to override systemd picking up the host OS's machine-id in the case of the host being a KVM + // (otherwise all smoke machines will have the same machine-id, causing havoc) + // TODO(jonboulle): this should be fixed upstream in fdd25311706bd32580ec4d43211cdf4665d2f9de; remove once newer systemd is deployed + uuid := fmt.Sprintf("0000000000000000000000000000000%s\n", name) + if err = ioutil.WriteFile(path.Join(fsdir, "/var/lib/dbus/machine-id"), []byte(uuid), 0755); err != nil { + log.Printf("Failed writing machine-id: %v", err) + return + } + sshKeySrc := path.Join("fixtures", "id_rsa.pub") if err = nc.prepFleet(fsdir, ip, sshKeySrc, fleetBinPath, cfg); err != nil { log.Printf("Failed preparing fleet in filesystem: %v", err) @@ -226,29 +258,53 @@ func (nc *nspawnCluster) CreateMember(name string, cfg MachineConfig) (err error return } - time.Sleep(time.Second) + pid, err := nc.machinePID(name) + if err != nil { + log.Printf("Failed detecting machine %s%s PID: %v", nc.name, name, err) + return err + } + + alarm := time.After(10 * time.Second) + for { + select { + case <-alarm: + log.Printf("Timed out waiting for machine to start") + return + default: + } + // TODO(jonboulle): probably a cleaner way to check here + if _, _, e := nc.nsenter(pid, "systemd-analyze"); e == nil { + break + } + time.Sleep(100 * time.Millisecond) + + } + + nc.members[name].pid = pid + + var stderr string cmd := fmt.Sprintf("ip addr add %s/16 dev host0", ip) - err = nc.nsenter(name, cmd) + _, stderr, err = nc.nsenter(pid, cmd) if err != nil { - log.Printf("Failed adding IP address to container") + log.Printf("Failed adding IP address to container: %s", stderr) return } cmd = fmt.Sprintf("update-ssh-keys -u core -a fleet /opt/fleet/id_rsa.pub") - err = nc.nsenter(name, cmd) + _, _, err = nc.nsenter(pid, cmd) if err != nil { log.Printf("Failed authorizing SSH key in container") return } - err = nc.nsenter(name, "ln -s /opt/fleet/fleet.service /etc/systemd/system/fleet.service") + _, _, err = nc.nsenter(pid, "ln -s /opt/fleet/fleet.service /etc/systemd/system/fleet.service") if err != nil { log.Printf("Failed symlinking fleet.service: %v", err) return } - err = nc.nsenter(name, "systemctl start fleet.service") + _, _, err = nc.nsenter(pid, "systemctl start fleet.service") if err != nil { log.Printf("Failed starting fleet.service: %v", err) return @@ -273,15 +329,19 @@ func (nc *nspawnCluster) Destroy() error { // altogether until this is fixed. run("etcdctl rm --recursive " + nc.keyspace()) + run("ip link del fleet0") + return nil } func (nc *nspawnCluster) DestroyMember(name string) error { dir := path.Join(os.TempDir(), nc.name, name) + label := fmt.Sprintf("%s%s", nc.name, name) cmds := []string{ fmt.Sprintf("machinectl terminate %s%s", nc.name, name), - fmt.Sprintf("rm -f /run/systemd/system/%s%s.service", nc.name, name), - fmt.Sprintf("rm -fr /run/systemd/system/%s%s.service.d", nc.name, name), + fmt.Sprintf("rm -f /run/systemd/system/machine-%s.scope", label), + fmt.Sprintf("rm -f /run/systemd/system/%s.service", label), + fmt.Sprintf("rm -fr /run/systemd/system/%s.service.d", label), fmt.Sprintf("rm -r %s", dir), } @@ -292,6 +352,10 @@ func (nc *nspawnCluster) DestroyMember(name string) error { } } + // Unfortunately nspawn doesn't always seem to tear down the interfaces + // in time, which can result in subsequent tests failing + run(fmt.Sprintf("ip link del vb-%s", label)) + if err := nc.systemdReload(); err != nil { log.Printf("Failed systemd daemon-reload: %v", err) } @@ -335,13 +399,14 @@ func (nc *nspawnCluster) systemd(unitName, exec string) error { return err } +// wait up to 10s for a machine to be started func (nc *nspawnCluster) machinePID(name string) (int, error) { - for i := 0; i < 5; i++ { + for i := 0; i < 100; i++ { mach := fmt.Sprintf("%s%s", nc.name, name) stdout, _, err := run(fmt.Sprintf("machinectl show -p Leader %s", mach)) if err != nil { - if i != 4 { - time.Sleep(time.Second) + if i != -1 { + time.Sleep(100 * time.Millisecond) continue } return -1, fmt.Errorf("failed detecting machine %s status: %v", mach, err) @@ -353,25 +418,13 @@ func (nc *nspawnCluster) machinePID(name string) (int, error) { return -1, fmt.Errorf("unable to detect machine PID") } -func (nc *nspawnCluster) nsenter(name string, cmd string) error { - pid, err := nc.machinePID(name) - if err != nil { - log.Printf("Failed detecting machine %s%s PID: %v", nc.name, name, err) - return err - } - +func (nc *nspawnCluster) nsenter(pid int, cmd string) (string, string, error) { cmd = fmt.Sprintf("nsenter -t %d -m -n -p -- %s", pid, cmd) - _, _, err = run(cmd) - if err != nil { - log.Printf("Command '%s' failed: %v", cmd, err) - return err - } - - return nil + return run(cmd) } func NewNspawnCluster(name string) (Cluster, error) { - nc := &nspawnCluster{name, map[string]string{}} + nc := &nspawnCluster{name, map[string]*member{}} err := nc.prepCluster() return nc, err } diff --git a/functional/util/util.go b/functional/util/util.go index 1646bec8c..91360cc54 100644 --- a/functional/util/util.go +++ b/functional/util/util.go @@ -66,7 +66,6 @@ func RunFleetctlWithInput(input string, args ...string) (string, string, error) // Wait up to 10s to find the specified number of machines, retrying periodically. func WaitForNMachines(fleetctl fleetfunc, count int) ([]string, error) { var machines []string - timeout := 10 * time.Second alarm := time.After(timeout)