Skip to content

Commit

Permalink
🐞 Fix: deploying opsman to vSphere 15% boot fail
Browse files Browse the repository at this point in the history
When deploying opsman to vSphere, it fails to boot 15% of the time. It
happens very early in the boot process, apparently even before loading
the kernel. When viewing the opsman VM's console, the symptom is a
flashing cursor in the upper-left corner of the screen.

This commit fixes that failure by waiting 80 seconds for the opsman VM
to report its IP address to vCenter, and if it hasn't reported its IP
address by then, it sends a hardware reset to the VM. An opsman VM
typically reports its IP address to vCenter 43 seconds after being
powered-on.

We verified this fix by successfully deploying & booting opsman 146
times in a row.

More about the boot failure:

- The boot failure only occurs the very first time an opsman is booted;
  subsequent boots will always succeed. We tested 100 shutdown/boots to
  confirm.
- The failure was seen both on vSphere 7 and vSphere 8.
- Sending a reset or a Ctrl-Alt-Del to the machine within the first few
  seconds of being powered-on reduced but did not eliminate the failure.

This fix should have negligible impact on the length of time to deploy
opsman.

Typical output when resetting a failed initial boot:

```
Executing: "govc vm.info -vm.ipath=/dc/vm/pcf_vms/om.tas.nono.io -waitip"
This could take a few moments...
VM hasn't acquired IP, is probably stuck, resetting VM to free it

Executing: "govc vm.power -vm.ipath=/dc/vm/pcf_vms/om.tas.nono.io -reset"
This could take a few moments...
govc[stdout]: Reset VirtualMachine:vm-42616... OK
```
  • Loading branch information
cunnie authored and wayneadams committed May 1, 2024
1 parent 2af88e8 commit 206d60e
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 10 deletions.
7 changes: 6 additions & 1 deletion vmlifecycle/runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package runner

import (
"bytes"
"context"
"fmt"
"github.com/fatih/color"
"github.com/onsi/gomega/gexec"
Expand Down Expand Up @@ -35,6 +36,10 @@ func (r *Runner) Execute(args []interface{}) (*bytes.Buffer, *bytes.Buffer, erro
}

// ExecuteWithEnvVars runs the command through ExecuteWithEnvVarsCtx using
// context.Background(), i.e. without any deadline or cancellation. It exists
// so that callers that do not need a context keep their original call shape.
func (r *Runner) ExecuteWithEnvVars(env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error) {
	ctx := context.Background()
	return r.ExecuteWithEnvVarsCtx(ctx, env, args)
}

func (r *Runner) ExecuteWithEnvVarsCtx(ctx context.Context, env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error) {
var outBufWriter bytes.Buffer
var errBufWriter bytes.Buffer

Expand All @@ -53,7 +58,7 @@ func (r *Runner) ExecuteWithEnvVars(env []string, args []interface{}) (*bytes.Bu
}
}

command := exec.Command(r.command, stringArgs...)
command := exec.CommandContext(ctx, r.command, stringArgs...)
if len(env) > 0 {
command.Env = append(os.Environ(), env...)
}
Expand Down
98 changes: 98 additions & 0 deletions vmlifecycle/vmmanagers/fakes/govcRunner.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 42 additions & 7 deletions vmlifecycle/vmmanagers/vsphere.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ package vmmanagers
import (
"archive/tar"
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"github.com/blang/semver"
"github.com/pivotal-cf/om/vmlifecycle/extractopsmansemver"
"io/ioutil"
"log"
"os"
"strconv"
"strings"

"github.com/blang/semver"
"github.com/pivotal-cf/om/vmlifecycle/extractopsmansemver"
"time"
)

type VcenterCredential struct {
Expand Down Expand Up @@ -74,6 +75,7 @@ type networkMapping struct {
// govcRunner is the command-execution seam through which this package invokes
// govc subcommands (for example "import.ova", "vm.info" and "vm.power").
// The go:generate directive below produces a counterfeiter fake named
// GovcRunner in ./fakes/govcRunner.go for use in tests.
//
//go:generate counterfeiter -o ./fakes/govcRunner.go --fake-name GovcRunner . govcRunner
type govcRunner interface {
// ExecuteWithEnvVars runs the command with the given args and extra
// environment entries. It returns two buffers and an error; callers in this
// file treat the second buffer as the command's error output.
// NOTE(review): presumably the first buffer is stdout — confirm against the
// Runner implementation.
ExecuteWithEnvVars(env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error)
// ExecuteWithEnvVarsCtx is the context-aware variant of ExecuteWithEnvVars;
// createVM passes it a context with a timeout so that a wait for the VM's IP
// can be bounded.
ExecuteWithEnvVarsCtx(ctx context.Context, env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error)
}

type VsphereVMManager struct {
Expand Down Expand Up @@ -166,7 +168,7 @@ func (v *VsphereVMManager) CreateVM() (Status, StateInfo, error) {

ipath := v.createIpath()

errBufWriter, err := v.createVM(env, optionFilename)
errBufWriter, err := v.createVM(env, optionFilename, ipath)
fullState := StateInfo{IAAS: "vsphere", ID: ipath}

if err != nil {
Expand Down Expand Up @@ -319,14 +321,47 @@ func (v *VsphereVMManager) validateImage() error {
}
}

func (v *VsphereVMManager) createVM(env []string, optionFilename string) (errorBuffer *bytes.Buffer, err error) {
_, errBufWriter, err := v.runner.ExecuteWithEnvVars(env, []interface{}{
// createVM imports the OVA with govc and then makes sure the freshly created
// VM actually boots.
//
// Steps:
//  1. Run "govc import.ova" with the generated options file. Any failure is
//     returned immediately together with the command's error buffer.
//  2. Run "govc vm.info -waitip" against ipath under an 80 second timeout to
//     wait for the VM to report an IP address.
//  3. If the timeout expired, the VM is assumed to be stuck in its first boot
//     and a hardware reset ("govc vm.power -reset") is issued. A successful
//     reset is treated as success; the function does not wait for the IP a
//     second time.
//
// Parameters:
//   - env: extra environment entries passed to every govc invocation.
//   - optionFilename: path of the import options file handed to import.ova.
//   - ipath: inventory path of the VM, used for vm.info and vm.power.
//
// Returns the error buffer of the last relevant govc command and a non-nil
// error if the import, the wait (for a reason other than the timeout), or the
// reset failed.
func (v *VsphereVMManager) createVM(env []string, optionFilename string, ipath string) (errBufWriter *bytes.Buffer, err error) {
	// 80 seconds is adequate time for OM to get IP; typically it's 43 seconds
	const waitForIPTimeout = 80 * time.Second

	_, errBufWriter, err = v.runner.ExecuteWithEnvVars(env, []interface{}{
		"import.ova",
		"-options=" + optionFilename,
		v.ImageOVA,
	})
	if err != nil {
		return errBufWriter, checkFormatedError("govc error: %s", err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), waitForIPTimeout)
	defer cancel()

	// Wait (bounded by the timeout) for the VM to boot and acquire its IP.
	_, errBufWriter, err = v.runner.ExecuteWithEnvVarsCtx(ctx, env, []interface{}{
		"vm.info",
		fmt.Sprintf(`-vm.ipath=%s`, ipath),
		"-waitip",
	})

	if ctx.Err() != nil {
		// The wait was cut short by the context: the VM hasn't acquired an IP,
		// is likely stuck, so reset it to let it boot.
		resetBuf, errPowerReset := v.resetVM(env, ipath)
		if errPowerReset != nil {
			// Report the reset command's own error buffer, since that is the
			// command whose failure is being returned.
			return resetBuf, fmt.Errorf("govc error: could not power-reset: %w", errPowerReset)
		}
		return errBufWriter, nil
	}

	if err != nil {
		// vm.info failed for a reason other than the timeout.
		return errBufWriter, checkFormatedError("govc error: %s", err)
	}
	return errBufWriter, nil
}

// resetVM issues "govc vm.power -reset" for the VM at ipath, passing env as
// extra environment entries. It returns the command's error buffer and the
// error from the runner unchanged; the first buffer returned by the runner is
// discarded.
func (v *VsphereVMManager) resetVM(env []string, ipath string) (errBufWriter *bytes.Buffer, err error) {
	powerResetArgs := []interface{}{
		"vm.power",
		fmt.Sprintf(`-vm.ipath=%s`, ipath),
		"-reset",
	}
	_, errBufWriter, err = v.runner.ExecuteWithEnvVars(env, powerResetArgs)
	return errBufWriter, err
}

func (v *VsphereVMManager) addDefaultConfigFields() {
Expand Down
11 changes: 9 additions & 2 deletions vmlifecycle/vmmanagers/vsphere_test.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
package vmmanagers_test

import (
"archive/tar"
"fmt"
"io/ioutil"
"os"

"archive/tar"

"bytes"
"errors"
"io"
Expand Down Expand Up @@ -111,6 +110,14 @@ opsman-configuration:
"-on=true",
"-vm.ipath=/datacenter/vm/folder/vm_name",
))

_, _, args = runner.ExecuteWithEnvVarsCtxArgsForCall(0)
Expect(args).To(matchers.OrderedConsistOf(
"vm.info",
"-vm.ipath=/datacenter/vm/folder/vm_name",
"-waitip",
))
Expect(runner.ExecuteWithEnvVarsCtxCallCount()).To(Equal(1))
})

When("setting custom cpu and memory", func() {
Expand Down

0 comments on commit 206d60e

Please sign in to comment.