Skip to content

Commit

Permalink
cmd/openshift-install/analyze: Attempt to analyze bootstrap tarballs
Browse files Browse the repository at this point in the history
Instead of just dropping them into the user's lap ("here's a big
tarball, have fun"), look through them for obvious things that we can
summarize.  With:

   func runGatherBootstrapCmd(directory string) error {
  +       return analyzeGatheredBootstrap("/tmp/log-bundle.tar.gz")

to feed [1] into the analysis logic, the output looks like:

  WARNING control-plane/10.0.134.229 had failing systemd units: crio.service
  WARNING control-plane/10.0.134.229: crio.service:
  ● crio.service - Open Container Initiative Daemon
     Loaded: loaded (/usr/lib/systemd/system/crio.service; disabled; vendor preset: disabled)
    Drop-In: /etc/systemd/system/crio.service.d
             └─10-default-env.conf
     Active: failed (Result: exit-code) since Thu 2019-10-24 11:11:31 UTC; 320ms ago
       Docs: https://github.com/cri-o/cri-o
    Process: 8491 ExecStart=/usr/bin/crio $CRIO_STORAGE_OPTIONS $CRIO_NETWORK_OPTIONS $CRIO_METRICS_OPTIONS (code=exited, status=1/FAILURE)
   Main PID: 8491 (code=exited, status=1/FAILURE)
        CPU: 144ms

  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: Starting Open Container Initiative Daemon...
  Oct 24 11:11:31 ip-10-0-134-229 crio[8491]: time="2019-10-24 11:11:31.895986612Z" level=fatal msg="opening seccomp profile (/etc/crio/seccomp.json) failed: open /etc/crio/seccomp.json: no such file or directory"
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: crio.service: Main process exited, code=exited, status=1/FAILURE
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: crio.service: Failed with result 'exit-code'.
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: Failed to start Open Container Initiative Daemon.
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: crio.service: Consumed 144ms CPU time
  WARNING control-plane/10.0.134.243 had failing systemd units: crio.service
  WARNING control-plane/10.0.134.243: crio.service:
  ● crio.service - Open Container Initiative Daemon
     Loaded: loaded (/usr/lib/systemd/system/crio.service; disabled; vendor preset: disabled)
    Drop-In: /etc/systemd/system/crio.service.d
             └─10-default-env.conf
     Active: failed (Result: exit-code) since Thu 2019-10-24 11:11:35 UTC; 8s ago
       Docs: https://github.com/cri-o/cri-o
    Process: 8439 ExecStart=/usr/bin/crio $CRIO_STORAGE_OPTIONS $CRIO_NETWORK_OPTIONS $CRIO_METRICS_OPTIONS (code=exited, status=1/FAILURE)
   Main PID: 8439 (code=exited, status=1/FAILURE)
        CPU: 151ms

  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: Starting Open Container Initiative Daemon...
  Oct 24 11:11:35 ip-10-0-134-243 crio[8439]: time="2019-10-24 11:11:35.238163016Z" level=fatal msg="opening seccomp profile (/etc/crio/seccomp.json) failed: open /etc/crio/seccomp.json: no such file or directory"
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: crio.service: Main process exited, code=exited, status=1/FAILURE
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: crio.service: Failed with result 'exit-code'.
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: Failed to start Open Container Initiative Daemon.
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: crio.service: Consumed 151ms CPU time
  WARNING control-plane/10.0.157.61 had failing systemd units: crio.service
  WARNING control-plane/10.0.157.61: crio.service:
  ● crio.service - Open Container Initiative Daemon
     Loaded: loaded (/usr/lib/systemd/system/crio.service; disabled; vendor preset: disabled)
    Drop-In: /etc/systemd/system/crio.service.d
             └─10-default-env.conf
     Active: failed (Result: exit-code) since Thu 2019-10-24 11:11:36 UTC; 1s ago
       Docs: https://github.com/cri-o/cri-o
    Process: 8379 ExecStart=/usr/bin/crio $CRIO_STORAGE_OPTIONS $CRIO_NETWORK_OPTIONS $CRIO_METRICS_OPTIONS (code=exited, status=1/FAILURE)
   Main PID: 8379 (code=exited, status=1/FAILURE)
        CPU: 158ms

  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: Starting Open Container Initiative Daemon...
  Oct 24 11:11:36 ip-10-0-157-61 crio[8379]: time="2019-10-24 11:11:36.807284677Z" level=fatal msg="opening seccomp profile (/etc/crio/seccomp.json) failed: open /etc/crio/seccomp.json: no such file or directory"
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: crio.service: Main process exited, code=exited, status=1/FAILURE
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: crio.service: Failed with result 'exit-code'.
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: Failed to start Open Container Initiative Daemon.
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: crio.service: Consumed 158ms CPU time

That's maybe a bit noisy, but mostly because all three control-plane
machines failed the same way.

It might be worth exposing this as:

  $ openshift-install analyze bootstrap PATH

so folks could look at bootstrap logs which had been gathered by
third parties, but I'm punting on that for now.

[1]: https://storage.googleapis.com/origin-ci-test/logs/release-promote-openshift-machine-os-content-e2e-aws-4.3/2455/artifacts/e2e-aws/installer/log-bundle-20191024111122.tar
  • Loading branch information
wking committed Oct 24, 2019
1 parent 7b6e7f0 commit bbc87bc
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 0 deletions.
128 changes: 128 additions & 0 deletions cmd/openshift-install/analyze.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package main

import (
"archive/tar"
"bufio"
"compress/gzip"
"io"
"io/ioutil"
"os"
"path"
"regexp"
"sort"
"strings"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)

// failedUnitRegexp matches one line of a failed-units.txt listing: a
// literal "*" marker, a space, and then the unit name, which is captured
// in the named group "unit". The unit name must be followed by a space.
var failedUnitRegexp = regexp.MustCompile(`^[*] (?P<unit>[^ ]*) .*`)

func analyzeGatheredBootstrap(tarPath string) error {
file, err := os.Open(tarPath)
if err != nil {
return err
}
defer file.Close()

gzipReader, err := gzip.NewReader(file)
if err != nil {
return err
}
defer gzipReader.Close()

tarReader := tar.NewReader(gzipReader)
failedUnitSummary := map[string][]string{}
failedUnitDetail := map[string]map[string]string{}
for {
header, err := tarReader.Next()
if err == io.EOF {
break
}
if err != nil {
return err
}

baseName := path.Base(header.Name)

if baseName == "failed-units.txt" {
hostKey := path.Dir(header.Name) // ./control-plane/10.0.0.1/failed-units.txt -> ./control-plane/10.0.0.1
failedUnitSummary[hostKey] = []string{}

scanner := bufio.NewScanner(tarReader)
for scanner.Scan() {
line := scanner.Text()
matches := failedUnitRegexp.FindStringSubmatch(line)
if matches != nil {
for i, name := range failedUnitRegexp.SubexpNames() {
if name == "unit" {
failedUnitSummary[hostKey] = append(failedUnitSummary[hostKey], matches[i])
}
}
}
}
if err := scanner.Err(); err != nil {
return errors.Wrapf(err, "reading line from %q", header.Name)
}
}

if path.Base(path.Dir(header.Name)) == "unit-status" {
hostKey := path.Dir(path.Dir(header.Name)) // ./control-plane/10.0.0.1/unit-status/crio.service.txt -> ./control-plane/10.0.0.1
unitKey := strings.TrimSuffix(baseName, path.Ext(baseName)) // crio.service.txt -> crio.service
if _, ok := failedUnitDetail[hostKey]; !ok {
failedUnitDetail[hostKey] = map[string]string{}
}

data, err := ioutil.ReadAll(tarReader)
if err != nil {
return errors.Wrapf(err, "reading %q", header.Name)
}
failedUnitDetail[hostKey][unitKey] = string(data)
}
}

if len(failedUnitSummary) == 0 {
return errors.New("no failed-units.txt entries found")
}

hosts := make([]string, 0, len(failedUnitSummary))
for hostKey := range failedUnitSummary {
hosts = append(hosts, hostKey)
}
sort.Strings(hosts)

for _, hostKey := range hosts {
host := hostKey
if hostKey == "." {
host = "bootstrap"
}
units := failedUnitSummary[hostKey]

if len(units) == 0 {
logrus.Infof("%s had no failing systemd units", host)
continue
}

details, ok := failedUnitDetail[hostKey]
if !ok {
logrus.Warnf("%s had failing systemd units (%s), but we did not collect details", host, strings.Join(units, ", "))
continue
}

logrus.Warnf("%s had failing systemd units: %s", host, strings.Join(units, ", "))
for _, unitKey := range units {
detail, ok := details[unitKey]
if ok && len(detail) > 0 {
logrus.Warnf("%s: %s:\n%s", host, unitKey, detail)
} else {
logrus.Warnf("%s: %s: failed, but we did not collect details", host, unitKey)
}
}
}

if len(hosts) == 1 && hosts[0] == "." {
logrus.Warnf("no control plane machines in the gathered tarball")
}

return nil
}
4 changes: 4 additions & 0 deletions cmd/openshift-install/gather.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ func logGatherBootstrap(bootstrap string, port int, masters []string, directory
return errors.Wrap(err, "failed to pull log file from remote")
}
logrus.Infof("Bootstrap gather logs captured here %q", file)

if err := analyzeGatheredBootstrap(file); err != nil {
return errors.Wrapf(err, "failed to analyze %q", file)
}
return nil
}

Expand Down

0 comments on commit bbc87bc

Please sign in to comment.