Skip to content

Commit

Permalink
cmd/openshift-install/analyze: Attempt to analyze bootstrap tarballs
Browse files Browse the repository at this point in the history
Instead of just dropping them into the users lap "here's a big
tarball, have fun", look through them for obvious things that we can
summarize.  With:

   func runGatherBootstrapCmd(directory string) error {
  +       return analyzeGatheredBootstrap("/tmp/log-bundle.tar.gz")

to feed [1] into the analysis logic, the output looks like:

  WARNING control-plane/10.0.134.229 had failing systemd units: crio.service
  WARNING control-plane/10.0.134.229: crio.service:
  ● crio.service - Open Container Initiative Daemon
     Loaded: loaded (/usr/lib/systemd/system/crio.service; disabled; vendor preset: disabled)
    Drop-In: /etc/systemd/system/crio.service.d
             └─10-default-env.conf
     Active: failed (Result: exit-code) since Thu 2019-10-24 11:11:31 UTC; 320ms ago
       Docs: https://github.com/cri-o/cri-o
    Process: 8491 ExecStart=/usr/bin/crio $CRIO_STORAGE_OPTIONS $CRIO_NETWORK_OPTIONS $CRIO_METRICS_OPTIONS (code=exited, status=1/FAILURE)
   Main PID: 8491 (code=exited, status=1/FAILURE)
        CPU: 144ms

  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: Starting Open Container Initiative Daemon...
  Oct 24 11:11:31 ip-10-0-134-229 crio[8491]: time="2019-10-24 11:11:31.895986612Z" level=fatal msg="opening seccomp profile (/etc/crio/seccomp.json) failed: open /etc/crio/seccomp.json: no such file or directory"
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: crio.service: Main process exited, code=exited, status=1/FAILURE
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: crio.service: Failed with result 'exit-code'.
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: Failed to start Open Container Initiative Daemon.
  Oct 24 11:11:31 ip-10-0-134-229 systemd[1]: crio.service: Consumed 144ms CPU time
  WARNING control-plane/10.0.134.243 had failing systemd units: crio.service
  WARNING control-plane/10.0.134.243: crio.service:
  ● crio.service - Open Container Initiative Daemon
     Loaded: loaded (/usr/lib/systemd/system/crio.service; disabled; vendor preset: disabled)
    Drop-In: /etc/systemd/system/crio.service.d
             └─10-default-env.conf
     Active: failed (Result: exit-code) since Thu 2019-10-24 11:11:35 UTC; 8s ago
       Docs: https://github.com/cri-o/cri-o
    Process: 8439 ExecStart=/usr/bin/crio $CRIO_STORAGE_OPTIONS $CRIO_NETWORK_OPTIONS $CRIO_METRICS_OPTIONS (code=exited, status=1/FAILURE)
   Main PID: 8439 (code=exited, status=1/FAILURE)
        CPU: 151ms

  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: Starting Open Container Initiative Daemon...
  Oct 24 11:11:35 ip-10-0-134-243 crio[8439]: time="2019-10-24 11:11:35.238163016Z" level=fatal msg="opening seccomp profile (/etc/crio/seccomp.json) failed: open /etc/crio/seccomp.json: no such file or directory"
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: crio.service: Main process exited, code=exited, status=1/FAILURE
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: crio.service: Failed with result 'exit-code'.
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: Failed to start Open Container Initiative Daemon.
  Oct 24 11:11:35 ip-10-0-134-243 systemd[1]: crio.service: Consumed 151ms CPU time
  WARNING control-plane/10.0.157.61 had failing systemd units: crio.service
  WARNING control-plane/10.0.157.61: crio.service:
  ● crio.service - Open Container Initiative Daemon
     Loaded: loaded (/usr/lib/systemd/system/crio.service; disabled; vendor preset: disabled)
    Drop-In: /etc/systemd/system/crio.service.d
             └─10-default-env.conf
     Active: failed (Result: exit-code) since Thu 2019-10-24 11:11:36 UTC; 1s ago
       Docs: https://github.com/cri-o/cri-o
    Process: 8379 ExecStart=/usr/bin/crio $CRIO_STORAGE_OPTIONS $CRIO_NETWORK_OPTIONS $CRIO_METRICS_OPTIONS (code=exited, status=1/FAILURE)
   Main PID: 8379 (code=exited, status=1/FAILURE)
        CPU: 158ms

  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: Starting Open Container Initiative Daemon...
  Oct 24 11:11:36 ip-10-0-157-61 crio[8379]: time="2019-10-24 11:11:36.807284677Z" level=fatal msg="opening seccomp profile (/etc/crio/seccomp.json) failed: open /etc/crio/seccomp.json: no such file or directory"
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: crio.service: Main process exited, code=exited, status=1/FAILURE
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: crio.service: Failed with result 'exit-code'.
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: Failed to start Open Container Initiative Daemon.
  Oct 24 11:11:36 ip-10-0-157-61 systemd[1]: crio.service: Consumed 158ms CPU time

That's maybe a bit noisy, but mostly because all three control-plane
machines failed the same way.

Journal field docs are linked from the Go docstrings, but if these
evolve later, the ones I'm using are from [2], except for UNIT, where I have opened an upstream documentation request [3].

It might be worth exposing this as:

  $ openshift-install analyze bootstrap PATH

so folks could look at bootstrap logs which had been gathered by
third-parties, but I'm punting on that for now.

[1]: https://storage.googleapis.com/origin-ci-test/logs/release-promote-openshift-machine-os-content-e2e-aws-4.3/2455/artifacts/e2e-aws/installer/log-bundle-20191024111122.tar
[2]: https://github.com/systemd/systemd/blob/v246/man/systemd.journal-fields.xml
[3]: systemd/systemd#17538
  • Loading branch information
wking committed Nov 6, 2020
1 parent 26cf75e commit fe43889
Show file tree
Hide file tree
Showing 6 changed files with 1,123 additions and 0 deletions.
300 changes: 300 additions & 0 deletions cmd/openshift-install/analyze.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
package main

import (
"archive/tar"
"bufio"
"compress/gzip"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"regexp"
"sort"
"strings"

"github.com/sirupsen/logrus"
"k8s.io/utils/pointer"

"github.com/openshift/installer/pkg/systemd"
)

var failedUnitRegexp = regexp.MustCompile(`^[*] (?P<unit>[^ ]*) .*`)
var failedSSHRegexp = regexp.MustCompile(`(?P<all>ssh: connect to host (?P<host>[\d.]+) port (?P<port>\d+): (?P<error>.*))$`)

type unit struct {
// Name is the name of the unit.
Name string

// State is the unit state
State string

// Detail is a detailed explaination of the current unit state.
Detail string

// Log has systemd logs from the unit.
Log *systemd.Log
}

func (unit *unit) Render(host string) {
logrus.Warnf("%s had %s systemd unit %s", host, unit.State, unit.Name)
if unit.Detail != "" {
logrus.Warnf(unit.Detail)
}

if unit.Log != nil && unit.Log.Restarts(unit.Name) > 0 {
for _, line := range unit.Log.Format(unit.Name, -2) {
logrus.Warn(line)
}
}
}

type host struct {
// Name is an identifier for the host.
Name string

// Role is an identifier for the host's role (e.g. 'control-plane').
Role string

// AccessError, if non-empty, is the error from a failed attempt to
// SSH into the host.
AccessError string

// Units is a set of systemd units from the host.
Units map[string]*unit
}

func (host *host) Render() {
name := host.Name
if host.Role != "" && host.Role != host.Name {
name = fmt.Sprintf("%s %s", host.Role, name)
}

if host.AccessError != "" {
logrus.Warnf("%s access error: %s", name, host.AccessError)
return
}

if len(host.Units) == 0 {
logrus.Infof("%s had no failing or restarting systemd units", name)
return
}

units := make([]string, 0, len(host.Units))
for unitKey := range host.Units {
units = append(units, unitKey)
}
sort.Strings(units)

logrus.Warnf("%s had failing or restarting systemd units: %s", name, strings.Join(units, ", "))
for _, unitKey := range units {
host.Units[unitKey].Render(name)
}
}

type bootstrapLogBundle struct {
// AccessError, if non-empty, is the error from a failed attempt to
// SSH into the host.
AccessError string

// Hosts is a set of hosts referened from the bundle.
Hosts map[string]*host

// Log is the output of the SSH gather script
Log string
}

func newBootstrapLogBundle(r io.Reader) (*bootstrapLogBundle, error) {
tarReader := tar.NewReader(r)
bundle := &bootstrapLogBundle{
Hosts: map[string]*host{},
}
for {
header, err := tarReader.Next()
if err == io.EOF {
break
}
if err != nil {
return bundle, err
}

baseName := path.Base(header.Name)

hostKey := "bootstrap"
role := "bootstrap"
var accessError *string
if strings.Contains(header.Name, "/control-plane/") {
segments := strings.Split(header.Name, "/")
for i, segment := range segments {
if segment == "control-plane" && len(segments) > i && segments[i+1] != "" {
hostKey = segments[i+1]
role = "control-plane"
if len(segments) > i+1 && segments[i+2] != "" {
accessError = pointer.StringPtr("")
} else {
accessError = pointer.StringPtr("failed to SSH into host")
}
break
}
}
}

if _, ok := bundle.Hosts[hostKey]; !ok {
bundle.Hosts[hostKey] = &host{Name: hostKey, Role: role}
}
host := bundle.Hosts[hostKey]
if accessError != nil {
host.AccessError = *accessError
}

if strings.HasSuffix(header.Name, "/gather.log") {
data, err := ioutil.ReadAll(tarReader)
if err != nil {
return bundle, fmt.Errorf("reading %q: %w", header.Name, err)
}
bundle.Log = string(data)
}

if baseName == "failed-units.txt" {
if host.Units == nil {
host.Units = map[string]*unit{}
}
scanner := bufio.NewScanner(tarReader)
for scanner.Scan() {
line := scanner.Text()
matches := failedUnitRegexp.FindStringSubmatch(line)
if matches == nil {
continue
}

for i, name := range failedUnitRegexp.SubexpNames() {
if name == "unit" {
if u, ok := bundle.Hosts[hostKey].Units[matches[i]]; ok {
u.State = "failed"
} else {
bundle.Hosts[hostKey].Units[matches[i]] = &unit{
Name: matches[i],
State: "failed",
}
}
break
}
}
}
if err := scanner.Err(); err != nil {
return bundle, fmt.Errorf("reading line from %q: %w", header.Name, err)
}
}

if path.Base(path.Dir(header.Name)) == "unit-status" {
var extension string
for _, ext := range []string{".txt", ".log.json"} {
if strings.HasSuffix(baseName, ext) {
extension = ext
break
}
}
if extension == "" {
continue
}

if host.Units == nil {
host.Units = map[string]*unit{}
}

unitKey := strings.TrimSuffix(baseName, extension) // crio.service.txt -> crio.service

if _, ok := host.Units[unitKey]; !ok {
host.Units[unitKey] = &unit{Name: unitKey}
}
u := host.Units[unitKey]
if u.State == "" {
u.State = "restarting" // for now, assume we're restarting unless we are failing via failed-units.txt
}

switch extension {
case ".txt":
data, err := ioutil.ReadAll(tarReader)
if err != nil {
return bundle, fmt.Errorf("reading %q: %w", header.Name, err)
}
u.Detail = string(data)
case ".log.json":
log, err := systemd.NewLog(tarReader)
if err != nil {
return bundle, fmt.Errorf("parsing systemd log JSON %q: %w", header.Name, err)
}
u.Log = log
}
}
}

scanner := bufio.NewScanner(strings.NewReader(bundle.Log))
for scanner.Scan() {
line := scanner.Text()
matches := failedSSHRegexp.FindStringSubmatch(line)
if matches == nil {
continue
}

groups := map[string]string{}
for i, name := range failedSSHRegexp.SubexpNames() {
groups[name] = matches[i]
}

if host, ok := bundle.Hosts[groups["host"]]; ok {
host.AccessError = strings.TrimSpace(groups["all"])
} else if bundle.AccessError == "" {
bundle.AccessError = strings.TrimSpace(groups["all"])
}
}
if err := scanner.Err(); err != nil {
return bundle, fmt.Errorf("reading line from gather log: %w", err)
}

return bundle, nil
}

func (bundle *bootstrapLogBundle) Render() {
if bundle.AccessError != "" {
logrus.Warnf("bootstrap access error: %s", bundle.AccessError)
return
}

hosts := make([]string, 0, len(bundle.Hosts))
for hostKey := range bundle.Hosts {
hosts = append(hosts, hostKey)
}
sort.Strings(hosts)

for _, hostKey := range hosts {
bundle.Hosts[hostKey].Render()
}

if len(hosts) <= 1 {
logrus.Warnf("no control-plane machines in the gathered tarball")
}
}

func analyzeGatheredBootstrap(tarPath string) error {
file, err := os.Open(tarPath)
if err != nil {
return err
}
defer file.Close()

gzipReader, err := gzip.NewReader(file)
if err != nil {
return err
}
defer gzipReader.Close()

bundle, err := newBootstrapLogBundle(gzipReader)
if err != nil {
return fmt.Errorf("parsing bootstrap log bundle %s: %w", tarPath, err)
}

bundle.Render()

return nil
}
5 changes: 5 additions & 0 deletions cmd/openshift-install/gather.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,16 @@ func logGatherBootstrap(bootstrap string, port int, masters []string, directory
if err := ssh.PullFileTo(client, fmt.Sprintf("/home/core/log-bundle-%s.tar.gz", gatherID), file); err != nil {
return errors.Wrap(err, "failed to pull log file from remote")
}

path, err := filepath.Abs(file)
if err != nil {
return errors.Wrap(err, "failed to stat log file")
}
logrus.Infof("Bootstrap gather logs captured here %q", path)

if err := analyzeGatheredBootstrap(path); err != nil {
return errors.Wrapf(err, "failed to analyze %q", file)
}
return nil
}

Expand Down
Loading

0 comments on commit fe43889

Please sign in to comment.