Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write the whole dmesg of the crashed kernel to the /persist/reboot-stack #2961

Merged
merged 8 commits into from
Dec 13, 2022
6 changes: 4 additions & 2 deletions docs/KERNEL-DUMPS.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ EVE-OS has a `kdump` container that checks for `/proc/vmcore` and generates a mi
* Minimal kernel dump will be generated in the `/persist/kcrashes` folder.
* Dmesg (kernel ring buffer) of the crashed kernel will be saved in the `/persist/kcrashes` folder.
* Only 5 fresh kernel dumps and dmesgs are stored in the folder, old files are deleted.
* Kernel panic message from the crashed system buffer is redirected to the EVE-OS /persist/reboot-stack file and to the tty console in order to show it on the connected monitor for debug purposes.
* Reboot reason string "kernel panic, kdump collected: $KDUMP_PATH" is redirected to the EVE-OS /persist/reboot-reason file.
* The whole dmesg from the crashed system buffer is written to the EVE-OS `/persist/reboot-stack` file.
* Only the kernel panic message from the crashed system buffer is redirected to the tty console, in order to show it on the connected monitor for debugging purposes.
* Boot reason string "BootReasonKernel" is written to the EVE-OS `/persist/boot-reason` file.
* Reboot reason string "kernel panic, kdump collected: $KDUMP_PATH" is written to the EVE-OS `/persist/reboot-reason` file.
* After the crash dump is created, the kernel will be rebooted according to the `/proc/sys/kernel/panic` timeout configuration value.

As was mentioned earlier `makedumpfile` generates minimal kernel dump which excludes zeroed, cache, private, free and user-space memory pages. This is done not only to minimize the resulting dump file, but also for security reasons: customer data does not leak.
Expand Down
2 changes: 1 addition & 1 deletion pkg/kdump/kdump.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ if test -f /proc/vmcore; then

# Prepare reboot-reason, reboot-stack and boot-reason
echo "kernel panic, kdump collected: $KDUMP_PATH" > /persist/reboot-reason
cat /tmp/backtrace > /persist/reboot-stack
cat /tmp/dmesg > /persist/reboot-stack
echo "BootReasonKernel" > /persist/boot-reason

# Simulate the default reboot after panic kernel behaviour
Expand Down
49 changes: 13 additions & 36 deletions pkg/newlog/cmd/newlogd.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ const (
uploadDevDir = types.NewlogUploadDevDir
uploadAppDir = types.NewlogUploadAppDir
keepSentDir = types.NewlogKeepSentQueueDir
failSendDIr = types.NewlogDir + "/failedUpload"
failSendDir = types.NewlogDir + "/failedUpload"
panicFileDir = types.NewlogDir + "/panicStacks"
symlinkFile = collectDir + "/current.device.log"
tmpSymlink = collectDir + "/tmp-sym.dev.log"
Expand Down Expand Up @@ -910,39 +910,15 @@ func writelogEntry(stats *statsLogFile, logline string) int {
return len
}

// logDirectoryType enumerates the directories in which newlogd keeps
// gzipped log files.
type logDirectoryType int

const (
	keepSentDirType logDirectoryType = iota
	uploadAppDirType
	uploadDevDirType
	failSendDIrType
)

// getPath resolves a directory type to its filesystem path. An
// unrecognized value resolves to the empty string (the zero value of
// a missing map key).
func (dirType logDirectoryType) getPath() string {
	dirPaths := map[logDirectoryType]string{
		keepSentDirType:  keepSentDir,
		uploadAppDirType: uploadAppDir,
		uploadDevDirType: uploadDevDir,
		failSendDIrType:  failSendDIr,
	}
	return dirPaths[dirType]
}

// gfileStats records metadata for one gzipped log file found on disk;
// used by the keep-quota logic for size accounting and deletion.
type gfileStats struct {
isSent bool // whether the file was already uploaded (set from alreadySent) -- TODO confirm against caller
dirType logDirectoryType // directory enum the file was found in -- NOTE(review): diff shows this field being replaced by logDir
logDir string // path of the directory the file was found in
filename string // base name of the gzip file
filesize int64 // file size in bytes
}

func checkDirGZFiles(sfiles map[string]gfileStats, dirType logDirectoryType) ([]string, int64, error) {
func checkDirGZFiles(sfiles map[string]gfileStats, logDir string) ([]string, int64, error) {
var sizes int64
logDir := dirType.getPath()
dir, err := os.Open(logDir)
if err != nil {
if os.IsNotExist(err) {
Expand Down Expand Up @@ -978,7 +954,7 @@ func checkDirGZFiles(sfiles map[string]gfileStats, dirType logDirectoryType) ([]
filename: fname,
filesize: fsize,
isSent: alreadySent,
dirType: dirType,
logDir: logDir,
}
sizes += fsize
fname2 := strings.TrimSuffix(fname, ".gz")
Expand All @@ -999,19 +975,19 @@ func checkKeepQuota() {
maxSize := int64(limitGzipFilesMbyts * 1000000)
sfiles := make(map[string]gfileStats)

key0, size0, err := checkDirGZFiles(sfiles, keepSentDirType)
key0, size0, err := checkDirGZFiles(sfiles, keepSentDir)
if err != nil {
log.Errorf("checkKeepQuota: keepSentDir %v", err)
}
key1, size1, err := checkDirGZFiles(sfiles, uploadAppDirType)
key1, size1, err := checkDirGZFiles(sfiles, uploadAppDir)
if err != nil {
log.Errorf("checkKeepQuota: AppDir %v", err)
}
key2, size2, err := checkDirGZFiles(sfiles, uploadDevDirType)
key2, size2, err := checkDirGZFiles(sfiles, uploadDevDir)
if err != nil {
log.Errorf("checkKeepQuota: DevDir %v", err)
}
key3, size3, err := checkDirGZFiles(sfiles, failSendDIrType)
key3, size3, err := checkDirGZFiles(sfiles, failSendDir)
if err != nil && !os.IsNotExist(err) {
log.Errorf("checkKeepQuota: FailToSendDir %v", err)
}
Expand All @@ -1034,7 +1010,7 @@ func checkKeepQuota() {
continue
}
fs := sfiles[key]
filePath := filepath.Join(fs.dirType.getPath(), fs.filename)
filePath := filepath.Join(fs.logDir, fs.filename)
if _, err := os.Stat(filePath); err != nil {
continue
}
Expand Down Expand Up @@ -1622,7 +1598,7 @@ func getPersistSpace() uint64 {
func getSyslogMsg(loggerChan chan inputEntry) {

sysfmt := regexp.MustCompile("<([0-9]+)>(.{15}|.{25}) (.*?): (.*)")
conn, err := listenUnix()
conn, err := listenDevLog()
if err != nil {
log.Error(err)
return
Expand Down Expand Up @@ -1650,8 +1626,9 @@ func getSyslogMsg(loggerChan chan inputEntry) {
}
}

//func listenUnix() (net.PacketConn, error) {
func listenUnix() (*net.UnixConn, error) {
// listenDevLog() - substitute /dev/log with our AF_UNIX socket and open it
// for listening
func listenDevLog() (*net.UnixConn, error) {
UnixPath := "/dev/log"
os.Remove(UnixPath)
a, err := net.ResolveUnixAddr("unixgram", UnixPath)
Expand Down
20 changes: 12 additions & 8 deletions pkg/pillar/agentlog/agentlog.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ const (
stackFile = "reboot-stack"
rebootImage = "reboot-image"
bootReasonFile = "boot-reason"
maxReadSize = 16384 // From files in /persist
readSize16k = 16 << 10 // From files in /persist
readSize512k = 512 << 10 // Kernel dmesg
)

var savedRebootReason = "unknown"
Expand Down Expand Up @@ -335,7 +336,7 @@ func RebootReason(reason string, bootReason types.BootReason, agentName string,
if bootReason != types.BootReasonNone {
filename = "/persist/" + bootReasonFile
brString := bootReason.String()
b, _ := fileutils.ReadWithMaxSize(nil, filename, maxReadSize)
b, _ := fileutils.ReadWithMaxSize(nil, filename, readSize16k)
if len(b) != 0 {
// Note: can not use log here since we are called from a log hook!
fmt.Printf("not replacing BootReason %s with %s\n",
Expand Down Expand Up @@ -375,27 +376,30 @@ func RebootStack(log *base.LogObject, stacks string, agentName string, agentPid
}

// GetRebootReason returns the RebootReason string together with
// its timestamp plus the reboot stack. We limit the size we read
// to 16k for the RebootReason and to 512k for the stack, because
// in case of a kernel crash the stack file contains the whole
// dmesg, whose kernel ring buffer is limited to some reasonable
// value below 512k.
func GetRebootReason(log *base.LogObject) (string, time.Time, string) {
	reasonFilename := fmt.Sprintf("%s/%s", types.PersistDir, reasonFile)
	stackFilename := fmt.Sprintf("%s/%s", types.PersistDir, stackFile)
	reason, ts, _ := fileutils.StatAndRead(log, reasonFilename, readSize16k)
	stack, _, _ := fileutils.StatAndRead(log, stackFilename, readSize512k)
	return reason, ts, stack
}

// GetBootReason returns the BootReason enum, which is stored as a
// string in /persist, together with its timestamp.
func GetBootReason(log *base.LogObject) (types.BootReason, time.Time) {
	reasonFilename := fmt.Sprintf("%s/%s", types.PersistDir, bootReasonFile)
	reason, ts, _ := fileutils.StatAndRead(log, reasonFilename, readSize16k)
	return types.BootReasonFromString(reason), ts
}

// GetRebootImage : Image from which the reboot happened, read from
// /persist (at most 16k bytes).
func GetRebootImage(log *base.LogObject) string {
	rebootFilename := fmt.Sprintf("%s/%s", types.PersistDir, rebootImage)
	image, _, _ := fileutils.StatAndRead(log, rebootFilename, readSize16k)
	return image
}

Expand Down
34 changes: 25 additions & 9 deletions pkg/pillar/cmd/nodeagent/nodeagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -523,14 +523,21 @@ func handleLastRebootReason(ctx *nodeagentContext) {
}

agentlog.DiscardBootReason(log)
// Make sure we log the reboot stack
// Make sure we log the reboot stack or dmesg
if len(rebootStack) > 0 {
log.Warnf("Found RebootStack %d bytes", len(rebootStack))
lines := strings.Split(rebootStack, "\n")
log.Warnf("Found RebootStack %d bytes %d lines",
len(rebootStack), len(lines))

// Tag each line for easy grep
var tag string
if bootReason == types.BootReasonKernel {
tag = "dmesg"
} else {
tag = "stack"
}
log.Warnf("[%s] found reboot-stack content: %d bytes %d lines",
tag, len(rebootStack), len(lines))
for i, l := range lines {
log.Warnf("[%d]: %s", i, l)
log.Warnf("[%s #%d]: %s", tag, i, l)
}
}
// still no rebootReason? set the default
Expand All @@ -554,7 +561,7 @@ func handleLastRebootReason(ctx *nodeagentContext) {
dateStr)
bootReason = types.BootReasonPowerFail
} else {
reason = fmt.Sprintf("Reboot reason - kernel crash - at %s",
reason = fmt.Sprintf("Reboot reason - kernel crash (no kdump) - at %s",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we also hit this code when we experience a hardware watchdog, maybe we should refine the string and also introduce a distinct BootReason enum for this case? If that makes sense to you, feel free to do it in a separate PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I correct that we can take this path in case of watchdog reboots?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a hardware watchdog then the power cycle counter from smartctl will not increase so you'll hit this code. The software watchdog "repair" script writes a BootReason to the file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, will take a look more thoroughly on that case.

dateStr)
bootReason = types.BootReasonKernel
}
Expand All @@ -577,10 +584,19 @@ func handleLastRebootReason(ctx *nodeagentContext) {
// if reboot stack size crosses max size, truncate
if len(rebootStack) > maxJSONAttributeSize {
runes := bytes.Runes([]byte(rebootStack))
if len(runes) > maxJSONAttributeSize {
runes = runes[:maxRebootStackSize]
sz := len(runes)
// Repeat the check for runes
if sz > maxJSONAttributeSize {
// Get the tail of the stack, because stack grows down and in
// case of truncation it is important to see the beginning of
// it (bottom) rather than the end (top).
eriknordmark marked this conversation as resolved.
Show resolved Hide resolved
runes = runes[sz-maxRebootStackSize : sz]
rebootStack = fmt.Sprintf("...\n%v", string(runes))
} else {
// It is quite possible that the byte slice size exceeds the max
// while the rune count does not. We ignore this and assume it is
// not so important.
}
rebootStack = fmt.Sprintf("Truncated stack: %v", string(runes))
}
rebootImage := agentlog.GetRebootImage(log)
if rebootImage != "" {
Expand Down
2 changes: 1 addition & 1 deletion pkg/pillar/types/zedagenttypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ const (
BootReasonOOM // OOM causing process to be killed
BootReasonWatchdogHung // Software watchdog due stuck agent
BootReasonWatchdogPid // Software watchdog due to e.g., golang panic
BootReasonKernel // TBD how we detect this
BootReasonKernel // Set by dump-capture kernel, see docs/KERNEL-DUMPS.md and pkg/kdump/kdump.sh for details
eriknordmark marked this conversation as resolved.
Show resolved Hide resolved
BootReasonPowerFail // Known power failure e.g., from disk controller S.M.A.R.T counter increase
BootReasonUnknown // Could be power failure, kernel panic, or hardware watchdog
BootReasonVaultFailure // Vault was not ready within the expected time
Expand Down
7 changes: 4 additions & 3 deletions pkg/pillar/utils/file/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ func ReadWithMaxSize(log *base.LogObject, filename string, maxReadSize int) ([]b
}
defer f.Close()
r := bufio.NewReader(f)
content := make([]byte, maxReadSize)
content := make([]byte, maxReadSize+1)
n, err := r.Read(content)
if err != nil {
err = fmt.Errorf("ReadWithMaxSize %s failed: %v", filename, err)
Expand All @@ -197,13 +197,14 @@ func ReadWithMaxSize(log *base.LogObject, filename string, maxReadSize int) ([]b
}
return nil, err
}
if n == maxReadSize {
if n > maxReadSize {
err = fmt.Errorf("ReadWithMaxSize %s truncated after %d bytes",
filename, maxReadSize)
n = maxReadSize
} else {
err = nil
}
return content[0:n], err
return content[:n], err
}

// ReadSavedCounter returns counter value from provided file
Expand Down