From d9f495a09391283910e4857482bfc6950d61388e Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Tue, 19 Mar 2024 01:06:01 +0530 Subject: [PATCH] collector/filesystem: `s/MNT_NOWAIT/MNT_WAIT` `getfsstat(2)` spec mentions that using `MNT_NOWAIT` will return the information it has available without requesting an update from each file system. Hence, use `MNT_WAIT` in place of the earlier used mode, and make changes to the affected collectors to avoid being stuck for long intervals. Fixes: #1498 Signed-off-by: Pranshu Srivastava --- collector/filesystem_bsd.go | 26 ++++-- collector/filesystem_common.go | 3 + collector/filesystem_freebsd.go | 38 +++++++-- collector/filesystem_linux.go | 138 ++++++++++++++------------------ collector/filesystem_openbsd.go | 44 +++++++--- 5 files changed, 146 insertions(+), 103 deletions(-) diff --git a/collector/filesystem_bsd.go b/collector/filesystem_bsd.go index d3025a0171..ffbf4bb8d5 100644 --- a/collector/filesystem_bsd.go +++ b/collector/filesystem_bsd.go @@ -19,6 +19,7 @@ package collector import ( "errors" + "time" "unsafe" "github.com/go-kit/log/level" @@ -38,17 +39,28 @@ const ( readOnly = 0x1 // MNT_RDONLY ) -// Expose filesystem fullness. +// GetStats exposes filesystem fullness. func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) { - var mntbuf *C.struct_statfs - count := C.getmntinfo(&mntbuf, C.MNT_NOWAIT) - if count == 0 { - return nil, errors.New("getmntinfo() failed") + // `getmntinfo` relies on `getfsstat` in some variants, and is blocking in general. + count := 0 + countCh := make(chan int, 1) + var mountBuf *C.struct_statfs + go func(mountBuf **C.struct_statfs) { + countCh <- int(C.getmntinfo(mountBuf, C.MNT_WAIT)) + close(countCh) + }(&mountBuf) + select { + case count = <-countCh: + if count <= 0 { + return nil, errors.New("getmntinfo failed") + } + case <-time.After(*mountTimeout): + return nil, errors.New("getmntinfo timed out") } - mnt := (*[1 << 20]C.struct_statfs)(unsafe.Pointer(mntbuf)) + mnt := (*[1 << 20]C.struct_statfs)(unsafe.Pointer(mountBuf)) stats = []filesystemStats{} - for i := 0; i < int(count); i++ { + for i := 0; i < count; i++ { mountpoint := C.GoString(&mnt[i].f_mntonname[0]) if c.excludedMountPointsPattern.MatchString(mountpoint) { level.Debug(c.logger).Log("msg", "Ignoring mount point", "mountpoint", mountpoint) diff --git a/collector/filesystem_common.go b/collector/filesystem_common.go index f5d5135241..94a53d4c1f 100644 --- a/collector/filesystem_common.go +++ b/collector/filesystem_common.go @@ -46,6 +46,9 @@ var ( "collector.filesystem.ignored-mount-points", "Regexp of mount points to ignore for filesystem collector.", ).Hidden().String() + mountTimeout = kingpin.Flag("collector.filesystem.mount-timeout", + "how long to wait for a mount to respond before marking it as stale"). + Hidden().Default("5s").Duration() fsTypesExcludeSet bool fsTypesExclude = kingpin.Flag( diff --git a/collector/filesystem_freebsd.go b/collector/filesystem_freebsd.go index f05702d14f..d8397b84b7 100644 --- a/collector/filesystem_freebsd.go +++ b/collector/filesystem_freebsd.go @@ -28,15 +28,41 @@ const ( // Expose filesystem fullness. func (c *filesystemCollector) GetStats() ([]filesystemStats, error) { - n, err := unix.Getfsstat(nil, unix.MNT_NOWAIT) - if err != nil { + var mountPointCount int + nChan := make(chan int, 1) + errChan := make(chan error, 1) + go func() { + var err error + var n int + n, err = unix.Getfsstat(nil, unix.MNT_WAIT) + if err != nil { + errChan <- err + return + } + nChan <- n + }() + select { + case mountPointCount = <-nChan: + case err := <-errChan: return nil, err + case <-time.After(*mountTimeout): + return nil, errors.New("getfsstat timed out") } - buf := make([]unix.Statfs_t, n) - _, err = unix.Getfsstat(buf, unix.MNT_NOWAIT) - if err != nil { - return nil, err + + buf := make([]unix.Statfs_t, mountPointCount) + go func(buf []unix.Statfs_t) { + _, err = unix.Getfsstat(buf, unix.MNT_WAIT) + errChan <- err + }(buf) + select { + case err := <-errChan: + if err != nil { + return nil, err + } + case <-time.After(*mountTimeout): + return nil, errors.New("getfsstat timed out") } + stats := []filesystemStats{} for _, fs := range buf { mountpoint := unix.ByteSliceToString(fs.Mntonname[:]) diff --git a/collector/filesystem_linux.go b/collector/filesystem_linux.go index 1d0c8493b9..691b536fdd 100644 --- a/collector/filesystem_linux.go +++ b/collector/filesystem_linux.go @@ -37,114 +37,115 @@ const ( defFSTypesExcluded = "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" ) -var mountTimeout = kingpin.Flag("collector.filesystem.mount-timeout", - "how long to wait for a mount to respond before marking it as stale"). - Hidden().Default("5s").Duration() var statWorkerCount = kingpin.Flag("collector.filesystem.stat-workers", "how many stat calls to process simultaneously"). Hidden().Default("4").Int() -var stuckMounts = make(map[string]struct{}) -var stuckMountsMtx = &sync.Mutex{} +var stuckMountsMap = make(map[string]struct{}) +var stuckMountsMutex = &sync.Mutex{} // GetStats returns filesystem stats. func (c *filesystemCollector) GetStats() ([]filesystemStats, error) { - mps, err := mountPointDetails(c.logger) + fsLabels, err := mountPointDetails(c.logger) if err != nil { return nil, err } - stats := []filesystemStats{} - labelChan := make(chan filesystemLabels) - statChan := make(chan filesystemStats) + fsStats := []filesystemStats{} + fsLabelChan := make(chan filesystemLabels) + fsStatChan := make(chan filesystemStats) wg := sync.WaitGroup{} - workerCount := *statWorkerCount if workerCount < 1 { workerCount = 1 } - for i := 0; i < workerCount; i++ { wg.Add(1) go func() { defer wg.Done() - for labels := range labelChan { - statChan <- c.processStat(labels) + for fsLabel := range fsLabelChan { + fsStatChan <- c.processStat(fsLabel) } }() } go func() { - for _, labels := range mps { - if c.excludedMountPointsPattern.MatchString(labels.mountPoint) { - level.Debug(c.logger).Log("msg", "Ignoring mount point", "mountpoint", labels.mountPoint) + for _, fsLabel := range fsLabels { + if c.excludedMountPointsPattern.MatchString(fsLabel.mountPoint) { + level.Debug(c.logger).Log("msg", "Ignoring mount point", "mountpoint", fsLabel.mountPoint) continue } - if c.excludedFSTypesPattern.MatchString(labels.fsType) { - level.Debug(c.logger).Log("msg", "Ignoring fs", "type", labels.fsType) - continue - } - - stuckMountsMtx.Lock() - if _, ok := stuckMounts[labels.mountPoint]; ok { - labels.deviceError = "mountpoint timeout" - stats = append(stats, filesystemStats{ - labels: labels, - deviceError: 1, - }) - level.Debug(c.logger).Log("msg", "Mount point is in an unresponsive state", "mountpoint", labels.mountPoint) - stuckMountsMtx.Unlock() + if c.excludedFSTypesPattern.MatchString(fsLabel.fsType) { + level.Debug(c.logger).Log("msg", "Ignoring fs", "type", fsLabel.fsType) continue } - - stuckMountsMtx.Unlock() - labelChan <- labels + fsLabelChan <- fsLabel } - close(labelChan) + close(fsLabelChan) wg.Wait() - close(statChan) + close(fsStatChan) }() - for stat := range statChan { - stats = append(stats, stat) + for fsStat := range fsStatChan { + fsStats = append(fsStats, fsStat) } - return stats, nil + return fsStats, nil } -func (c *filesystemCollector) processStat(labels filesystemLabels) filesystemStats { +func (c *filesystemCollector) processStat(fsLabel filesystemLabels) filesystemStats { var ro float64 - for _, option := range strings.Split(labels.options, ",") { + for _, option := range strings.Split(fsLabel.options, ",") { if option == "ro" { ro = 1 break } } - success := make(chan struct{}) - go stuckMountWatcher(labels.mountPoint, success, c.logger) - + // If the mount point is stuck, mark it as such and return early. + // This is done to avoid blocking the stat call indefinitely. + // NOTE: For instance, this can happen when an NFS mount is unreachable. buf := new(unix.Statfs_t) - err := unix.Statfs(rootfsFilePath(labels.mountPoint), buf) - stuckMountsMtx.Lock() - close(success) - - // If the mount has been marked as stuck, unmark it and log it's recovery. - if _, ok := stuckMounts[labels.mountPoint]; ok { - level.Debug(c.logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", labels.mountPoint) - delete(stuckMounts, labels.mountPoint) + statFsErrChan := make(chan error, 1) + go func(buf *unix.Statfs_t) { + statFsErrChan <- unix.Statfs(rootfsFilePath(fsLabel.mountPoint), buf) + close(statFsErrChan) + }(buf) + + select { + case err := <-statFsErrChan: + if err != nil { + level.Debug(c.logger).Log("msg", "Error on statfs() system call", "rootfs", rootfsFilePath(fsLabel.mountPoint), "err", err) + fsLabel.deviceError = err.Error() + } + case <-time.After(*mountTimeout): + stuckMountsMutex.Lock() + if _, ok := stuckMountsMap[fsLabel.mountPoint]; ok { + level.Debug(c.logger).Log("msg", "Mount point timed out, it is being labeled as stuck and will not be monitored", "mountpoint", fsLabel.mountPoint) + stuckMountsMap[fsLabel.mountPoint] = struct{}{} + fsLabel.deviceError = "mountpoint timeout" + } + stuckMountsMutex.Unlock() } - stuckMountsMtx.Unlock() - if err != nil { - labels.deviceError = err.Error() - level.Debug(c.logger).Log("msg", "Error on statfs() system call", "rootfs", rootfsFilePath(labels.mountPoint), "err", err) + // Check if the mount point has recovered and remove it from the stuck map. + if _, isOpen := <-statFsErrChan; !isOpen { + stuckMountsMutex.Lock() + if _, ok := stuckMountsMap[fsLabel.mountPoint]; ok { + level.Debug(c.logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", fsLabel.mountPoint) + delete(stuckMountsMap, fsLabel.mountPoint) + } + stuckMountsMutex.Unlock() + } + + // If the mount point is stuck or statfs errored, mark it as such and return. + if fsLabel.deviceError != "" { return filesystemStats{ - labels: labels, + labels: fsLabel, deviceError: 1, ro: ro, } } return filesystemStats{ - labels: labels, + labels: fsLabel, size: float64(buf.Blocks) * float64(buf.Bsize), free: float64(buf.Bfree) * float64(buf.Bsize), avail: float64(buf.Bavail) * float64(buf.Bsize), @@ -154,29 +155,6 @@ func (c *filesystemCollector) processStat(labels filesystemLabels) filesystemSta } } -// stuckMountWatcher listens on the given success channel and if the channel closes -// then the watcher does nothing. If instead the timeout is reached, the -// mount point that is being watched is marked as stuck. -func stuckMountWatcher(mountPoint string, success chan struct{}, logger log.Logger) { - mountCheckTimer := time.NewTimer(*mountTimeout) - defer mountCheckTimer.Stop() - select { - case <-success: - // Success - case <-mountCheckTimer.C: - // Timed out, mark mount as stuck - stuckMountsMtx.Lock() - select { - case <-success: - // Success came in just after the timeout was reached, don't label the mount as stuck - default: - level.Debug(logger).Log("msg", "Mount point timed out, it is being labeled as stuck and will not be monitored", "mountpoint", mountPoint) - stuckMounts[mountPoint] = struct{}{} - } - stuckMountsMtx.Unlock() - } -} - func mountPointDetails(logger log.Logger) ([]filesystemLabels, error) { file, err := os.Open(procFilePath("1/mounts")) if errors.Is(err, os.ErrNotExist) { diff --git a/collector/filesystem_openbsd.go b/collector/filesystem_openbsd.go index 1c1e479e1f..07e86b5f01 100644 --- a/collector/filesystem_openbsd.go +++ b/collector/filesystem_openbsd.go @@ -27,20 +27,44 @@ const ( ) // Expose filesystem fullness. -func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) { - var mnt []unix.Statfs_t - size, err := unix.Getfsstat(mnt, unix.MNT_NOWAIT) - if err != nil { - return nil, err +func (c *filesystemCollector) GetStats() (stats []filesystemStats, fsstatErr error) { + var mountPointCount int + nChan := make(chan int, 1) + errChan := make(chan error, 1) + go func() { + var statErr error + var n int + n, statErr = unix.Getfsstat(nil, unix.MNT_WAIT) + if statErr != nil { + errChan <- statErr + return + } + nChan <- n + }() + select { + case mountPointCount = <-nChan: + case statErr := <-errChan: + return nil, statErr + case <-time.After(*mountTimeout): + return nil, errors.New("getfsstat timed out") } - mnt = make([]unix.Statfs_t, size) - _, err = unix.Getfsstat(mnt, unix.MNT_NOWAIT) - if err != nil { - return nil, err + + buf := make([]unix.Statfs_t, mountPointCount) + go func(buf []unix.Statfs_t) { + _, fsstatErr = unix.Getfsstat(buf, unix.MNT_WAIT) + errChan <- fsstatErr + }(buf) + select { + case err := <-errChan: + if err != nil { + return nil, err + } + case <-time.After(*mountTimeout): + return nil, errors.New("getfsstat timed out") } stats = []filesystemStats{} - for _, v := range mnt { + for _, v := range buf { mountpoint := unix.ByteSliceToString(v.F_mntonname[:]) if c.excludedMountPointsPattern.MatchString(mountpoint) { level.Debug(c.logger).Log("msg", "Ignoring mount point", "mountpoint", mountpoint)