Skip to content

Commit

Permalink
filesystem: fix mountTimeout not working issue (prometheus#2903)
Browse files Browse the repository at this point in the history
Signed-off-by: DongWei <jiangxuege@hotmail.com>
Signed-off-by: Vitaly Zhuravlev <v-zhuravlev@users.noreply.github.com>
  • Loading branch information
DongWei-4 authored and v-zhuravlev committed Nov 1, 2024
1 parent c9c01f7 commit 8cf91ed
Showing 1 changed file with 13 additions and 9 deletions.
22 changes: 13 additions & 9 deletions collector/filesystem_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,8 @@ func (c *filesystemCollector) processStat(labels filesystemLabels) filesystemSta

buf := new(unix.Statfs_t)
err := unix.Statfs(rootfsFilePath(labels.mountPoint), buf)
stuckMountsMtx.Lock()
close(success)

// If the mount has been marked as stuck, unmark it and log its recovery.
if _, ok := stuckMounts[labels.mountPoint]; ok {
level.Debug(c.logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", labels.mountPoint)
delete(stuckMounts, labels.mountPoint)
}
stuckMountsMtx.Unlock()

if err != nil {
level.Debug(c.logger).Log("msg", "Error on statfs() system call", "rootfs", rootfsFilePath(labels.mountPoint), "err", err)
return filesystemStats{
Expand Down Expand Up @@ -161,17 +153,29 @@ func stuckMountWatcher(mountPoint string, success chan struct{}, logger log.Logg
select {
case <-success:
// Success
// If the mount has been marked as stuck, unmark it and log its recovery.
stuckMountsMtx.Lock()
defer stuckMountsMtx.Unlock()
if _, ok := stuckMounts[mountPoint]; ok {
level.Debug(logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", mountPoint)
delete(stuckMounts, mountPoint)
}
case <-mountCheckTimer.C:
// Timed out, mark mount as stuck
stuckMountsMtx.Lock()
defer stuckMountsMtx.Unlock()
select {
case <-success:
// Success came in just after the timeout was reached, don't label the mount as stuck
// If the mount has been marked as stuck, unmark it and log its recovery.
if _, ok := stuckMounts[mountPoint]; ok {
level.Debug(logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", mountPoint)
delete(stuckMounts, mountPoint)
}
default:
level.Debug(logger).Log("msg", "Mount point timed out, it is being labeled as stuck and will not be monitored", "mountpoint", mountPoint)
stuckMounts[mountPoint] = struct{}{}
}
stuckMountsMtx.Unlock()
}
}

Expand Down

0 comments on commit 8cf91ed

Please sign in to comment.