Skip to content

Commit

Permalink
Changed the way that stuck mounts are handled. If a mount fails to re…
Browse files Browse the repository at this point in the history
…turn, it will stop being queried until it returns.

Fixed spelling mistakes.

Update transport_generic.go

Changed to a mutex approach instead of channels and added a timeout before declaring a mount stuck.

Removed unnecessary lock channel and clarified some var names.

Fixed style nits.

Signed-off-by: Mark Knapp <mknapp@hudson-trading.com>
  • Loading branch information
mknapphrt committed Jul 13, 2018
1 parent ac5a981 commit 7719ff4
Showing 1 changed file with 54 additions and 1 deletion.
55 changes: 54 additions & 1 deletion collector/filesystem_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ import (
"bufio"
"os"
"strings"
"sync"
"syscall"
"time"

"github.com/prometheus/common/log"
)
Expand All @@ -28,8 +30,12 @@ const (
defIgnoredMountPoints = "^/(dev|proc|sys|var/lib/docker)($|/)"
defIgnoredFSTypes = "^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$"
readOnly = 0x1 // ST_RDONLY
mountTimeout = 30 * time.Second
)

var stuckMounts = make(map[string]struct{})
var stuckMountsMtx = &sync.Mutex{}

// GetStats returns filesystem stats.
func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
mps, err := mountPointDetails()
Expand All @@ -46,9 +52,35 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
log.Debugf("Ignoring fs type: %s", labels.fsType)
continue
}
stuckMountsMtx.Lock()
if _, ok := stuckMounts[labels.mountPoint]; ok {
stats = append(stats, filesystemStats{
labels: labels,
deviceError: 1,
})
log.Debugf("Mount point %q is in an unresponsive state", labels.mountPoint)
stuckMountsMtx.Unlock()
continue
}
stuckMountsMtx.Unlock()

// The success channel is used to tell the "watcher" that the stat
// finished successfully. The channel is closed on success.
success := make(chan struct{})
go stuckMountWatcher(labels.mountPoint, success)

buf := new(syscall.Statfs_t)
err := syscall.Statfs(labels.mountPoint, buf)
err = syscall.Statfs(labels.mountPoint, buf)

stuckMountsMtx.Lock()
close(success)
// If the mount has been marked as stuck, unmark it and log its recovery.
if _, ok := stuckMounts[labels.mountPoint]; ok {
log.Debugf("Mount point %q has recovered, monitoring will resume", labels.mountPoint)
delete(stuckMounts, labels.mountPoint)
}
stuckMountsMtx.Unlock()

if err != nil {
stats = append(stats, filesystemStats{
labels: labels,
Expand Down Expand Up @@ -76,6 +108,27 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
return stats, nil
}

// stuckMountWatcher waits for the stat of mountPoint to finish. If the success
// channel is closed before mountTimeout elapses, the watcher returns without
// doing anything. If the timeout is reached first, mountPoint is recorded in
// stuckMounts, marking it as stuck.
func stuckMountWatcher(mountPoint string, success chan struct{}) {
	// Use an explicit timer instead of time.After so it can be stopped as soon
	// as the watcher returns. One watcher is started per stat call, and with
	// time.After every successful stat would leave a timer pending for the
	// full mountTimeout (on Go runtimes before 1.23 it is not collected until
	// it fires).
	timeout := time.NewTimer(mountTimeout)
	defer timeout.Stop()

	select {
	case <-success:
		// Success
	case <-timeout.C:
		// Timed out, mark mount as stuck
		stuckMountsMtx.Lock()
		defer stuckMountsMtx.Unlock()

		// Re-check the success channel while holding the lock: the channel may
		// have been closed between the timer firing and the lock being taken.
		select {
		case <-success:
			// Success came in just after the timeout was reached, don't label the mount as stuck
		default:
			log.Debugf("Mount point %q timed out, it is being labeled as stuck and will not be monitored", mountPoint)
			stuckMounts[mountPoint] = struct{}{}
		}
	}
}

func mountPointDetails() ([]filesystemLabels, error) {
file, err := os.Open(procFilePath("mounts"))
if err != nil {
Expand Down

0 comments on commit 7719ff4

Please sign in to comment.