Skip to content

Commit

Permalink
Merge pull request #1682 from hashicorp/b-sanity-check-state-file
Browse files Browse the repository at this point in the history
Prevent state file corruption and check state file sanity on save
  • Loading branch information
schmichael committed Sep 13, 2016
2 parents 3e71719 + 78a4bef commit bbe884a
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 7 deletions.
15 changes: 8 additions & 7 deletions client/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ type AllocRunner struct {
destroyCh chan struct{}
destroyLock sync.Mutex
waitCh chan struct{}

// serialize saveAllocRunnerState calls
persistLock sync.Mutex
}

// allocRunnerState is used to snapshot the state of the alloc runner
Expand All @@ -74,7 +77,6 @@ type allocRunnerState struct {
Alloc *structs.Allocation
AllocClientStatus string
AllocClientDescription string
TaskStates map[string]*structs.TaskState
Context *driver.ExecContext
}

Expand Down Expand Up @@ -118,7 +120,7 @@ func (r *AllocRunner) RestoreState() error {
r.ctx = snap.Context
r.allocClientStatus = snap.AllocClientStatus
r.allocClientDescription = snap.AllocClientDescription
r.taskStates = snap.TaskStates
r.taskStates = snap.Alloc.TaskStates

var snapshotErrors multierror.Error
if r.alloc == nil {
Expand Down Expand Up @@ -179,12 +181,12 @@ func (r *AllocRunner) SaveState() error {
}

func (r *AllocRunner) saveAllocRunnerState() error {
// Create the snapshot.
r.taskStatusLock.RLock()
states := copyTaskStates(r.taskStates)
r.taskStatusLock.RUnlock()
r.persistLock.Lock()
defer r.persistLock.Unlock()

// Create the snapshot.
alloc := r.Alloc()

r.allocLock.Lock()
allocClientStatus := r.allocClientStatus
allocClientDescription := r.allocClientDescription
Expand All @@ -200,7 +202,6 @@ func (r *AllocRunner) saveAllocRunnerState() error {
Context: ctx,
AllocClientStatus: allocClientStatus,
AllocClientDescription: allocClientDescription,
TaskStates: states,
}
return persistState(r.stateFilePath(), &snap)
}
Expand Down
6 changes: 6 additions & 0 deletions client/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ type TaskRunner struct {
destroyLock sync.Mutex
destroyEvent *structs.TaskEvent
waitCh chan struct{}

// serialize SaveState calls
persistLock sync.Mutex
}

// taskRunnerState is used to snapshot the state of the task runner
Expand Down Expand Up @@ -186,6 +189,9 @@ func (r *TaskRunner) RestoreState() error {

// SaveState is used to snapshot our state
func (r *TaskRunner) SaveState() error {
r.persistLock.Lock()
defer r.persistLock.Unlock()

snap := taskRunnerState{
Task: r.task,
Version: r.config.Version,
Expand Down
7 changes: 7 additions & 0 deletions client/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ func persistState(path string, data interface{}) error {
if err := os.Rename(tmpPath, path); err != nil {
return fmt.Errorf("failed to rename tmp to path: %v", err)
}

// Sanity check since users have reported empty state files on disk
if stat, err := os.Stat(path); err != nil {
return fmt.Errorf("unable to stat state file %s: %v", path, err)
} else if stat.Size() == 0 {
return fmt.Errorf("persisted invalid state file %s; see https://github.com/hashicorp/nomad/issues/1367")
}
return nil
}

Expand Down

0 comments on commit bbe884a

Please sign in to comment.