diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index e2705a341793..00d76746422e 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -66,6 +66,9 @@ type AllocRunner struct {
 	destroyCh   chan struct{}
 	destroyLock sync.Mutex
 	waitCh      chan struct{}
+
+	// persistLock serializes saveAllocRunnerState calls.
+	persistLock sync.Mutex
 }
 
 // allocRunnerState is used to snapshot the state of the alloc runner
@@ -74,7 +77,6 @@ type allocRunnerState struct {
 	Alloc                  *structs.Allocation
 	AllocClientStatus      string
 	AllocClientDescription string
-	TaskStates             map[string]*structs.TaskState
 	Context                *driver.ExecContext
 }
 
@@ -118,7 +120,10 @@ func (r *AllocRunner) RestoreState() error {
 	r.ctx = snap.Context
 	r.allocClientStatus = snap.AllocClientStatus
 	r.allocClientDescription = snap.AllocClientDescription
-	r.taskStates = snap.TaskStates
+	// Guard against a snapshot without an allocation to avoid a nil dereference.
+	if snap.Alloc != nil {
+		r.taskStates = snap.Alloc.TaskStates
+	}
 
 	var snapshotErrors multierror.Error
 	if r.alloc == nil {
@@ -179,12 +184,12 @@ func (r *AllocRunner) SaveState() error {
 }
 
 func (r *AllocRunner) saveAllocRunnerState() error {
-	// Create the snapshot.
-	r.taskStatusLock.RLock()
-	states := copyTaskStates(r.taskStates)
-	r.taskStatusLock.RUnlock()
+	r.persistLock.Lock()
+	defer r.persistLock.Unlock()
 
+	// Create the snapshot.
 	alloc := r.Alloc()
+
 	r.allocLock.Lock()
 	allocClientStatus := r.allocClientStatus
 	allocClientDescription := r.allocClientDescription
@@ -200,7 +205,6 @@ func (r *AllocRunner) saveAllocRunnerState() error {
 		Context:                ctx,
 		AllocClientStatus:      allocClientStatus,
 		AllocClientDescription: allocClientDescription,
-		TaskStates:             states,
 	}
 	return persistState(r.stateFilePath(), &snap)
 }
diff --git a/client/task_runner.go b/client/task_runner.go
index 208f2ab25cd1..b68fbeb65c6c 100644
--- a/client/task_runner.go
+++ b/client/task_runner.go
@@ -69,6 +69,9 @@ type TaskRunner struct {
 	destroyLock  sync.Mutex
 	destroyEvent *structs.TaskEvent
 	waitCh       chan struct{}
+
+	// persistLock serializes SaveState calls.
+	persistLock sync.Mutex
 }
 
 // taskRunnerState is used to snapshot the state of the task runner
@@ -186,6 +189,9 @@ func (r *TaskRunner) RestoreState() error {
 
 // SaveState is used to snapshot our state
 func (r *TaskRunner) SaveState() error {
+	r.persistLock.Lock()
+	defer r.persistLock.Unlock()
+
 	snap := taskRunnerState{
 		Task:    r.task,
 		Version: r.config.Version,
diff --git a/client/util.go b/client/util.go
index 369b5f059613..ff2fa3d2559b 100644
--- a/client/util.go
+++ b/client/util.go
@@ -92,6 +92,13 @@ func persistState(path string, data interface{}) error {
 	if err := os.Rename(tmpPath, path); err != nil {
 		return fmt.Errorf("failed to rename tmp to path: %v", err)
 	}
+
+	// Sanity check since users have reported empty state files on disk
+	if stat, err := os.Stat(path); err != nil {
+		return fmt.Errorf("unable to stat state file %s: %v", path, err)
+	} else if stat.Size() == 0 {
+		return fmt.Errorf("persisted invalid state file %s; see https://github.com/hashicorp/nomad/issues/1367", path)
+	}
 	return nil
 }
 