From 6f2b09f67675bc1e2d487155ca39ca116ba508fb Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 1 Sep 2016 17:34:40 -0700 Subject: [PATCH 1/5] Add sanity check to SaveState Also just reuse the task states snapshot taken by `Alloc()` instead of doing a redundant copy. --- client/alloc_runner.go | 5 +---- client/util.go | 7 +++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index e2705a341793..d79374ef1ec6 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -180,12 +180,9 @@ func (r *AllocRunner) SaveState() error { func (r *AllocRunner) saveAllocRunnerState() error { // Create the snapshot. - r.taskStatusLock.RLock() - states := copyTaskStates(r.taskStates) - r.taskStatusLock.RUnlock() - alloc := r.Alloc() r.allocLock.Lock() + states := alloc.TaskStates allocClientStatus := r.allocClientStatus allocClientDescription := r.allocClientDescription r.allocLock.Unlock() diff --git a/client/util.go b/client/util.go index 369b5f059613..ff2fa3d2559b 100644 --- a/client/util.go +++ b/client/util.go @@ -92,6 +92,13 @@ func persistState(path string, data interface{}) error { if err := os.Rename(tmpPath, path); err != nil { return fmt.Errorf("failed to rename tmp to path: %v", err) } + + // Sanity check since users have reported empty state files on disk + if stat, err := os.Stat(path); err != nil { + return fmt.Errorf("unable to stat state file %s: %v", path, err) + } else if stat.Size() == 0 { + return fmt.Errorf("persisted invalid state file %s; see https://github.com/hashicorp/nomad/issues/1367") + } return nil } From 8d9468d4c303dfb6b507786b2eb956f7d14c25a1 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 2 Sep 2016 15:39:22 -0700 Subject: [PATCH 2/5] Lock around saving state Prevent interleaving state syncs as it could conceivably lead to empty state files as per #1367 --- client/alloc_runner.go | 7 +++++++ client/task_runner.go | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index d79374ef1ec6..ee7ab9c06e1b 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -66,6 +66,9 @@ type AllocRunner struct { destroyCh chan struct{} destroyLock sync.Mutex waitCh chan struct{} + + // serialize saveAllocRunnerState calls + persistLock sync.Mutex } // allocRunnerState is used to snapshot the state of the alloc runner @@ -179,8 +182,12 @@ func (r *AllocRunner) SaveState() error { } func (r *AllocRunner) saveAllocRunnerState() error { + r.persistLock.Lock() + defer r.persistLock.Unlock() + // Create the snapshot. alloc := r.Alloc() + r.allocLock.Lock() states := alloc.TaskStates allocClientStatus := r.allocClientStatus diff --git a/client/task_runner.go b/client/task_runner.go index 208f2ab25cd1..b68fbeb65c6c 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -69,6 +69,9 @@ type TaskRunner struct { destroyLock sync.Mutex destroyEvent *structs.TaskEvent waitCh chan struct{} + + // serialize SaveState calls + persistLock sync.Mutex } // taskRunnerState is used to snapshot the state of the task runner @@ -186,6 +189,9 @@ func (r *TaskRunner) RestoreState() error { // SaveState is used to snapshot our state func (r *TaskRunner) SaveState() error { + r.persistLock.Lock() + defer r.persistLock.Unlock() + snap := taskRunnerState{ Task: r.task, Version: r.config.Version, From 8cb85ed1200f05d2395db5ba46cc1d6466a69677 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 2 Sep 2016 15:41:05 -0700 Subject: [PATCH 3/5] Don't serialize task states twice in state files --- client/alloc_runner.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index ee7ab9c06e1b..00d76746422e 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -77,7 +77,6 @@ type allocRunnerState struct { Alloc *structs.Allocation AllocClientStatus string AllocClientDescription string - TaskStates map[string]*structs.TaskState Context *driver.ExecContext } @@ -121,7 +120,7 @@ func (r *AllocRunner) RestoreState() error { r.ctx = snap.Context r.allocClientStatus = snap.AllocClientStatus r.allocClientDescription = snap.AllocClientDescription - r.taskStates = snap.TaskStates + r.taskStates = snap.Alloc.TaskStates var snapshotErrors multierror.Error if r.alloc == nil { @@ -189,7 +188,6 @@ func (r *AllocRunner) saveAllocRunnerState() error { alloc := r.Alloc() r.allocLock.Lock() - states := alloc.TaskStates allocClientStatus := r.allocClientStatus allocClientDescription := r.allocClientDescription r.allocLock.Unlock() @@ -204,7 +202,6 @@ func (r *AllocRunner) saveAllocRunnerState() error { Context: ctx, AllocClientStatus: allocClientStatus, AllocClientDescription: allocClientDescription, - TaskStates: states, } return persistState(r.stateFilePath(), &snap) } From fe9fe4c26259c1ad3bd7e94bd711418aaf819b20 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 2 Sep 2016 15:41:41 -0700 Subject: [PATCH 4/5] A nil context isn't an error --- client/alloc_runner.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 00d76746422e..3454450bc954 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -51,13 +51,14 @@ type AllocRunner struct { dirtyCh chan struct{} - ctx *driver.ExecContext - ctxLock sync.Mutex - tasks map[string]*TaskRunner - taskStates map[string]*structs.TaskState - restored map[string]struct{} - taskLock sync.RWMutex + ctx *driver.ExecContext + ctxLock sync.Mutex + tasks map[string]*TaskRunner + restored map[string]struct{} + taskLock sync.RWMutex + + taskStates map[string]*structs.TaskState taskStatusLock sync.RWMutex updateCh chan *structs.Allocation @@ -126,9 +127,6 @@ func (r *AllocRunner) RestoreState() error { if r.alloc == nil { snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil allocation")) } - if r.ctx == nil { - snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil context")) - } if e := snapshotErrors.ErrorOrNil(); e != nil { return e } From 78a4befa714b0b5b20964a5a71548fdc0300e484 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 12 Sep 2016 12:56:12 -0700 Subject: [PATCH 5/5] Revert "A nil context isn't an error" This reverts commit fe9fe4c26259c1ad3bd7e94bd711418aaf819b20. --- client/alloc_runner.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 3454450bc954..00d76746422e 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -51,14 +51,13 @@ type AllocRunner struct { dirtyCh chan struct{} - ctx *driver.ExecContext - ctxLock sync.Mutex + ctx *driver.ExecContext + ctxLock sync.Mutex + tasks map[string]*TaskRunner + taskStates map[string]*structs.TaskState + restored map[string]struct{} + taskLock sync.RWMutex - tasks map[string]*TaskRunner - restored map[string]struct{} - taskLock sync.RWMutex - - taskStates map[string]*structs.TaskState taskStatusLock sync.RWMutex updateCh chan *structs.Allocation @@ -127,6 +126,9 @@ func (r *AllocRunner) RestoreState() error { if r.alloc == nil { snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil allocation")) } + if r.ctx == nil { + snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil context")) + } if e := snapshotErrors.ErrorOrNil(); e != nil { return e }