Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move chroot building into TaskRunner #2132

Merged
merged 13 commits into from
Jan 6, 2017
86 changes: 59 additions & 27 deletions client/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/driver"
"github.com/hashicorp/nomad/client/vaultclient"
"github.com/hashicorp/nomad/nomad/structs"

Expand Down Expand Up @@ -48,8 +47,9 @@ type AllocRunner struct {

dirtyCh chan struct{}

ctx *driver.ExecContext
ctxLock sync.Mutex
allocDir *allocdir.AllocDir
allocDirLock sync.Mutex

tasks map[string]*TaskRunner
taskStates map[string]*structs.TaskState
restored map[string]struct{}
Expand All @@ -76,9 +76,22 @@ type AllocRunner struct {
type allocRunnerState struct {
Version string
Alloc *structs.Allocation
AllocDir *allocdir.AllocDir
AllocClientStatus string
AllocClientDescription string
Context *driver.ExecContext

// COMPAT: Remove in 0.7.0: removing will break upgrading directly from
// 0.5.2, so don't remove in the 0.6 series.
// Context is deprecated and only used to migrate from older releases.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We use the following format for these: // COMPAT: Remove in 0.X.0: <reason for compatibility code and when it can be removed

// It will be removed in the future.
Context *struct {
AllocID string // unused; included for completeness
AllocDir struct {
AllocDir string
SharedDir string // unused; included for completeness
TaskDirs map[string]string
}
} `json:"Context,omitempty"`
}

// NewAllocRunner is used to create a new allocation context
Expand Down Expand Up @@ -117,18 +130,28 @@ func (r *AllocRunner) RestoreState() error {
return err
}

// #2132 Upgrade path: if snap.AllocDir is nil, try to convert old
// Context struct to new AllocDir struct
if snap.AllocDir == nil && snap.Context != nil {
r.logger.Printf("[DEBUG] client: migrating state snapshot for alloc %q", r.alloc.ID)
snap.AllocDir = allocdir.NewAllocDir(r.logger, snap.Context.AllocDir.AllocDir)
for taskName := range snap.Context.AllocDir.TaskDirs {
snap.AllocDir.NewTaskDir(taskName)
}
}

// Restore fields
r.alloc = snap.Alloc
r.ctx = snap.Context
r.allocDir = snap.AllocDir
r.allocClientStatus = snap.AllocClientStatus
r.allocClientDescription = snap.AllocClientDescription

var snapshotErrors multierror.Error
if r.alloc == nil {
snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil allocation"))
}
if r.ctx == nil {
snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil context"))
if r.allocDir == nil {
snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil alloc dir"))
}
if e := snapshotErrors.ErrorOrNil(); e != nil {
return e
Expand All @@ -142,9 +165,16 @@ func (r *AllocRunner) RestoreState() error {
// Mark the task as restored.
r.restored[name] = struct{}{}

td, ok := r.allocDir.TaskDirs[name]
if !ok {
err := fmt.Errorf("failed to find task dir metadata for alloc %q task %q",
r.alloc.ID, name)
r.logger.Printf("[ERR] client: %v", err)
return err
}

task := &structs.Task{Name: name}
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.Alloc(),
task, r.vaultClient)
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, td, r.Alloc(), task, r.vaultClient)
r.tasks[name] = tr

// Skip tasks in terminal states.
Expand All @@ -166,10 +196,7 @@ func (r *AllocRunner) RestoreState() error {

// GetAllocDir returns the alloc dir for the alloc runner
func (r *AllocRunner) GetAllocDir() *allocdir.AllocDir {
if r.ctx == nil {
return nil
}
return r.ctx.AllocDir
return r.allocDir
}

// SaveState is used to snapshot the state of the alloc runner
Expand Down Expand Up @@ -204,14 +231,14 @@ func (r *AllocRunner) saveAllocRunnerState() error {
allocClientDescription := r.allocClientDescription
r.allocLock.Unlock()

r.ctxLock.Lock()
ctx := r.ctx
r.ctxLock.Unlock()
r.allocDirLock.Lock()
allocDir := r.allocDir
r.allocDirLock.Unlock()

snap := allocRunnerState{
Version: r.config.Version,
Alloc: alloc,
Context: ctx,
AllocDir: allocDir,
AllocClientStatus: allocClientStatus,
AllocClientDescription: allocClientDescription,
}
Expand All @@ -233,7 +260,7 @@ func (r *AllocRunner) DestroyState() error {

// DestroyContext is used to destroy the context
func (r *AllocRunner) DestroyContext() error {
return r.ctx.AllocDir.Destroy()
return r.allocDir.Destroy()
}

// copyTaskStates returns a copy of the passed task states.
Expand Down Expand Up @@ -409,26 +436,27 @@ func (r *AllocRunner) Run() {
}

// Create the execution context
r.ctxLock.Lock()
if r.ctx == nil {
allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID))
if err := allocDir.Build(tg.Tasks); err != nil {
r.allocDirLock.Lock()
if r.allocDir == nil {
// Build allocation directory
r.allocDir = allocdir.NewAllocDir(r.logger, filepath.Join(r.config.AllocDir, r.alloc.ID))
if err := r.allocDir.Build(); err != nil {
r.logger.Printf("[WARN] client: failed to build task directories: %v", err)
r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup))
r.ctxLock.Unlock()
r.allocDirLock.Unlock()
return
}
r.ctx = driver.NewExecContext(allocDir, r.alloc.ID)

if r.otherAllocDir != nil {
if err := allocDir.Move(r.otherAllocDir, tg.Tasks); err != nil {
if err := r.allocDir.Move(r.otherAllocDir, tg.Tasks); err != nil {
r.logger.Printf("[ERROR] client: failed to move alloc dir into alloc %q: %v", r.alloc.ID, err)
}
if err := r.otherAllocDir.Destroy(); err != nil {
r.logger.Printf("[ERROR] client: error destroying allocdir %v", r.otherAllocDir.AllocDir, err)
}
}
}
r.ctxLock.Unlock()
r.allocDirLock.Unlock()

// Check if the allocation is in a terminal status. In this case, we don't
// start any of the task runners and directly wait for the destroy signal to
Expand All @@ -448,7 +476,11 @@ func (r *AllocRunner) Run() {
continue
}

tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.Alloc(), task.Copy(), r.vaultClient)
r.allocDirLock.Lock()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be cleaner if AllocDir maintains it's own locks internally so that the consumers of the API doesn't have to ensure they are locked appropriately.

Copy link
Member Author

@schmichael schmichael Jan 4, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sadly we'd still need this lock because it guards the setting and checking the allocDir field on the AllocRunner.

We might be able to skip acquiring it on this particular line if AllocDir had an internal lock, but I'd rather always guard accesses to a field than try to figure out safe places to skip synchronization and confuse future readers of the code.

taskdir := r.allocDir.NewTaskDir(task.Name)
r.allocDirLock.Unlock()

tr := NewTaskRunner(r.logger, r.config, r.setTaskState, taskdir, r.Alloc(), task.Copy(), r.vaultClient)
r.tasks[task.Name] = tr
tr.MarkReceived()

Expand Down
Loading