Skip to content

Commit

Permalink
Track task start/finish time & improve logs errors
Browse files Browse the repository at this point in the history
This PR adds tracking to when a task starts and finishes and the logs
API takes advantage of this and returns better errors when asking for
logs that do not exist.
  • Loading branch information
dadgar committed Mar 31, 2017
1 parent 2ec3e61 commit d212f6f
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 14 deletions.
8 changes: 5 additions & 3 deletions api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,9 +424,11 @@ func (t *Task) SetLogConfig(l *LogConfig) *Task {
// TaskState tracks the current state of a task and events that caused state
// transitions.
type TaskState struct {
State string
Failed bool
Events []*TaskEvent
State string
Failed bool
StartedAt time.Time
FinishedAt time.Time
Events []*TaskEvent
}

const (
Expand Down
19 changes: 14 additions & 5 deletions client/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,16 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
return
}

taskState.State = state
if state == structs.TaskStateDead {
switch state {
case structs.TaskStateRunning:
// Capture the start time if it is just starting
if taskState.State != structs.TaskStateRunning {
taskState.StartedAt = time.Now().UTC()
}
case structs.TaskStateDead:
// Capture the finished time
taskState.FinishedAt = time.Now().UTC()

// Find all tasks that are not the one that is dead and check if the one
// that is dead is a leader
var otherTaskRunners []*TaskRunner
Expand All @@ -406,18 +414,19 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames)
}
} else if leader {
// If the task was a leader task we should kill all the other tasks.
for _, tr := range otherTaskRunners {
tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead))
}
if len(otherTaskRunners) > 0 {
r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames)
}
}

// If the task was a leader task we should kill all the other
// tasks.
}

// Store the new state
taskState.State = state

select {
case r.dirtyCh <- struct{}{}:
default:
Expand Down
9 changes: 9 additions & 0 deletions client/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,9 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
if state1.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
}
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}
if len(state1.Events) < 2 {
// At least have a received and destroyed
return false, fmt.Errorf("Unexpected number of events")
Expand Down Expand Up @@ -700,6 +703,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
if state1.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
}
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}
if len(state1.Events) < 2 {
// At least have a received and destroyed
return false, fmt.Errorf("Unexpected number of events")
Expand All @@ -721,6 +727,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
if state2.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
}
if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}

return true, nil
}, func(err error) {
Expand Down
23 changes: 17 additions & 6 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,11 +531,21 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {

ar, ok := c.allocs[allocID]
if !ok {
return nil, fmt.Errorf("alloc not found")
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
}
return ar.GetAllocDir(), nil
}

// GetClientAlloc returns the allocation from the client
func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
all := c.allAllocs()
alloc, ok := all[allocID]
if !ok {
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
}
return alloc, nil
}

// GetServers returns the list of nomad servers this client is aware of.
func (c *Client) GetServers() []string {
endpoints := c.servers.all()
Expand Down Expand Up @@ -2349,20 +2359,21 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resource
}

// allAllocs returns all the allocations managed by the client
func (c *Client) allAllocs() []*structs.Allocation {
var allocs []*structs.Allocation
func (c *Client) allAllocs() map[string]*structs.Allocation {
allocs := make(map[string]*structs.Allocation, 16)
for _, ar := range c.getAllocRunners() {
allocs = append(allocs, ar.Alloc())
a := ar.Alloc()
allocs[a.ID] = a
}
c.blockedAllocsLock.Lock()
for _, alloc := range c.blockedAllocations {
allocs = append(allocs, alloc)
allocs[alloc.ID] = alloc
}
c.blockedAllocsLock.Unlock()

c.migratingAllocsLock.Lock()
for _, ctrl := range c.migratingAllocs {
allocs = append(allocs, ctrl.alloc)
allocs[ctrl.alloc.ID] = ctrl.alloc
}
c.migratingAllocsLock.Unlock()
return allocs
Expand Down
18 changes: 18 additions & 0 deletions command/agent/fs_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,24 @@ func (s *HTTPServer) Logs(resp http.ResponseWriter, req *http.Request) (interfac
return nil, err
}

alloc, err := s.agent.client.GetClientAlloc(allocID)
if err != nil {
return nil, err
}

// Check that the task is there
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
return nil, fmt.Errorf("Failed to lookup task group for allocation")
} else if taskStruct := tg.LookupTask(task); taskStruct == nil {
return nil, CodedError(404, fmt.Sprintf("task group %q does not have task with name %q", alloc.TaskGroup, task))
}

state, ok := alloc.TaskStates[task]
if !ok || state.StartedAt.IsZero() {
return nil, CodedError(404, fmt.Sprintf("task %q not started yet. No logs available", task))
}

// Create an output that gets flushed on every write
output := ioutils.NewWriteFlusher(resp)

Expand Down
10 changes: 10 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2774,6 +2774,14 @@ type TaskState struct {
// Failed marks a task as having failed
Failed bool

// StartedAt is the time the task is started. It is updated each time the
// task starts
StartedAt time.Time

// FinishedAt is the time at which the task transistioned to dead and will
// not be started again.
FinishedAt time.Time

// Series of task events that transition the state of the task.
Events []*TaskEvent
}
Expand All @@ -2785,6 +2793,8 @@ func (ts *TaskState) Copy() *TaskState {
copy := new(TaskState)
copy.State = ts.State
copy.Failed = ts.Failed
copy.StartedAt = ts.StartedAt
copy.FinishedAt = ts.FinishedAt

if ts.Events != nil {
copy.Events = make([]*TaskEvent, len(ts.Events))
Expand Down
6 changes: 6 additions & 0 deletions website/source/docs/http/alloc.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ be specified using the `?region=` query parameter.
}
],
"State": "running"
"FinishedAt": "0001-01-01T00:00:00Z",
"StartedAt": "2017-03-31T22:51:40.248633594Z",
"Failed": false,
}
},
Expand All @@ -247,6 +249,10 @@ be specified using the `?region=` query parameter.
* `TaskStateRunning` - The task is currently running.
* `TaskStateDead` - The task is dead and will not run again.

Further the state contains the `StartedAt` and `FinishedAt` times of the
task. `StartedAt` can be updated multiple times if the task restarts but
`FinishedAt` is set only when the task transistions to `TaskStateDead`

<p>The latest 10 events are stored per task. Each event is timestamped (unix nano-seconds)
and has one of the following types:</p>

Expand Down

0 comments on commit d212f6f

Please sign in to comment.