Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track task start/finish time & improve logs errors #2512

Merged
merged 1 commit into from
Mar 31, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,9 +424,11 @@ func (t *Task) SetLogConfig(l *LogConfig) *Task {
// TaskState tracks the current state of a task and events that caused state
// transitions.
type TaskState struct {
State string
Failed bool
Events []*TaskEvent
State string
Failed bool
StartedAt time.Time
FinishedAt time.Time
Events []*TaskEvent
}

const (
Expand Down
19 changes: 14 additions & 5 deletions client/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,16 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
return
}

taskState.State = state
if state == structs.TaskStateDead {
switch state {
case structs.TaskStateRunning:
// Capture the start time if it is just starting
if taskState.State != structs.TaskStateRunning {
taskState.StartedAt = time.Now().UTC()
}
case structs.TaskStateDead:
// Capture the finished time
taskState.FinishedAt = time.Now().UTC()

// Find all tasks that are not the one that is dead and check if the one
// that is dead is a leader
var otherTaskRunners []*TaskRunner
Expand All @@ -406,18 +414,19 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames)
}
} else if leader {
// If the task was a leader task we should kill all the other tasks.
for _, tr := range otherTaskRunners {
tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead))
}
if len(otherTaskRunners) > 0 {
r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames)
}
}

// If the task was a leader task we should kill all the other
// tasks.
}

// Store the new state
taskState.State = state

select {
case r.dirtyCh <- struct{}{}:
default:
Expand Down
9 changes: 9 additions & 0 deletions client/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,9 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
if state1.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
}
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}
if len(state1.Events) < 2 {
// At least have a received and destroyed
return false, fmt.Errorf("Unexpected number of events")
Expand Down Expand Up @@ -700,6 +703,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
if state1.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
}
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}
if len(state1.Events) < 2 {
// At least have a received and destroyed
return false, fmt.Errorf("Unexpected number of events")
Expand All @@ -721,6 +727,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
if state2.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
}
if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}

return true, nil
}, func(err error) {
Expand Down
23 changes: 17 additions & 6 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -531,11 +531,21 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {

ar, ok := c.allocs[allocID]
if !ok {
return nil, fmt.Errorf("alloc not found")
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
}
return ar.GetAllocDir(), nil
}

// GetClientAlloc returns the allocation from the client
func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
all := c.allAllocs()
alloc, ok := all[allocID]
if !ok {
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
}
return alloc, nil
}

// GetServers returns the list of nomad servers this client is aware of.
func (c *Client) GetServers() []string {
endpoints := c.servers.all()
Expand Down Expand Up @@ -2349,20 +2359,21 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resource
}

// allAllocs returns all the allocations managed by the client
func (c *Client) allAllocs() []*structs.Allocation {
var allocs []*structs.Allocation
func (c *Client) allAllocs() map[string]*structs.Allocation {
allocs := make(map[string]*structs.Allocation, 16)
for _, ar := range c.getAllocRunners() {
allocs = append(allocs, ar.Alloc())
a := ar.Alloc()
allocs[a.ID] = a
}
c.blockedAllocsLock.Lock()
for _, alloc := range c.blockedAllocations {
allocs = append(allocs, alloc)
allocs[alloc.ID] = alloc
}
c.blockedAllocsLock.Unlock()

c.migratingAllocsLock.Lock()
for _, ctrl := range c.migratingAllocs {
allocs = append(allocs, ctrl.alloc)
allocs[ctrl.alloc.ID] = ctrl.alloc
}
c.migratingAllocsLock.Unlock()
return allocs
Expand Down
18 changes: 18 additions & 0 deletions command/agent/fs_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,24 @@ func (s *HTTPServer) Logs(resp http.ResponseWriter, req *http.Request) (interfac
return nil, err
}

alloc, err := s.agent.client.GetClientAlloc(allocID)
if err != nil {
return nil, err
}

// Check that the task is there
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
return nil, fmt.Errorf("Failed to lookup task group for allocation")
} else if taskStruct := tg.LookupTask(task); taskStruct == nil {
return nil, CodedError(404, fmt.Sprintf("task group %q does not have task with name %q", alloc.TaskGroup, task))
}

state, ok := alloc.TaskStates[task]
if !ok || state.StartedAt.IsZero() {
return nil, CodedError(404, fmt.Sprintf("task %q not started yet. No logs available", task))
}

// Create an output that gets flushed on every write
output := ioutils.NewWriteFlusher(resp)

Expand Down
10 changes: 10 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2774,6 +2774,14 @@ type TaskState struct {
// Failed marks a task as having failed
Failed bool

// StartedAt is the time the task is started. It is updated each time the
// task starts
StartedAt time.Time

// FinishedAt is the time at which the task transistioned to dead and will
// not be started again.
FinishedAt time.Time

// Series of task events that transition the state of the task.
Events []*TaskEvent
}
Expand All @@ -2785,6 +2793,8 @@ func (ts *TaskState) Copy() *TaskState {
copy := new(TaskState)
copy.State = ts.State
copy.Failed = ts.Failed
copy.StartedAt = ts.StartedAt
copy.FinishedAt = ts.FinishedAt

if ts.Events != nil {
copy.Events = make([]*TaskEvent, len(ts.Events))
Expand Down
6 changes: 6 additions & 0 deletions website/source/docs/http/alloc.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ be specified using the `?region=` query parameter.
}
],
"State": "running"
"FinishedAt": "0001-01-01T00:00:00Z",
"StartedAt": "2017-03-31T22:51:40.248633594Z",
"Failed": false,
}
},
Expand All @@ -247,6 +249,10 @@ be specified using the `?region=` query parameter.
* `TaskStateRunning` - The task is currently running.
* `TaskStateDead` - The task is dead and will not run again.

Further the state contains the `StartedAt` and `FinishedAt` times of the
task. `StartedAt` can be updated multiple times if the task restarts but
`FinishedAt` is set only when the task transistions to `TaskStateDead`

<p>The latest 10 events are stored per task. Each event is timestamped (unix nano-seconds)
and has one of the following types:</p>

Expand Down