Add log message for task CNI setup, and other log mods #3621
@@ -149,14 +149,14 @@ func NewContainerStateChangeEvent(task *apitask.Task, cont *apicontainer.Contain
 	}
 	contKnownStatus := cont.GetKnownStatus()
 	if !contKnownStatus.ShouldReportToBackend(cont.GetSteadyStateStatus()) {
-		return event, errors.Errorf(
+		return event, ErrShouldNotSendEvent{fmt.Sprintf(
 			"create container state change event api: status not recognized by ECS: %v",
-			contKnownStatus)
+			contKnownStatus)}
 	}
 	if cont.GetSentStatus() >= contKnownStatus {
-		return event, errors.Errorf(
+		return event, ErrShouldNotSendEvent{fmt.Sprintf(
 			"create container state change event api: status [%s] already sent for container %s, task %s",
-			contKnownStatus.String(), cont.Name, task.Arn)
+			contKnownStatus.String(), cont.Name, task.Arn)}
 	}
 	if reason == "" && cont.ApplyingError != nil {
 		reason = cont.ApplyingError.Error()

Reviewer comment: Returning a typed error here prevents these expected conditions from being logged as bogus error messages.
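The excerpt does not show where ErrShouldNotSendEvent is defined or consumed. A minimal sketch, assuming a one-field struct matching the positional literal in the diff and a caller that filters with errors.As; the field name and handleEventErr are hypothetical:

package main

import (
	"errors"
	"fmt"
	"log"
)

// ErrShouldNotSendEvent marks state changes that are expected to be
// skipped, so callers can drop them without logging an error.
type ErrShouldNotSendEvent struct {
	fromError string // hypothetical field name; the PR fills it via fmt.Sprintf
}

func (e ErrShouldNotSendEvent) Error() string { return e.fromError }

// handleEventErr is a hypothetical caller that suppresses the noise.
func handleEventErr(err error) {
	var skip ErrShouldNotSendEvent
	if errors.As(err, &skip) {
		return // expected condition: no state change to send, nothing to log
	}
	log.Printf("error creating state change event: %v", err)
}

func main() {
	err := ErrShouldNotSendEvent{fmt.Sprintf("status [%s] already sent", "RUNNING")}
	handleEventErr(err) // logs nothing
}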
@@ -508,10 +508,6 @@ func (task *Task) initNetworkMode(acsTaskNetworkMode *string) {
 			field.NetworkMode: aws.StringValue(acsTaskNetworkMode),
 		})
 	}
-	logger.Info("Task network mode initialized", logger.Fields{
-		field.TaskID:      task.GetID(),
-		field.NetworkMode: task.NetworkMode,
-	})
 }

 func (task *Task) initServiceConnectResources() error {

Reviewer comment: This log message is misleading: the network mode is not actually "initialized" in any functional way here; this function is only called when unmarshaling the task payload from ACS. Since it runs on every task payload, and ACS often sends redundant payloads, the message was repeated anywhere from 4 to 10 times for every task.
@@ -1752,6 +1752,15 @@ func (engine *DockerTaskEngine) provisionContainerResourcesAwsvpc(task *apitask.
 		}
 	}

+	logger.Info("Setting up CNI config for task", logger.Fields{
+		field.TaskID:        task.GetID(),
+		"cniContainerID":    cniConfig.ContainerID,
+		"cniPluginPath":     cniConfig.PluginsPath,
+		"cniID":             cniConfig.ID,
+		"cniBridgeName":     cniConfig.BridgeName,
+		"cniContainerNetNs": cniConfig.ContainerNetNS,
+	})
+
 	// Invoke the libcni to config the network namespace for the container
 	result, err := engine.cniClient.SetupNS(engine.ctx, cniConfig, cniSetupTimeout)
 	if err != nil {

Reviewer comment: This adds a new log message when the agent starts setting up the CNI configuration for a task.
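logger.Fields is the agent's structured-fields type; for readers unfamiliar with it, a rough stand-in using the standard library's log/slog shows the shape of what gets recorded. Every field value below is made up for illustration:

package main

import "log/slog"

func main() {
	// Rough equivalent of the new agent log line; values are hypothetical.
	slog.Info("Setting up CNI config for task",
		"taskID", "0123456789abcdef0",
		"cniContainerID", "5f2bde0a91c4",
		"cniBridgeName", "ecs-bridge",
	)
}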
@@ -707,6 +707,8 @@ func newDockerContainerMetadataResolver(taskEngine ecsengine.TaskEngine) (*Docke
 }

+// taskContainerMetricsUnsafe gets all container metrics for a task arn.
+//
 //gocyclo:ignore
 func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*ecstcs.ContainerMetric, error) {
 	containerMap, taskExists := engine.tasksToContainers[taskArn]
 	if !taskExists {
@@ -723,10 +725,20 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
 		if engine.stopTrackingContainerUnsafe(container, taskArn) {
 			continue
 		}
+		// age is used to determine if we should or should not expect missing metrics.
+		// this is because recently-started containers would normally not have their metrics
+		// queue filled yet.
+		age := time.Since(container.containerMetadata.StartedAt)
+		// gracePeriod is the time that containers are allowed to have missing metrics
+		// without throwing/logging errors.
+		gracePeriod := time.Second * 30

 		// CPU and Memory are both critical, so skip the container if either of these fail.
 		cpuStatsSet, err := container.statsQueue.GetCPUStatsSet()
 		if err != nil {
+			if age < gracePeriod {
+				continue
+			}
 			logger.Error("Error collecting cloudwatch metrics for container", logger.Fields{
 				field.Container: dockerID,
 				field.Error:     err,

Reviewer comment: We are often logging these warning messages soon after a container starts up, because we don't have enough metrics in the queue yet. This change adds a "grace period" of 30s during which we won't warn about that condition. The pattern is sketched in isolation after this hunk.

Reviewer reply: Nice addition!
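A minimal, self-contained sketch of the grace-period check, detached from the agent's types; gracePeriod matches the PR's 30s, while shouldLogMissingMetrics is a hypothetical helper:

package main

import (
	"fmt"
	"time"
)

const gracePeriod = 30 * time.Second

// shouldLogMissingMetrics reports whether a missing-metrics error is worth
// logging: containers younger than the grace period have simply not filled
// their stats queue yet.
func shouldLogMissingMetrics(startedAt time.Time) bool {
	return time.Since(startedAt) >= gracePeriod
}

func main() {
	justStarted := time.Now().Add(-5 * time.Second)
	longRunning := time.Now().Add(-2 * time.Minute)
	fmt.Println(shouldLogMissingMetrics(justStarted)) // false: suppress the warning
	fmt.Println(shouldLogMissingMetrics(longRunning)) // true: log it
}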
@@ -735,6 +747,9 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
 		}
 		memoryStatsSet, err := container.statsQueue.GetMemoryStatsSet()
 		if err != nil {
+			if age < gracePeriod {
+				continue
+			}
 			logger.Error("Error collecting cloudwatch metrics for container", logger.Fields{
 				field.Container: dockerID,
 				field.Error:     err,

@@ -749,7 +764,7 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
 		}

 		storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
-		if err != nil {
+		if err != nil && age > gracePeriod {
 			logger.Warn("Error getting storage stats for container", logger.Fields{
 				field.Container: dockerID,
 				field.Error:     err,

@@ -777,7 +792,7 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
 			seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task")
 		} else {
 			networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
-			if err != nil {
+			if err != nil && age > gracePeriod {
 				// we log the error and still continue to publish cpu, memory stats
 				logger.Warn("Error getting network stats for container", logger.Fields{
 					field.Container: dockerID,

@@ -795,7 +810,7 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
 		// do not add network stats for pause container
 		if dockerContainer.Container.Type != apicontainer.ContainerCNIPause {
 			networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet()
-			if err != nil {
+			if err != nil && age > gracePeriod {
 				logger.Warn("Error getting network stats for container", logger.Fields{
 					field.TaskARN:   taskArn,
 					field.Container: dockerContainer.DockerID,
@@ -169,62 +169,72 @@ func (queue *Queue) GetMemoryStatsSet() (*ecstcs.CWStatsSet, error) {
 func (queue *Queue) GetStorageStatsSet() (*ecstcs.StorageStatsSet, error) {
 	storageStatsSet := &ecstcs.StorageStatsSet{}
 	var err error
+	var errStr string
 	storageStatsSet.ReadSizeBytes, err = queue.getULongStatsSet(getStorageReadBytes)
 	if err != nil {
-		seelog.Warnf("Error getting storage read size bytes: %v", err)
+		errStr += fmt.Sprintf("error getting storage read size bytes: %v - ", err)
 	}
 	storageStatsSet.WriteSizeBytes, err = queue.getULongStatsSet(getStorageWriteBytes)
 	if err != nil {
-		seelog.Warnf("Error getting storage write size bytes: %v", err)
+		errStr += fmt.Sprintf("error getting storage write size bytes: %v - ", err)
 	}
-	return storageStatsSet, err
+	var errOut error
+	if len(errStr) > 0 {
+		errOut = fmt.Errorf(errStr)
+	}
+	return storageStatsSet, errOut
 }

 // GetNetworkStatsSet gets the stats set for network metrics.
 func (queue *Queue) GetNetworkStatsSet() (*ecstcs.NetworkStatsSet, error) {
 	networkStatsSet := &ecstcs.NetworkStatsSet{}
 	var err error
+	var errStr string
 	networkStatsSet.RxBytes, err = queue.getULongStatsSet(getNetworkRxBytes)
 	if err != nil {
-		seelog.Warnf("Error getting network rx bytes: %v", err)
+		errStr += fmt.Sprintf("error getting network rx bytes: %v - ", err)
 	}
 	networkStatsSet.RxDropped, err = queue.getULongStatsSet(getNetworkRxDropped)
 	if err != nil {
-		seelog.Warnf("Error getting network rx dropped: %v", err)
+		errStr += fmt.Sprintf("error getting network rx dropped: %v - ", err)
 	}
 	networkStatsSet.RxErrors, err = queue.getULongStatsSet(getNetworkRxErrors)
 	if err != nil {
-		seelog.Warnf("Error getting network rx errors: %v", err)
+		errStr += fmt.Sprintf("error getting network rx errors: %v - ", err)
 	}
 	networkStatsSet.RxPackets, err = queue.getULongStatsSet(getNetworkRxPackets)
 	if err != nil {
-		seelog.Warnf("Error getting network rx packets: %v", err)
+		errStr += fmt.Sprintf("error getting network rx packets: %v - ", err)
 	}
 	networkStatsSet.TxBytes, err = queue.getULongStatsSet(getNetworkTxBytes)
 	if err != nil {
-		seelog.Warnf("Error getting network tx bytes: %v", err)
+		errStr += fmt.Sprintf("error getting network tx bytes: %v - ", err)
 	}
 	networkStatsSet.TxDropped, err = queue.getULongStatsSet(getNetworkTxDropped)
 	if err != nil {
-		seelog.Warnf("Error getting network tx dropped: %v", err)
+		errStr += fmt.Sprintf("error getting network tx dropped: %v - ", err)
 	}
 	networkStatsSet.TxErrors, err = queue.getULongStatsSet(getNetworkTxErrors)
 	if err != nil {
-		seelog.Warnf("Error getting network tx errors: %v", err)
+		errStr += fmt.Sprintf("error getting network tx errors: %v - ", err)
 	}
 	networkStatsSet.TxPackets, err = queue.getULongStatsSet(getNetworkTxPackets)
 	if err != nil {
-		seelog.Warnf("Error getting network tx packets: %v", err)
+		errStr += fmt.Sprintf("error getting network tx packets: %v - ", err)
 	}
 	networkStatsSet.RxBytesPerSecond, err = queue.getUDoubleCWStatsSet(getNetworkRxPacketsPerSecond)
 	if err != nil {
-		seelog.Warnf("Error getting network rx bytes per second: %v", err)
+		errStr += fmt.Sprintf("error getting network rx bytes per second: %v - ", err)
 	}
 	networkStatsSet.TxBytesPerSecond, err = queue.getUDoubleCWStatsSet(getNetworkTxPacketsPerSecond)
 	if err != nil {
-		seelog.Warnf("Error getting network tx bytes per second: %v", err)
+		errStr += fmt.Sprintf("error getting network tx bytes per second: %v - ", err)
 	}
-	return networkStatsSet, err
+	var errOut error
+	if len(errStr) > 0 {
+		errOut = fmt.Errorf(errStr)
+	}
+	return networkStatsSet, errOut
 }

 func getNetworkRxBytes(s *UsageStats) uint64 {

Reviewer comment: The errors below are already logged as warnings in the calling function, so this removes the duplicate logging here.
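Not part of the PR: accumulating into a string and passing it to fmt.Errorf works, but the accumulated string becomes a format string, so a stray % verb inside an underlying error would be misinterpreted. A sketch of the same accumulate-then-return shape using errors.Join (Go 1.20+); getRx, getTx, and networkStats are hypothetical stand-ins:

package main

import (
	"errors"
	"fmt"
)

func getRx() (uint64, error) { return 0, errors.New("queue is empty") } // stand-in
func getTx() (uint64, error) { return 42, nil }                         // stand-in

func networkStats() (rx, tx uint64, err error) {
	var errs []error
	if rx, err = getRx(); err != nil {
		errs = append(errs, fmt.Errorf("error getting network rx bytes: %w", err))
	}
	if tx, err = getTx(); err != nil {
		errs = append(errs, fmt.Errorf("error getting network tx bytes: %w", err))
	}
	// errors.Join returns nil when errs is empty, matching the errOut logic.
	return rx, tx, errors.Join(errs...)
}

func main() {
	_, _, err := networkStats()
	fmt.Println(err) // error getting network rx bytes: queue is empty
}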
@@ -57,22 +57,14 @@ func (taskStat *StatsTask) collect() {
 	backoff := retry.NewExponentialBackoff(time.Second*1, time.Second*10, 0.5, 2)

 	for {
-		err := taskStat.processStatsStream()
+		statStreamErr := taskStat.processStatsStream()
 		select {
 		case <-taskStat.Ctx.Done():
 			logger.Debug("Stopping stats collection for taskStat", logger.Fields{
 				field.TaskID: taskId,
 			})
 			return
 		default:
-			if err != nil {
-				d := backoff.Duration()
-				time.Sleep(d)
-				logger.Debug("Error querying stats for task", logger.Fields{
-					field.TaskID: taskId,
-					field.Error:  err,
-				})
-			}
 			// We were disconnected from the stats stream.
 			// Check if the task is terminal. If it is, stop collecting metrics.
 			terminal, err := taskStat.terminal()

@@ -83,11 +75,22 @@ func (taskStat *StatsTask) collect() {
 					field.Error: err,
 				})
 				taskStat.StopStatsCollection()
+				continue
 			} else if terminal {
-				logger.Warn("Task is terminal, stopping stats collection", logger.Fields{
+				logger.Info("Task is terminal, stopping stats collection", logger.Fields{
 					field.TaskID: taskId,
 				})
 				taskStat.StopStatsCollection()
+				continue
+			}
+			// task stats were not stopped for terminal task, backoff before trying to reconnect
+			if statStreamErr != nil {
+				d := backoff.Duration()
+				time.Sleep(d)
+				logger.Debug("Error querying stats for task", logger.Fields{
+					field.TaskID: taskId,
+					field.Error:  err,
+				})
+			}
 		}
 	}

Reviewer comment (on the Warn-to-Info change): This is not really a warning, as it is expected behavior.

Reviewer comment (on moving the backoff): Backing off and logging only after we determine that the task has not exited prevents a flood of repeated log messages for tasks that are already terminal. A simplified version of the reordered loop follows.
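A minimal sketch of the reordered loop, with the agent's types replaced by hypothetical function parameters (collectLoop, stream, terminal) and the exponential backoff simplified to a fixed delay:

package main

import (
	"context"
	"fmt"
	"time"
)

// collectLoop is a hypothetical stand-in for StatsTask.collect: it checks
// for shutdown and terminal state first, and only backs off when the stream
// errored and collection should actually continue.
func collectLoop(ctx context.Context, stream func() error, terminal func() bool) {
	const backoff = time.Second // the agent uses an exponential backoff instead
	for {
		streamErr := stream()
		select {
		case <-ctx.Done():
			return
		default:
			if terminal() {
				fmt.Println("task is terminal, stopping stats collection")
				return
			}
			// The task is still live; back off before reconnecting to the stream.
			if streamErr != nil {
				time.Sleep(backoff)
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	polls := 0
	collectLoop(ctx,
		func() error { return fmt.Errorf("stream disconnected") },
		func() bool { polls++; return polls > 2 }, // terminal after a few polls
	)
}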
Reviewer comment: oh finally! 🙌