Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle broker run service exception handling #3163

Merged
merged 5 commits into from
Feb 21, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 63 additions & 65 deletions src/Runner.Listener/JobDispatcher.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public interface IJobDispatcher : IRunnerService
// This implementation of IJobDispatcher is not thread safe.
// It is based on the fact that the current design of the runner is a dequeue
// and processes one message from the message queue at a time.
// In addition, it only executes one job every time,
// In addition, it only executes one job every time,
// and the server will not send another job while this one is still running.
public sealed class JobDispatcher : RunnerService, IJobDispatcher
{
Expand Down Expand Up @@ -546,13 +546,27 @@ await processChannel.SendAsync(
Trace.Info($"Return code {returnCode} indicate worker encounter an unhandled exception or app crash, attach worker stdout/stderr to JobRequest result.");

var jobServer = await InitializeJobServerAsync(systemConnection);
await LogWorkerProcessUnhandledException(jobServer, message, detailInfo);

// Go ahead to finish the job with result 'Failed' if the STDERR from worker is System.IO.IOException, since it typically means we are running out of disk space.
if (detailInfo.Contains(typeof(System.IO.IOException).ToString(), StringComparison.OrdinalIgnoreCase))
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
switch (jobServer)
{
Trace.Info($"Finish job with result 'Failed' due to IOException.");
await ForceFailJob(jobServer, message, detailInfo);
case IJobServer js:
{
await LogWorkerProcessUnhandledException(js, message, unhandledExceptionIssue);
// Go ahead to finish the job with result 'Failed' if the STDERR from worker is System.IO.IOException, since it typically means we are running out of disk space.
if (detailInfo.Contains(typeof(System.IO.IOException).ToString(), StringComparison.OrdinalIgnoreCase))
{
Trace.Info($"Finish job with result 'Failed' due to IOException.");
await ForceFailJob(js, message);
}

break;
}
case IRunServer rs:
await ForceFailJob(rs, message, unhandledExceptionIssue);
yaananth marked this conversation as resolved.
Show resolved Hide resolved
yaananth marked this conversation as resolved.
Show resolved Hide resolved
break;
default:
throw new NotSupportedException($"JobServer type '{jobServer.GetType().Name}' is not supported.");
}
}

Expand Down Expand Up @@ -644,7 +658,7 @@ await processChannel.SendAsync(
}
}

// wait worker to exit
// wait worker to exit
// if worker doesn't exit within timeout, then kill worker.
completedTask = await Task.WhenAny(workerProcessTask, Task.Delay(-1, workerCancelTimeoutKillToken));

Expand Down Expand Up @@ -1131,86 +1145,70 @@ private async Task CompleteJobRequestAsync(int poolId, Pipelines.AgentJobRequest
}

// log an error issue to job level timeline record
private async Task LogWorkerProcessUnhandledException(IRunnerService server, Pipelines.AgentJobRequestMessage message, string detailInfo)
private async Task LogWorkerProcessUnhandledException(IJobServer jobServer, Pipelines.AgentJobRequestMessage message, Issue issue)
{
if (server is IJobServer jobServer)
try
{
try
{
var timeline = await jobServer.GetTimelineAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, message.Timeline.Id, CancellationToken.None);
ArgUtil.NotNull(timeline, nameof(timeline));
var timeline = await jobServer.GetTimelineAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, message.Timeline.Id, CancellationToken.None);
ArgUtil.NotNull(timeline, nameof(timeline));

TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job");
ArgUtil.NotNull(jobRecord, nameof(jobRecord));
TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job");
ArgUtil.NotNull(jobRecord, nameof(jobRecord));

var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
jobRecord.ErrorCount++;
jobRecord.Issues.Add(unhandledExceptionIssue);

if (message.Variables.TryGetValue("DistributedTask.MarkJobAsFailedOnWorkerCrash", out var markJobAsFailedOnWorkerCrash) &&
StringUtil.ConvertToBoolean(markJobAsFailedOnWorkerCrash?.Value))
{
Trace.Info("Mark the job as failed since the worker crashed");
jobRecord.Result = TaskResult.Failed;
// mark the job as completed so service will pickup the result
jobRecord.State = TimelineRecordState.Completed;
}
jobRecord.ErrorCount++;
jobRecord.Issues.Add(issue);

await jobServer.UpdateTimelineRecordsAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, message.Timeline.Id, new TimelineRecord[] { jobRecord }, CancellationToken.None);
}
catch (Exception ex)
if (message.Variables.TryGetValue("DistributedTask.MarkJobAsFailedOnWorkerCrash", out var markJobAsFailedOnWorkerCrash) &&
StringUtil.ConvertToBoolean(markJobAsFailedOnWorkerCrash?.Value))
{
Trace.Error("Fail to report unhandled exception from Runner.Worker process");
Trace.Error(ex);
Trace.Info("Mark the job as failed since the worker crashed");
jobRecord.Result = TaskResult.Failed;
// mark the job as completed so service will pickup the result
jobRecord.State = TimelineRecordState.Completed;
}

await jobServer.UpdateTimelineRecordsAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, message.Timeline.Id, new TimelineRecord[] { jobRecord }, CancellationToken.None);
}
else
catch (Exception ex)
{
Trace.Info("Job server does not support handling unhandled exception yet, error message: {0}", detailInfo);
return;
Trace.Error("Fail to report unhandled exception from Runner.Worker process");
Trace.Error(ex);
}
}

// raise job completed event to fail the job.
private async Task ForceFailJob(IRunnerService server, Pipelines.AgentJobRequestMessage message, string detailInfo)
private async Task ForceFailJob(IJobServer jobServer, Pipelines.AgentJobRequestMessage message)
{
if (server is IJobServer jobServer)
try
{
try
{
var jobCompletedEvent = new JobCompletedEvent(message.RequestId, message.JobId, TaskResult.Failed);
await jobServer.RaisePlanEventAsync<JobCompletedEvent>(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, CancellationToken.None);
}
catch (Exception ex)
{
Trace.Error("Fail to raise JobCompletedEvent back to service.");
Trace.Error(ex);
}
var jobCompletedEvent = new JobCompletedEvent(message.RequestId, message.JobId, TaskResult.Failed);
await jobServer.RaisePlanEventAsync<JobCompletedEvent>(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, CancellationToken.None);
}
else if (server is IRunServer runServer)
catch (Exception ex)
{
try
{
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
var unhandledAnnotation = unhandledExceptionIssue.ToAnnotation();
var jobAnnotations = new List<Annotation>();
if (unhandledAnnotation.HasValue)
{
jobAnnotations.Add(unhandledAnnotation.Value);
}
Trace.Error("Fail to raise JobCompletedEvent back to service.");
Trace.Error(ex);
}
}

await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, TaskResult.Failed, outputs: null, stepResults: null, jobAnnotations: jobAnnotations, environmentUrl: null, CancellationToken.None);
}
catch (Exception ex)
private async Task ForceFailJob(IRunServer runServer, Pipelines.AgentJobRequestMessage message, Issue issue)
{
try
{
var annotation = issue.ToAnnotation();
var jobAnnotations = new List<Annotation>();
if (annotation.HasValue)
{
Trace.Error("Fail to raise job completion back to service.");
Trace.Error(ex);
jobAnnotations.Add(annotation.Value);
}

await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, TaskResult.Failed, outputs: null, stepResults: null, jobAnnotations: jobAnnotations, environmentUrl: null, CancellationToken.None);
}
else
catch (Exception ex)
{
throw new NotSupportedException($"Server type {server.GetType().FullName} is not supported.");
Trace.Error("Fail to raise job completion back to service.");
Trace.Error(ex);
}
}

Expand Down
Loading