Skip to content
This repository has been archived by the owner on Nov 1, 2023. It is now read-only.

Debug failing check pr #2476

Merged
merged 9 commits into from
Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/ApiService/ApiService/Functions/AgentCommands.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ private async Async.Task<HttpResponseData> Get(HttpRequestData req) {
var envelope = new NodeCommandEnvelope(command, messageId);
return await RequestHandling.Ok(req, new PendingNodeCommand(envelope));
} else {
_log.WithTag("HttpRequest", "GET").Verbose($"failed to find {nodeCommand.MachineId:Tag:MachineId}");
return await RequestHandling.Ok(req, new PendingNodeCommand(null));
}
}
Expand Down
1 change: 0 additions & 1 deletion src/ApiService/ApiService/Functions/AgentEvents.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ private async Async.Task<HttpResponseData> Post(HttpRequestData req) {
return null;
}

_log.Info($"node state update: {machineId:Tag:MachineId} from {node.State:Tag:FromState} to {ev.State:Tag:ToState}");
node = await _context.NodeOperations.SetState(node, ev.State);

if (ev.State == NodeState.Free) {
Expand Down
2 changes: 1 addition & 1 deletion src/ApiService/ApiService/Log.cs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ public interface ILogTracer {
void Error(LogStringHandler message);

void Error(Error error);
void Event(LogStringHandler evt, IReadOnlyDictionary<string, double>? metrics);
void Event(LogStringHandler evt, IReadOnlyDictionary<string, double>? metrics = null);
void Exception(Exception ex, LogStringHandler message = $"", IReadOnlyDictionary<string, double>? metrics = null);
void ForceFlush();
void Info(LogStringHandler message);
Expand Down
85 changes: 41 additions & 44 deletions src/ApiService/ApiService/onefuzzlib/NodeOperations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public NodeOperations(ILogTracer log, IOnefuzzContext context)

public async Task<OneFuzzResultVoid> AcquireScaleInProtection(Node node) {
if (node.ScalesetId is Guid scalesetId && await ScalesetNodeExists(node)) {
_logTracer.Info($"Setting scale-in protection on node {node.MachineId}");
_logTracer.Info($"Setting scale-in protection on node {node.MachineId:Tag:MachineId}");
return await _context.VmssOperations.UpdateScaleInProtection(scalesetId, node.MachineId, protectFromScaleIn: true);
}

Expand All @@ -95,7 +95,7 @@ public async Task<OneFuzzResultVoid> ReleaseScaleInProtection(Node node) {
if (!node.DebugKeepNode &&
node.ScalesetId is Guid scalesetId &&
await ScalesetNodeExists(node)) {
_logTracer.Info($"Removing scale-in protection on node {node.MachineId}");
_logTracer.Info($"Removing scale-in protection on node {node.MachineId:Tag:MachineId}");
return await _context.VmssOperations.UpdateScaleInProtection(scalesetId, node.MachineId, protectFromScaleIn: false);
}

Expand All @@ -119,68 +119,68 @@ public async Async.Task<bool> ScalesetNodeExists(Node node) {

public async Task<bool> CanProcessNewWork(Node node) {
if (IsOutdated(node) && _context.ServiceConfiguration.OneFuzzAllowOutdatedAgent != "true") {
_logTracer.Info($"can_process_new_work agent and service versions differ, stopping node. machine_id:{node.MachineId} agent_version:{node.Version} service_version:{_context.ServiceConfiguration.OneFuzzVersion}");
_logTracer.Info($"can_process_new_work agent and service versions differ, stopping node. {node.MachineId:Tag:MachineId} {node.Version:Tag:AgentVersion} {_context.ServiceConfiguration.OneFuzzVersion:Tag:ServiceVersion}");
_ = await Stop(node, done: true);
return false;
}

if (IsTooOld(node)) {
_logTracer.Info($"can_process_new_work node is too old. machine_id:{node.MachineId}");
_logTracer.Info($"can_process_new_work node is too old {node.MachineId:Tag:MachineId}");
_ = await Stop(node, done: true);
return false;
}

if (!node.State.CanProcessNewWork()) {
_logTracer.Info($"can_process_new_work node not in appropriate state for new work machine_id:{node.MachineId} state:{node.State}");
_logTracer.Info($"can_process_new_work node not in appropriate state for new work {node.MachineId:Tag:MachineId} {node.State:Tag:State}");
return false;
}

if (node.State.ReadyForReset()) {
_logTracer.Info($"can_process_new_work node is set for reset. machine_id:{node.MachineId}");
_logTracer.Info($"can_process_new_work node is set for reset {node.MachineId:Tag:MachineId}");
return false;
}

if (node.DeleteRequested) {
_logTracer.Info($"can_process_new_work is set to be deleted. machine_id:{node.MachineId}");
_logTracer.Info($"can_process_new_work is set to be deleted {node.MachineId:Tag:MachineId}");
_ = await Stop(node, done: true);
return false;
}

if (node.ReimageRequested) {
_logTracer.Info($"can_process_new_work is set to be reimaged. machine_id:{node.MachineId}");
_logTracer.Info($"can_process_new_work is set to be reimaged {node.MachineId:Tag:MachineId}");
_ = await Stop(node, done: true);
return false;
}

if (await CouldShrinkScaleset(node)) {
_logTracer.Info($"can_process_new_work node scheduled to shrink. machine_id:{node.MachineId}");
_logTracer.Info($"can_process_new_work node scheduled to shrink {node.MachineId:Tag:MachineId}");
_ = await SetHalt(node);
return false;
}

if (node.ScalesetId != null) {
var scalesetResult = await _context.ScalesetOperations.GetById(node.ScalesetId.Value);
if (!scalesetResult.IsOk) {
_logTracer.Info($"can_process_new_work invalid scaleset. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}");
_logTracer.Info($"can_process_new_work invalid scaleset {node.ScalesetId:Tag:ScalesetId} - {node.MachineId:Tag:MachineId}");
return false;
}

var scaleset = scalesetResult.OkV;
if (!scaleset.State.IsAvailable()) {
_logTracer.Info($"can_process_new_work scaleset not available for work. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}");
_logTracer.Info($"can_process_new_work scaleset not available for work {scaleset.ScalesetId:Tag:ScalesetId} - {node.MachineId:Tag:MachineId} {scaleset.State:Tag:State}");
return false;
}
}

var poolResult = await _context.PoolOperations.GetByName(node.PoolName);
if (!poolResult.IsOk) {
_logTracer.Info($"can_schedule - invalid pool. pool_name:{node.PoolName} machine_id:{node.MachineId}");
_logTracer.Info($"can_schedule - invalid pool {node.PoolName:Tag:PoolName} - {node.MachineId:Tag:MachineId}");
return false;
}

var pool = poolResult.OkV;
if (!PoolStateHelper.Available.Contains(pool.State)) {
_logTracer.Info($"can_schedule - pool is not available for work. pool_name:{node.PoolName} machine_id:{node.MachineId}");
_logTracer.Info($"can_schedule - pool is not available for work {node.PoolName:Tag:PoolName} - {node.MachineId:Tag:MachineId}");
return false;
}

Expand All @@ -193,9 +193,9 @@ public async Task<bool> CanProcessNewWork(Node node) {
/// reasonably up-to-date with OS patches without disrupting running
/// fuzzing tasks with patch reboot cycles.
public async Async.Task ReimageLongLivedNodes(Guid scaleSetId) {
var timeFilter = $"not (initialized_at ge datetime'{(DateTimeOffset.UtcNow - INodeOperations.NODE_REIMAGE_TIME).ToString("o")}')";

await foreach (var node in QueryAsync($"(scaleset_id eq '{scaleSetId}') and {timeFilter}")) {
var timeFilter = Query.OlderThan("initialized_at", DateTimeOffset.UtcNow - INodeOperations.NODE_REIMAGE_TIME);
//force ToString(), since all GUIDs are strings in the table
await foreach (var node in QueryAsync(Query.And(TableClient.CreateQueryFilter($"scaleset_id eq {scaleSetId.ToString()}"), timeFilter))) {
if (node.DebugKeepNode) {
_logTracer.Info($"removing debug_keep_node for expired node. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}");
}
Expand All @@ -213,7 +213,7 @@ public async Async.Task MarkOutdatedNodes() {

var outdated = SearchOutdated(excludeUpdateScheduled: true);
await foreach (var node in outdated) {
_logTracer.Info($"node is outdated: {node.MachineId} - node_version:{node.Version} api_version:");
_logTracer.Info($"node is outdated: {node.MachineId:Tag:MachineId} - {node.Version:Tag:NodeVersion}");

if (node.Version == "1.0.0") {
_ = await ToReimage(node, done: true);
Expand Down Expand Up @@ -303,7 +303,7 @@ public async Async.Task<Node> ToReimage(Node node, bool done = false) {

var reimageRequested = node.ReimageRequested;
if (!node.ReimageRequested && !node.DeleteRequested) {
_logTracer.Info($"setting reimage_requested: {node.MachineId:Tag:MachineId}");
_logTracer.Info($"setting reimage_requested: {node.MachineId:Tag:MachineId} {node.ScalesetId:Tag:ScalesetId}");
reimageRequested = true;
}

Expand All @@ -313,7 +313,7 @@ public async Async.Task<Node> ToReimage(Node node, bool done = false) {

var r = await Replace(updatedNode);
if (!r.IsOk) {
_logTracer.WithHttpStatus(r.ErrorV).Error($"Failed to save Node record for node {node.MachineId:Tag:MachineId}");
_logTracer.WithHttpStatus(r.ErrorV).Error($"Failed to save Node record for node {updatedNode.MachineId:Tag:MachineId} {node.ScalesetId:Tag:ScalesetId}");
}

return updatedNode;
Expand Down Expand Up @@ -345,7 +345,17 @@ public IAsyncEnumerable<Node> GetDeadNodes(Guid scaleSetId, TimeSpan expirationP
} catch (RequestFailedException ex) when (
ex.Status == (int)HttpStatusCode.Conflict ||
ex.ErrorCode == "EntityAlreadyExists") {
return null;

var existingNode = await QueryAsync(Query.SingleEntity(poolName.ToString(), machineId.ToString())).FirstOrDefaultAsync();
stishkin marked this conversation as resolved.
Show resolved Hide resolved
if (existingNode is not null) {
if (existingNode.State != node.State || existingNode.ReimageRequested != node.ReimageRequested || existingNode.Version != node.Version || existingNode.DeleteRequested != node.DeleteRequested) {
_logTracer.Error($"Not replacing {existingNode:Tag:ExistingNode} with a new-and-different {node:Tag:Node}");
}
return null;
} else {
_logTracer.Critical($"Failed to get node when node insertion returned EntityAlreadyExists {poolName.ToString():Tag:PoolName} {machineId:Tag:MachineId}");
r = await Replace(node);
}
}
} else {
r = await Replace(node);
Expand Down Expand Up @@ -448,17 +458,17 @@ public async Task<bool> CouldShrinkScaleset(Node node) {
}

public async Async.Task<Node> SetState(Node node, NodeState state) {
if (node.State == state) {
return node;
}
if (node.State != state) {
_logTracer.Event($"SetState Node {node.MachineId:Tag:MachineId} {node.State:Tag:From} - {state:Tag:To}");

node = node with { State = state };
await _context.Events.SendEvent(new EventNodeStateUpdated(
node.MachineId,
node.ScalesetId,
node.PoolName,
state
));
node = node with { State = state };
await _context.Events.SendEvent(new EventNodeStateUpdated(
node.MachineId,
node.ScalesetId,
node.PoolName,
state
));
}

var r = await Update(node);
if (!r.IsOk) {
Expand All @@ -470,12 +480,10 @@ await _context.Events.SendEvent(new EventNodeStateUpdated(
}

public static string SearchStatesQuery(
stishkin marked this conversation as resolved.
Show resolved Hide resolved
string oneFuzzVersion,
Guid? poolId = default,
Guid? scaleSetId = default,
IEnumerable<NodeState>? states = default,
PoolName? poolName = default,
bool excludeUpdateScheduled = false,
int? numResults = default) {

List<string> queryParts = new();
Expand All @@ -497,17 +505,6 @@ public static string SearchStatesQuery(
queryParts.Add($"({q})");
}

if (excludeUpdateScheduled) {
queryParts.Add($"reimage_requested eq false");
queryParts.Add($"delete_requested eq false");
}

//# azure table query always return false when the column does not exist
//# We write the query this way to allow us to get the nodes where the
//# version is not defined as well as the nodes with a mismatched version
var versionQuery = TableClient.CreateQueryFilter($"not (version eq {oneFuzzVersion})");
queryParts.Add(versionQuery);

return Query.And(queryParts);
}

Expand All @@ -519,7 +516,7 @@ public IAsyncEnumerable<Node> SearchStates(
PoolName? poolName = default,
bool excludeUpdateScheduled = false,
int? numResults = default) {
var query = NodeOperations.SearchStatesQuery(_context.ServiceConfiguration.OneFuzzVersion, poolId, scalesetId, states, poolName, excludeUpdateScheduled, numResults);
var query = NodeOperations.SearchStatesQuery(poolId, scalesetId, states, poolName, numResults);

if (numResults is null) {
return QueryAsync(query);
Expand Down
1 change: 1 addition & 0 deletions src/ApiService/ApiService/onefuzzlib/PoolOperations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ public async Async.Task<Pool> SetState(Pool pool, PoolState state) {
return pool;
}

_logTracer.WithTag("PoolName", pool.Name.ToString()).Event($"SetState Pool {pool.PoolId:Tag:PoolId} {pool.State:Tag:From} - {state:Tag:To}");
// scalesets should never leave the `halt` state
// it is terminal
if (pool.State == PoolState.Halt) {
Expand Down
2 changes: 2 additions & 0 deletions src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ public async Async.Task<Proxy> SetState(Proxy proxy, VmState state) {
return proxy;
}

_logTracer.Event($"SetState Proxy {proxy.ProxyId:Tag:ProxyId} {proxy.State:Tag:From} - {state:Tag:To}");

var newProxy = proxy with { State = state };
var r = await Replace(newProxy);
if (!r.IsOk) {
Expand Down
8 changes: 4 additions & 4 deletions src/ApiService/ApiService/onefuzzlib/ReproOperations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,15 @@ public async Async.Task<Repro> Stopping(Repro repro) {
var vm = await GetVm(repro, config);
var vmOperations = _context.VmOperations;
if (!await vmOperations.IsDeleted(vm)) {
_logTracer.Info($"vm stopping: {repro.VmId}");
_logTracer.Info($"vm stopping: {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName}");
var rr = await vmOperations.Delete(vm);
if (!rr) {
_logTracer.Error($"failed to delete repro {repro.VmId:Tag:VmId}");
if (rr) {
_logTracer.Info($"repro vm fully deleted {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName}");
stishkin marked this conversation as resolved.
Show resolved Hide resolved
}
repro = repro with { State = VmState.Stopping };
var r = await Replace(repro);
if (!r.IsOk) {
_logTracer.WithHttpStatus(r.ErrorV).Error($"failed to replace repro {repro.VmId:Tag:VmId} marked Stopping");
_logTracer.WithHttpStatus(r.ErrorV).Error($"failed to replace repro {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName} marked Stopping");
}
return repro;
} else {
Expand Down
18 changes: 12 additions & 6 deletions src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public async Async.Task<OneFuzzResultVoid> SyncAutoscaleSettings(Scaleset scales
return poolResult.ErrorV;
}

_logTracer.Info($"Updating auto-scale entry for scaleset: {scaleset.ScalesetId}");
_logTracer.Info($"Updating auto-scale entry for scaleset: {scaleset.ScalesetId:Tag:ScalesetId}");
_ = await _context.AutoScaleOperations.Update(
scalesetId: scaleset.ScalesetId,
minAmount: minAmount,
Expand Down Expand Up @@ -188,6 +188,7 @@ public async Async.Task<Scaleset> SetState(Scaleset scaleset, ScalesetState stat
return scaleset;
}

_log.WithTag("Pool", scaleset.PoolName.ToString()).Event($"SetState Scaleset {scaleset.ScalesetId:Tag:ScalesetId} {scaleset.State:Tag:From} - {state:Tag:To}");
var updatedScaleSet = scaleset with { State = state };
var r = await Replace(updatedScaleSet);
if (!r.IsOk) {
Expand Down Expand Up @@ -217,7 +218,7 @@ async Async.Task<Scaleset> SetFailed(Scaleset scaleset, Error error) {

var updatedScaleset = await SetState(scaleset with { Error = error }, ScalesetState.CreationFailed);

await _context.Events.SendEvent(new EventScalesetFailed(scaleset.ScalesetId, scaleset.PoolName, error));
await _context.Events.SendEvent(new EventScalesetFailed(updatedScaleset.ScalesetId, updatedScaleset.PoolName, error));
return updatedScaleset;
}

Expand Down Expand Up @@ -329,7 +330,7 @@ public async Async.Task<Scaleset> Setup(Scaleset scaleset) {
scaleset = result.OkV;
}
} else {
_logTracer.Info($"scaleset running {scaleset.ScalesetId:Tag:ScalesetId}");
_logTracer.Info($"scaleset {scaleset.ScalesetId:Tag:ScalesetId} is in {vmss.ProvisioningState:Tag:ScalesetState}");

var autoScaling = await TryEnableAutoScaling(scaleset);

Expand Down Expand Up @@ -538,7 +539,7 @@ public async Async.Task<Scaleset> Halt(Scaleset scaleset) {
continue;
}

_log.Info($"adding missing azure node {machineId:Tag:MachineId} to scaleset {scaleSet.ScalesetId:Tag:ScalesetId}");
_log.Info($"adding missing azure node {machineId:Tag:MachineId} {scaleSet.ScalesetId:Tag:ScalesetId}");

// Note, using isNew:True makes it such that if a node already has
// checked in, this won't overwrite it.
Expand Down Expand Up @@ -575,6 +576,7 @@ where x.State.ReadyForReset()
} else if (await new ShrinkQueue(pool.OkV!.PoolId, _context.Queue, _log).ShouldShrink()) {
toDelete[node.MachineId] = await _context.NodeOperations.SetHalt(node);
} else {
_logTracer.Info($"Node ready to reimage {node.MachineId:Tag:MachineId} {node.ScalesetId:Tag:ScalesetId} {node.State:Tag:State}");
toReimage[node.MachineId] = node;
}
}
Expand All @@ -590,6 +592,8 @@ where x.State.ReadyForReset()
errorMessage = "node reimaged due to never receiving heartbeat";
}

_log.Info($"{errorMessage} {deadNode.MachineId:Tag:MachineId} {deadNode.ScalesetId:Tag:ScalesetId}");

var error = new Error(ErrorCode.TASK_FAILED, new[] { $"{errorMessage} scaleset_id {deadNode.ScalesetId} last heartbeat:{deadNode.Heartbeat}" });
await _context.NodeOperations.MarkTasksStoppedEarly(deadNode, error);
toReimage[deadNode.MachineId] = await _context.NodeOperations.ToReimage(deadNode, true);
Expand Down Expand Up @@ -661,8 +665,10 @@ await Async.Task.WhenAll(nodes
await Async.Task.WhenAll(nodes
.Where(node => machineIds.Contains(node.MachineId))
.Select(async node => {
await _context.NodeOperations.Delete(node);
await _context.NodeOperations.ReleaseScaleInProtection(node).IgnoreResult();
var r = await _context.NodeOperations.ReleaseScaleInProtection(node);
if (r.IsOk) {
await _context.NodeOperations.Delete(node);
}
}));
} else {
_log.Info($"failed to reimage nodes due to {r.ErrorV:Tag:Error}");
Expand Down
2 changes: 1 addition & 1 deletion src/ApiService/ApiService/onefuzzlib/TaskOperations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ public async Async.Task<Task> SetState(Task task, TaskState state) {
if (task.State == state) {
return task;
}

_logTracer.Event($"SetState Task {task.TaskId:Tag:TaskId} {task.State:Tag:From} - {state:Tag:To}");
if (task.State == TaskState.Running || task.State == TaskState.SettingUp) {
task = await OnStart(task with { State = state });
} else {
Expand Down
Loading