From bb7599b3f8c74f09c04ea0f81e867a07669c558f Mon Sep 17 00:00:00 2001 From: stas Date: Fri, 30 Sep 2022 08:04:03 -0700 Subject: [PATCH 1/9] add more logs --- .../ApiService/Functions/AgentCommands.cs | 1 - .../ApiService/Functions/AgentEvents.cs | 1 - src/ApiService/ApiService/Log.cs | 2 +- .../ApiService/onefuzzlib/NodeOperations.cs | 40 ++++++++++++------- .../ApiService/onefuzzlib/PoolOperations.cs | 1 + .../ApiService/onefuzzlib/ProxyOperations.cs | 2 + .../onefuzzlib/ScalesetOperations.cs | 15 ++++--- .../ApiService/onefuzzlib/TaskOperations.cs | 2 +- .../ApiService/onefuzzlib/VmssOperations.cs | 8 ++-- 9 files changed, 44 insertions(+), 28 deletions(-) diff --git a/src/ApiService/ApiService/Functions/AgentCommands.cs b/src/ApiService/ApiService/Functions/AgentCommands.cs index cabdd6d554..03b854c65b 100644 --- a/src/ApiService/ApiService/Functions/AgentCommands.cs +++ b/src/ApiService/ApiService/Functions/AgentCommands.cs @@ -38,7 +38,6 @@ private async Async.Task Get(HttpRequestData req) { var envelope = new NodeCommandEnvelope(command, messageId); return await RequestHandling.Ok(req, new PendingNodeCommand(envelope)); } else { - _log.WithTag("HttpRequest", "GET").Verbose($"failed to find {nodeCommand.MachineId:Tag:MachineId}"); return await RequestHandling.Ok(req, new PendingNodeCommand(null)); } } diff --git a/src/ApiService/ApiService/Functions/AgentEvents.cs b/src/ApiService/ApiService/Functions/AgentEvents.cs index 3807f383ae..f3ccfc1deb 100644 --- a/src/ApiService/ApiService/Functions/AgentEvents.cs +++ b/src/ApiService/ApiService/Functions/AgentEvents.cs @@ -103,7 +103,6 @@ private async Async.Task Post(HttpRequestData req) { return null; } - _log.Info($"node state update: {machineId:Tag:MachineId} from {node.State:Tag:FromState} to {ev.State:Tag:ToState}"); node = await _context.NodeOperations.SetState(node, ev.State); if (ev.State == NodeState.Free) { diff --git a/src/ApiService/ApiService/Log.cs b/src/ApiService/ApiService/Log.cs index e20abf4b4a..bacc5e3f58 100644 --- a/src/ApiService/ApiService/Log.cs +++ b/src/ApiService/ApiService/Log.cs @@ -179,7 +179,7 @@ public interface ILogTracer { void Error(LogStringHandler message); void Error(Error error); - void Event(LogStringHandler evt, IReadOnlyDictionary? metrics); + void Event(LogStringHandler evt, IReadOnlyDictionary? metrics = null); void Exception(Exception ex, LogStringHandler message = $"", IReadOnlyDictionary? metrics = null); void ForceFlush(); void Info(LogStringHandler message); diff --git a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs index 84c2bff66c..99ed8e983b 100644 --- a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs @@ -84,7 +84,7 @@ public NodeOperations(ILogTracer log, IOnefuzzContext context) public async Task AcquireScaleInProtection(Node node) { if (node.ScalesetId is Guid scalesetId && await ScalesetNodeExists(node)) { - _logTracer.Info($"Setting scale-in protection on node {node.MachineId}"); + _logTracer.Info($"Setting scale-in protection on node {node.MachineId:Tag:MachineId}"); return await _context.VmssOperations.UpdateScaleInProtection(scalesetId, node.MachineId, protectFromScaleIn: true); } @@ -95,7 +95,7 @@ public async Task ReleaseScaleInProtection(Node node) { if (!node.DebugKeepNode && node.ScalesetId is Guid scalesetId && await ScalesetNodeExists(node)) { - _logTracer.Info($"Removing scale-in protection on node {node.MachineId}"); + _logTracer.Info($"Removing scale-in protection on node {node.MachineId:Tag:MachineId}"); return await _context.VmssOperations.UpdateScaleInProtection(scalesetId, node.MachineId, protectFromScaleIn: false); } @@ -119,41 +119,41 @@ public async Async.Task ScalesetNodeExists(Node node) { public async Task CanProcessNewWork(Node node) { if (IsOutdated(node) && _context.ServiceConfiguration.OneFuzzAllowOutdatedAgent != "true") { - _logTracer.Info($"can_process_new_work agent and service versions differ, stopping node. machine_id:{node.MachineId} agent_version:{node.Version} service_version:{_context.ServiceConfiguration.OneFuzzVersion}"); + _logTracer.Info($"can_process_new_work agent and service versions differ, stopping node. {node.MachineId:Tag:MachineId} {node.Version:Tag:AgentVersion} {_context.ServiceConfiguration.OneFuzzVersion:Tag:ServiceVersion}"); _ = await Stop(node, done: true); return false; } if (IsTooOld(node)) { - _logTracer.Info($"can_process_new_work node is too old. machine_id:{node.MachineId}"); + _logTracer.Info($"can_process_new_work node is too old {node.MachineId:Tag:MachineId}"); _ = await Stop(node, done: true); return false; } if (!node.State.CanProcessNewWork()) { - _logTracer.Info($"can_process_new_work node not in appropriate state for new work machine_id:{node.MachineId} state:{node.State}"); + _logTracer.Info($"can_process_new_work node not in appropriate state for new work {node.MachineId:Tag:MachineId} {node.State:Tag:State}"); return false; } if (node.State.ReadyForReset()) { - _logTracer.Info($"can_process_new_work node is set for reset. machine_id:{node.MachineId}"); + _logTracer.Info($"can_process_new_work node is set for reset {node.MachineId:Tag:MachineId}"); return false; } if (node.DeleteRequested) { - _logTracer.Info($"can_process_new_work is set to be deleted. machine_id:{node.MachineId}"); + _logTracer.Info($"can_process_new_work is set to be deleted {node.MachineId:Tag:MachineId}"); _ = await Stop(node, done: true); return false; } if (node.ReimageRequested) { - _logTracer.Info($"can_process_new_work is set to be reimaged. machine_id:{node.MachineId}"); + _logTracer.Info($"can_process_new_work is set to be reimaged {node.MachineId:Tag:MachineId}"); _ = await Stop(node, done: true); return false; } if (await CouldShrinkScaleset(node)) { - _logTracer.Info($"can_process_new_work node scheduled to shrink. machine_id:{node.MachineId}"); + _logTracer.Info($"can_process_new_work node scheduled to shrink {node.MachineId:Tag:MachineId}"); _ = await SetHalt(node); return false; } @@ -161,26 +161,26 @@ public async Task CanProcessNewWork(Node node) { if (node.ScalesetId != null) { var scalesetResult = await _context.ScalesetOperations.GetById(node.ScalesetId.Value); if (!scalesetResult.IsOk) { - _logTracer.Info($"can_process_new_work invalid scaleset. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}"); + _logTracer.Info($"can_process_new_work invalid scaleset {node.ScalesetId:Tag:ScalesetId} - {node.MachineId:Tag:MachineId}"); return false; } var scaleset = scalesetResult.OkV; if (!scaleset.State.IsAvailable()) { - _logTracer.Info($"can_process_new_work scaleset not available for work. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}"); + _logTracer.Info($"can_process_new_work scaleset not available for work {scaleset.ScalesetId:Tag:ScalesetId} - {node.MachineId:Tag:MachineId} {scaleset.State:Tag:State}"); return false; } } var poolResult = await _context.PoolOperations.GetByName(node.PoolName); if (!poolResult.IsOk) { - _logTracer.Info($"can_schedule - invalid pool. pool_name:{node.PoolName} machine_id:{node.MachineId}"); + _logTracer.Info($"can_schedule - invalid pool {node.PoolName:Tag:PoolName} - {node.MachineId:Tag:MachineId}"); return false; } var pool = poolResult.OkV; if (!PoolStateHelper.Available.Contains(pool.State)) { - _logTracer.Info($"can_schedule - pool is not available for work. pool_name:{node.PoolName} machine_id:{node.MachineId}"); + _logTracer.Info($"can_schedule - pool is not available for work {node.PoolName:Tag:PoolName} - {node.MachineId:Tag:MachineId}"); return false; } @@ -213,7 +213,7 @@ public async Async.Task MarkOutdatedNodes() { var outdated = SearchOutdated(excludeUpdateScheduled: true); await foreach (var node in outdated) { - _logTracer.Info($"node is outdated: {node.MachineId} - node_version:{node.Version} api_version:"); + _logTracer.Info($"node is outdated: {node.MachineId:Tag:MachineId} - {node.Version:Tag:NodeVersion}"); if (node.Version == "1.0.0") { _ = await ToReimage(node, done: true); @@ -313,7 +313,7 @@ public async Async.Task ToReimage(Node node, bool done = false) { var r = await Replace(updatedNode); if (!r.IsOk) { - _logTracer.WithHttpStatus(r.ErrorV).Error($"Failed to save Node record for node {node.MachineId:Tag:MachineId}"); + _logTracer.WithHttpStatus(r.ErrorV).Error($"Failed to save Node record for node {updatedNode.MachineId:Tag:MachineId}"); } return updatedNode; @@ -345,6 +345,15 @@ public IAsyncEnumerable GetDeadNodes(Guid scaleSetId, TimeSpan expirationP } catch (RequestFailedException ex) when ( ex.Status == (int)HttpStatusCode.Conflict || ex.ErrorCode == "EntityAlreadyExists") { + + var existingNode = await QueryAsync(Query.SingleEntity(poolName.ToString(), machineId.ToString())).FirstOrDefaultAsync(); + if (existingNode is not null) { + if (existingNode != node) { + _logTracer.Error($"Found existing node record {existingNode:Tag:ExistingNode} when inserting a new node {node:Tag:NewNode}"); + } + } else { + _logTracer.Critical($"Failed to get node when node insertion returned EntityAlreadyExists {poolName.ToString():Tag:PoolName} {machineId:Tag:MachineId}"); + } return null; } } else { @@ -451,6 +460,7 @@ public async Async.Task SetState(Node node, NodeState state) { if (node.State == state) { return node; } + _logTracer.Event($"SetState Node {node.MachineId:Tag:MachineId} {node.State:Tag:From} - {state:Tag:To}"); node = node with { State = state }; await _context.Events.SendEvent(new EventNodeStateUpdated( diff --git a/src/ApiService/ApiService/onefuzzlib/PoolOperations.cs b/src/ApiService/ApiService/onefuzzlib/PoolOperations.cs index 3849b9d539..27867ab470 100644 --- a/src/ApiService/ApiService/onefuzzlib/PoolOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/PoolOperations.cs @@ -142,6 +142,7 @@ public async Async.Task SetState(Pool pool, PoolState state) { return pool; } + _logTracer.WithTag("PoolName", pool.Name.ToString()).Event($"SetState Pool {pool.PoolId:Tag:PoolId} {pool.State:Tag:From} - {state:Tag:To}"); // scalesets should never leave the `halt` state // it is terminal if (pool.State == PoolState.Halt) { diff --git a/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs b/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs index 0243034bcc..7e8da069bd 100644 --- a/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs @@ -140,6 +140,8 @@ public async Async.Task SetState(Proxy proxy, VmState state) { return proxy; } + _logTracer.Event($"ProxySetState {proxy.ProxyId:Tag:ProxyId} {proxy.State:Tag:From} - {state:Tag:To}"); + var newProxy = proxy with { State = state }; var r = await Replace(newProxy); if (!r.IsOk) { diff --git a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs index 41e9e91f27..a31e4a55dc 100644 --- a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs @@ -130,7 +130,7 @@ public async Async.Task SyncAutoscaleSettings(Scaleset scales return poolResult.ErrorV; } - _logTracer.Info($"Updating auto-scale entry for scaleset: {scaleset.ScalesetId}"); + _logTracer.Info($"Updating auto-scale entry for scaleset: {scaleset.ScalesetId:Tag:ScalesetId}"); _ = await _context.AutoScaleOperations.Update( scalesetId: scaleset.ScalesetId, minAmount: minAmount, @@ -188,6 +188,7 @@ public async Async.Task SetState(Scaleset scaleset, ScalesetState stat return scaleset; } + _log.WithTag("Pool", scaleset.PoolName.ToString()).Event($"SetState Scaleset {scaleset.ScalesetId:Tag:ScalesetId} {scaleset.State:Tag:From} - {state:Tag:To}"); var updatedScaleSet = scaleset with { State = state }; var r = await Replace(updatedScaleSet); if (!r.IsOk) { @@ -217,7 +218,7 @@ async Async.Task SetFailed(Scaleset scaleset, Error error) { var updatedScaleset = await SetState(scaleset with { Error = error }, ScalesetState.CreationFailed); - await _context.Events.SendEvent(new EventScalesetFailed(scaleset.ScalesetId, scaleset.PoolName, error)); + await _context.Events.SendEvent(new EventScalesetFailed(updatedScaleset.ScalesetId, updatedScaleset.PoolName, error)); return updatedScaleset; } @@ -329,7 +330,7 @@ public async Async.Task Setup(Scaleset scaleset) { scaleset = result.OkV; } } else { - _logTracer.Info($"scaleset running {scaleset.ScalesetId:Tag:ScalesetId}"); + _logTracer.Info($"scaleset {scaleset.ScalesetId:Tag:ScalesetId} is in {vmss.ProvisioningState:Tag:ScalesetState}"); var autoScaling = await TryEnableAutoScaling(scaleset); @@ -651,7 +652,7 @@ public async Async.Task ReimageNodes(Scaleset scaleset, IEnumerable nodes, await Async.Task.WhenAll(nodes .Where(node => machineIds.Contains(node.MachineId)) .Select(async node => { - await _context.NodeOperations.ReleaseScaleInProtection(node).IgnoreResult(); + _ = await _context.NodeOperations.ReleaseScaleInProtection(node); })); return; @@ -661,8 +662,10 @@ await Async.Task.WhenAll(nodes await Async.Task.WhenAll(nodes .Where(node => machineIds.Contains(node.MachineId)) .Select(async node => { - await _context.NodeOperations.Delete(node); - await _context.NodeOperations.ReleaseScaleInProtection(node).IgnoreResult(); + var r = await _context.NodeOperations.ReleaseScaleInProtection(node); + if (r.IsOk) { + await _context.NodeOperations.Delete(node); + } })); } else { _log.Info($"failed to reimage nodes due to {r.ErrorV:Tag:Error}"); diff --git a/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs b/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs index c31ea4f785..af2fd414fd 100644 --- a/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs @@ -142,7 +142,7 @@ public async Async.Task SetState(Task task, TaskState state) { if (task.State == state) { return task; } - + _logTracer.Event($"TaskSetState {task.TaskId:Tag:TaskId} {task.State:Tag:From} - {state:Tag:To}"); if (task.State == TaskState.Running || task.State == TaskState.SettingUp) { task = await OnStart(task with { State = state }); } else { diff --git a/src/ApiService/ApiService/onefuzzlib/VmssOperations.cs b/src/ApiService/ApiService/onefuzzlib/VmssOperations.cs index 8e75d7d457..b93496eb98 100644 --- a/src/ApiService/ApiService/onefuzzlib/VmssOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/VmssOperations.cs @@ -254,9 +254,11 @@ public async Async.Task UpdateScaleInProtection(Guid name, Gu try { _ = await vmCollection.CreateOrUpdateAsync(WaitUntil.Started, instanceVm.Data.InstanceId, instanceVm.Data); return OneFuzzResultVoid.Ok; - } catch { - var msg = $"unable to set protection policy on: {vmId}:{instanceVm.Id} in vmss {name}"; - return OneFuzzResultVoid.Error(ErrorCode.UNABLE_TO_UPDATE, msg); + } catch (RequestFailedException ex) when (ex.Status == 409 && ex.Message.StartsWith("The request failed due to conflict with a concurrent request")) { + return OneFuzzResultVoid.Error(ErrorCode.UNABLE_TO_UPDATE, $"protection policy update is already in progress: {vmId}:{instanceVm.Id} in vmss {name}"); + } catch (Exception ex) { + _log.Exception(ex, $"unable to set protection policy on: {vmId:Tag:MachineId}:{instanceVm.Id:Tag:InstanceId} in vmss {name:Tag:ScalesetId}"); + return OneFuzzResultVoid.Error(ErrorCode.UNABLE_TO_UPDATE, $"unable to set protection policy on: {vmId}:{instanceVm.Id} in vmss {name}"); } } else { _log.Info($"scale in protection was already set to {protectFromScaleIn:Tag:ProtectFromScaleIn} on vm {vmId:Tag:VmId} for scaleset {name:Tag:VmssName}"); From 7fa4ac361825fa11d65710292c2c0ef3d5744839 Mon Sep 17 00:00:00 2001 From: stas Date: Sat, 1 Oct 2022 09:50:00 -0700 Subject: [PATCH 2/9] bug fix --- .../ApiService/onefuzzlib/NodeOperations.cs | 22 ++++------------ .../ApiService/onefuzzlib/ProxyOperations.cs | 2 +- .../ApiService/onefuzzlib/TaskOperations.cs | 2 +- src/ApiService/Tests/QueryTest.cs | 26 +++++++------------ 4 files changed, 17 insertions(+), 35 deletions(-) diff --git a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs index 99ed8e983b..e9c742f7f9 100644 --- a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs @@ -348,13 +348,14 @@ public IAsyncEnumerable GetDeadNodes(Guid scaleSetId, TimeSpan expirationP var existingNode = await QueryAsync(Query.SingleEntity(poolName.ToString(), machineId.ToString())).FirstOrDefaultAsync(); if (existingNode is not null) { - if (existingNode != node) { - _logTracer.Error($"Found existing node record {existingNode:Tag:ExistingNode} when inserting a new node {node:Tag:NewNode}"); + if (existingNode.State != node.State || existingNode.ReimageRequested != node.ReimageRequested || existingNode.Version != node.Version || existingNode.DeleteRequested != node.DeleteRequested) { + _logTracer.Error($"Not replacing {existingNode:Tag:ExistingNode} with a new-and-different {node:Tag:Node}"); } + return null; } else { _logTracer.Critical($"Failed to get node when node insertion returned EntityAlreadyExists {poolName.ToString():Tag:PoolName} {machineId:Tag:MachineId}"); + r = await Replace(node); } - return null; } } else { r = await Replace(node); @@ -480,12 +481,10 @@ await _context.Events.SendEvent(new EventNodeStateUpdated( } public static string SearchStatesQuery( - string oneFuzzVersion, Guid? poolId = default, Guid? scaleSetId = default, IEnumerable? states = default, PoolName? poolName = default, - bool excludeUpdateScheduled = false, int? numResults = default) { List queryParts = new(); @@ -507,17 +506,6 @@ public static string SearchStatesQuery( queryParts.Add($"({q})"); } - if (excludeUpdateScheduled) { - queryParts.Add($"reimage_requested eq false"); - queryParts.Add($"delete_requested eq false"); - } - - //# azure table query always return false when the column does not exist - //# We write the query this way to allow us to get the nodes where the - //# version is not defined as well as the nodes with a mismatched version - var versionQuery = TableClient.CreateQueryFilter($"not (version eq {oneFuzzVersion})"); - queryParts.Add(versionQuery); - return Query.And(queryParts); } @@ -529,7 +517,7 @@ public IAsyncEnumerable SearchStates( PoolName? poolName = default, bool excludeUpdateScheduled = false, int? numResults = default) { - var query = NodeOperations.SearchStatesQuery(_context.ServiceConfiguration.OneFuzzVersion, poolId, scalesetId, states, poolName, excludeUpdateScheduled, numResults); + var query = NodeOperations.SearchStatesQuery(poolId, scalesetId, states, poolName, numResults); if (numResults is null) { return QueryAsync(query); diff --git a/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs b/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs index 7e8da069bd..a859b5b2c8 100644 --- a/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ProxyOperations.cs @@ -140,7 +140,7 @@ public async Async.Task SetState(Proxy proxy, VmState state) { return proxy; } - _logTracer.Event($"ProxySetState {proxy.ProxyId:Tag:ProxyId} {proxy.State:Tag:From} - {state:Tag:To}"); + _logTracer.Event($"SetState Proxy {proxy.ProxyId:Tag:ProxyId} {proxy.State:Tag:From} - {state:Tag:To}"); var newProxy = proxy with { State = state }; var r = await Replace(newProxy); diff --git a/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs b/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs index af2fd414fd..0a836bb6b5 100644 --- a/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/TaskOperations.cs @@ -142,7 +142,7 @@ public async Async.Task SetState(Task task, TaskState state) { if (task.State == state) { return task; } - _logTracer.Event($"TaskSetState {task.TaskId:Tag:TaskId} {task.State:Tag:From} - {state:Tag:To}"); + _logTracer.Event($"SetState Task {task.TaskId:Tag:TaskId} {task.State:Tag:From} - {state:Tag:To}"); if (task.State == TaskState.Running || task.State == TaskState.SettingUp) { task = await OnStart(task with { State = state }); } else { diff --git a/src/ApiService/Tests/QueryTest.cs b/src/ApiService/Tests/QueryTest.cs index 030dfbea91..e8af377a37 100644 --- a/src/ApiService/Tests/QueryTest.cs +++ b/src/ApiService/Tests/QueryTest.cs @@ -10,30 +10,24 @@ public class QueryTests { [Fact] public void NodeOperationsSearchStatesQuery() { - var ver = "1.2.3"; - var query1 = NodeOperations.SearchStatesQuery(ver); - Assert.Equal("(not (version eq '1.2.3'))", query1); + var query1 = NodeOperations.SearchStatesQuery(); + Assert.Equal("", query1); - var query2 = NodeOperations.SearchStatesQuery(ver, poolId: Guid.Parse("3b0426d3-9bde-4ae8-89ac-4edf0d3b3618")); - Assert.Equal("((pool_id eq '3b0426d3-9bde-4ae8-89ac-4edf0d3b3618')) and (not (version eq '1.2.3'))", query2); + var query2 = NodeOperations.SearchStatesQuery(poolId: Guid.Parse("3b0426d3-9bde-4ae8-89ac-4edf0d3b3618")); + Assert.Equal("((pool_id eq '3b0426d3-9bde-4ae8-89ac-4edf0d3b3618'))", query2); - var query3 = NodeOperations.SearchStatesQuery(ver, scaleSetId: Guid.Parse("4c96dd6b-9bdb-4758-9720-1010c244fa4b")); - Assert.Equal("((scaleset_id eq '4c96dd6b-9bdb-4758-9720-1010c244fa4b')) and (not (version eq '1.2.3'))", query3); + var query3 = NodeOperations.SearchStatesQuery(scaleSetId: Guid.Parse("4c96dd6b-9bdb-4758-9720-1010c244fa4b")); + Assert.Equal("((scaleset_id eq '4c96dd6b-9bdb-4758-9720-1010c244fa4b'))", query3); - var query4 = NodeOperations.SearchStatesQuery(ver, states: new[] { NodeState.Free, NodeState.Done, NodeState.Ready }); - Assert.Equal("(((state eq 'free') or (state eq 'done') or (state eq 'ready'))) and (not (version eq '1.2.3'))", query4); - - var query5 = NodeOperations.SearchStatesQuery(ver, excludeUpdateScheduled: true); - Assert.Equal("(reimage_requested eq false) and (delete_requested eq false) and (not (version eq '1.2.3'))", query5); + var query4 = NodeOperations.SearchStatesQuery(states: new[] { NodeState.Free, NodeState.Done, NodeState.Ready }); + Assert.Equal("(((state eq 'free') or (state eq 'done') or (state eq 'ready')))", query4); var query7 = NodeOperations.SearchStatesQuery( - ver, poolId: Guid.Parse("3b0426d3-9bde-4ae8-89ac-4edf0d3b3618"), scaleSetId: Guid.Parse("4c96dd6b-9bdb-4758-9720-1010c244fa4b"), - states: new[] { NodeState.Free, NodeState.Done, NodeState.Ready }, - excludeUpdateScheduled: true); - Assert.Equal("((pool_id eq '3b0426d3-9bde-4ae8-89ac-4edf0d3b3618')) and ((scaleset_id eq '4c96dd6b-9bdb-4758-9720-1010c244fa4b')) and (((state eq 'free') or (state eq 'done') or (state eq 'ready'))) and (reimage_requested eq false) and (delete_requested eq false) and (not (version eq '1.2.3'))", query7); + states: new[] { NodeState.Free, NodeState.Done, NodeState.Ready }); + Assert.Equal("((pool_id eq '3b0426d3-9bde-4ae8-89ac-4edf0d3b3618')) and ((scaleset_id eq '4c96dd6b-9bdb-4758-9720-1010c244fa4b')) and (((state eq 'free') or (state eq 'done') or (state eq 'ready')))", query7); } [Fact] From 3ac78a43171e24182c325e25e6b7255aaff2049f Mon Sep 17 00:00:00 2001 From: stas Date: Sat, 1 Oct 2022 14:28:29 -0700 Subject: [PATCH 3/9] more logs --- src/ApiService/ApiService/onefuzzlib/ReproOperations.cs | 6 +++--- src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs b/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs index 07cadb5cbf..216381db96 100644 --- a/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs @@ -90,15 +90,15 @@ public async Async.Task Stopping(Repro repro) { var vm = await GetVm(repro, config); var vmOperations = _context.VmOperations; if (!await vmOperations.IsDeleted(vm)) { - _logTracer.Info($"vm stopping: {repro.VmId}"); + _logTracer.Info($"vm stopping: {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName}"); var rr = await vmOperations.Delete(vm); if (!rr) { - _logTracer.Error($"failed to delete repro {repro.VmId:Tag:VmId}"); + _logTracer.Error($"failed to delete repro {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName}"); } repro = repro with { State = VmState.Stopping }; var r = await Replace(repro); if (!r.IsOk) { - _logTracer.WithHttpStatus(r.ErrorV).Error($"failed to replace repro {repro.VmId:Tag:VmId} marked Stopping"); + _logTracer.WithHttpStatus(r.ErrorV).Error($"failed to replace repro {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName} marked Stopping"); } return repro; } else { diff --git a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs index a31e4a55dc..7fa81da515 100644 --- a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs @@ -539,7 +539,7 @@ public async Async.Task Halt(Scaleset scaleset) { continue; } - _log.Info($"adding missing azure node {machineId:Tag:MachineId} to scaleset {scaleSet.ScalesetId:Tag:ScalesetId}"); + _log.Info($"adding missing azure node {machineId:Tag:MachineId} {scaleSet.ScalesetId:Tag:ScalesetId}"); // Note, using isNew:True makes it such that if a node already has // checked in, this won't overwrite it. @@ -576,6 +576,7 @@ where x.State.ReadyForReset() } else if (await new ShrinkQueue(pool.OkV!.PoolId, _context.Queue, _log).ShouldShrink()) { toDelete[node.MachineId] = await _context.NodeOperations.SetHalt(node); } else { + _logTracer.Info($"Node ready to reimage {node.MachineId:Tag:MachineId} {node.ScalesetId:Tag:ScalesetId} {node.State:Tag:State}"); toReimage[node.MachineId] = node; } } @@ -591,6 +592,8 @@ where x.State.ReadyForReset() errorMessage = "node reimaged due to never receiving heartbeat"; } + _log.Info($"{errorMessage} {deadNode.MachineId:Tag:MachineId} {deadNode.ScalesetId:Tag:ScalesetId}"); + var error = new Error(ErrorCode.TASK_FAILED, new[] { $"{errorMessage} scaleset_id {deadNode.ScalesetId} last heartbeat:{deadNode.Heartbeat}" }); await _context.NodeOperations.MarkTasksStoppedEarly(deadNode, error); toReimage[deadNode.MachineId] = await _context.NodeOperations.ToReimage(deadNode, true); From 9e37152e1bb57adf177aec6597077e9408dea9e7 Mon Sep 17 00:00:00 2001 From: stas Date: Sat, 1 Oct 2022 16:27:50 -0700 Subject: [PATCH 4/9] another fix --- src/ApiService/ApiService/OneFuzzTypes/Enums.cs | 1 + src/ApiService/ApiService/OneFuzzTypes/Model.cs | 2 +- src/ApiService/ApiService/onefuzzlib/NodeOperations.cs | 10 ++++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ApiService/ApiService/OneFuzzTypes/Enums.cs b/src/ApiService/ApiService/OneFuzzTypes/Enums.cs index 0d05ba0150..f2919dab5c 100644 --- a/src/ApiService/ApiService/OneFuzzTypes/Enums.cs +++ b/src/ApiService/ApiService/OneFuzzTypes/Enums.cs @@ -294,6 +294,7 @@ public enum AgentMode { } public enum NodeState { + New, Init, Free, SettingUp, diff --git a/src/ApiService/ApiService/OneFuzzTypes/Model.cs b/src/ApiService/ApiService/OneFuzzTypes/Model.cs index a1104d6781..78b71a79a0 100644 --- a/src/ApiService/ApiService/OneFuzzTypes/Model.cs +++ b/src/ApiService/ApiService/OneFuzzTypes/Model.cs @@ -98,7 +98,7 @@ public record Node string Version, DateTimeOffset? Heartbeat = null, DateTimeOffset? InitializedAt = null, - NodeState State = NodeState.Init, + NodeState State = NodeState.New, Guid? ScalesetId = null, bool ReimageRequested = false, diff --git a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs index e9c742f7f9..e3f2c9cea4 100644 --- a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs @@ -59,6 +59,7 @@ IAsyncEnumerable SearchStates(Guid? poolId = default, Async.Task SetShutdown(Node node); // state transitions: + Async.Task New(Node node); Async.Task Init(Node node); Async.Task Free(Node node); Async.Task SettingUp(Node node); @@ -303,7 +304,7 @@ public async Async.Task ToReimage(Node node, bool done = false) { var reimageRequested = node.ReimageRequested; if (!node.ReimageRequested && !node.DeleteRequested) { - _logTracer.Info($"setting reimage_requested: {node.MachineId:Tag:MachineId}"); + _logTracer.Info($"setting reimage_requested: {node.MachineId:Tag:MachineId} {node.ScalesetId:Tag:ScalesetId}"); reimageRequested = true; } @@ -313,7 +314,7 @@ public async Async.Task ToReimage(Node node, bool done = false) { var r = await Replace(updatedNode); if (!r.IsOk) { - _logTracer.WithHttpStatus(r.ErrorV).Error($"Failed to save Node record for node {updatedNode.MachineId:Tag:MachineId}"); + _logTracer.WithHttpStatus(r.ErrorV).Error($"Failed to save Node record for node {updatedNode.MachineId:Tag:MachineId} {node.ScalesetId:Tag:ScalesetId}"); } return updatedNode; @@ -610,6 +611,11 @@ private async Task StopIfComplete(Node node, bool done = false) { return true; } + public Task New(Node node) { + // nothing to do + return Async.Task.FromResult(node); + } + public Task Init(Node node) { // nothing to do return Async.Task.FromResult(node); From 975295fcec0bd88e92c2f9b0efd0f5883a9cf4ad Mon Sep 17 00:00:00 2001 From: stas Date: Sat, 1 Oct 2022 16:41:12 -0700 Subject: [PATCH 5/9] fix integration tests --- src/ApiService/IntegrationTests/AgentEventsTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ApiService/IntegrationTests/AgentEventsTests.cs b/src/ApiService/IntegrationTests/AgentEventsTests.cs index e07fa1c5d0..811d0753cc 100644 --- a/src/ApiService/IntegrationTests/AgentEventsTests.cs +++ b/src/ApiService/IntegrationTests/AgentEventsTests.cs @@ -279,7 +279,7 @@ await Async.Task.WhenAll( Async.Task.Run(async () => { // should still be in init state: var node = await Context.NodeOperations.SearchAll().SingleAsync(); - Assert.Equal(NodeState.Init, node.State); + Assert.Equal(NodeState.New, node.State); }), Async.Task.Run(async () => { // the node should be told to stop: @@ -308,7 +308,7 @@ await Async.Task.WhenAll( Async.Task.Run(async () => { // the node should still be in init state: var node = await Context.NodeOperations.SearchAll().SingleAsync(); - Assert.Equal(NodeState.Init, node.State); + Assert.Equal(NodeState.New, node.State); }), Async.Task.Run(async () => { // the node should be told to stop: From 743c205f98d37119aa3d3527dc43bd73afe8c981 Mon Sep 17 00:00:00 2001 From: stas Date: Sat, 1 Oct 2022 17:33:21 -0700 Subject: [PATCH 6/9] do not log error when vm deletion is in progress --- src/ApiService/ApiService/Functions/AgentEvents.cs | 2 ++ src/ApiService/ApiService/onefuzzlib/ReproOperations.cs | 4 ++-- src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ApiService/ApiService/Functions/AgentEvents.cs b/src/ApiService/ApiService/Functions/AgentEvents.cs index f3ccfc1deb..88d62ba4a3 100644 --- a/src/ApiService/ApiService/Functions/AgentEvents.cs +++ b/src/ApiService/ApiService/Functions/AgentEvents.cs @@ -97,6 +97,8 @@ private async Async.Task Post(HttpRequestData req) { // Don’t check reimage_requested, as nodes only send 'init' state once. If // they send 'init' with reimage_requested, it's because the node was reimaged // successfully. + // Node has to be in state other than Init, otherwise SetState ignores the change, and therefore + // InitializedAt field is never set node = node with { ReimageRequested = false, InitializedAt = DateTimeOffset.UtcNow }; // discard result: node not used after this point _ = await _context.NodeOperations.SetState(node, ev.State); diff --git a/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs b/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs index 216381db96..76b0e76f68 100644 --- a/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ReproOperations.cs @@ -92,8 +92,8 @@ public async Async.Task Stopping(Repro repro) { if (!await vmOperations.IsDeleted(vm)) { _logTracer.Info($"vm stopping: {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName}"); var rr = await vmOperations.Delete(vm); - if (!rr) { - _logTracer.Error($"failed to delete repro {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName}"); + if (rr) { + _logTracer.Info($"repro vm fully deleted {repro.VmId:Tag:VmId} {vm.Name:Tag:VmName}"); } repro = repro with { State = VmState.Stopping }; var r = await Replace(repro); diff --git a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs index 7fa81da515..9238cbe083 100644 --- a/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/ScalesetOperations.cs @@ -655,7 +655,7 @@ public async Async.Task ReimageNodes(Scaleset scaleset, IEnumerable nodes, await Async.Task.WhenAll(nodes .Where(node => machineIds.Contains(node.MachineId)) .Select(async node => { - _ = await _context.NodeOperations.ReleaseScaleInProtection(node); + await _context.NodeOperations.ReleaseScaleInProtection(node).IgnoreResult(); })); return; From 1b2f15481ebef3d7f3b2a9ff4c5affb0364d30a4 Mon Sep 17 00:00:00 2001 From: stas Date: Mon, 3 Oct 2022 08:49:11 -0700 Subject: [PATCH 7/9] addressing comments --- src/ApiService/IntegrationTests/AgentEventsTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ApiService/IntegrationTests/AgentEventsTests.cs b/src/ApiService/IntegrationTests/AgentEventsTests.cs index 811d0753cc..d8b5dd0b74 100644 --- a/src/ApiService/IntegrationTests/AgentEventsTests.cs +++ b/src/ApiService/IntegrationTests/AgentEventsTests.cs @@ -277,7 +277,7 @@ await Context.InsertAll( await Async.Task.WhenAll( Async.Task.Run(async () => { - // should still be in init state: + // should still be in new state: var node = await Context.NodeOperations.SearchAll().SingleAsync(); Assert.Equal(NodeState.New, node.State); }), From d6602940e9221994c107df8afd4ac67482823de7 Mon Sep 17 00:00:00 2001 From: stas Date: Mon, 3 Oct 2022 08:57:10 -0700 Subject: [PATCH 8/9] . --- .../ApiService/onefuzzlib/NodeOperations.cs | 29 +++++++++---------- .../IntegrationTests/AgentEventsTests.cs | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs index e3f2c9cea4..e02636fdcd 100644 --- a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs @@ -194,9 +194,9 @@ public async Task CanProcessNewWork(Node node) { /// reasonably up-to-date with OS patches without disrupting running /// fuzzing tasks with patch reboot cycles. public async Async.Task ReimageLongLivedNodes(Guid scaleSetId) { - var timeFilter = $"not (initialized_at ge datetime'{(DateTimeOffset.UtcNow - INodeOperations.NODE_REIMAGE_TIME).ToString("o")}')"; - - await foreach (var node in QueryAsync($"(scaleset_id eq '{scaleSetId}') and {timeFilter}")) { + var timeFilter = Query.OlderThan("initialized_at", DateTimeOffset.UtcNow - INodeOperations.NODE_REIMAGE_TIME); + //force ToString(), since all GUIDs are strings in the table + await foreach (var node in QueryAsync(Query.And(TableClient.CreateQueryFilter($"scaleset_id eq {scaleSetId.ToString()}"), timeFilter))) { if (node.DebugKeepNode) { _logTracer.Info($"removing debug_keep_node for expired node. scaleset_id:{node.ScalesetId} machine_id:{node.MachineId}"); } @@ -459,18 +459,17 @@ public async Task CouldShrinkScaleset(Node node) { } public async Async.Task SetState(Node node, NodeState state) { - if (node.State == state) { - return node; - } - _logTracer.Event($"SetState Node {node.MachineId:Tag:MachineId} {node.State:Tag:From} - {state:Tag:To}"); - - node = node with { State = state }; - await _context.Events.SendEvent(new EventNodeStateUpdated( - node.MachineId, - node.ScalesetId, - node.PoolName, - state - )); + if (node.State != state) { + _logTracer.Event($"SetState Node {node.MachineId:Tag:MachineId} {node.State:Tag:From} - {state:Tag:To}"); + + node = node with { State = state }; + await _context.Events.SendEvent(new EventNodeStateUpdated( + node.MachineId, + node.ScalesetId, + node.PoolName, + state + )); + } var r = await Update(node); if (!r.IsOk) { diff --git a/src/ApiService/IntegrationTests/AgentEventsTests.cs b/src/ApiService/IntegrationTests/AgentEventsTests.cs index d8b5dd0b74..811d0753cc 100644 --- a/src/ApiService/IntegrationTests/AgentEventsTests.cs +++ b/src/ApiService/IntegrationTests/AgentEventsTests.cs @@ -277,7 +277,7 @@ await Context.InsertAll( await Async.Task.WhenAll( Async.Task.Run(async () => { - // should still be in new state: + // should still be in init state: var node = await Context.NodeOperations.SearchAll().SingleAsync(); Assert.Equal(NodeState.New, node.State); }), From eb00b2ac720e8c3a9c1118f8c706f556c27f8d6f Mon Sep 17 00:00:00 2001 From: stas Date: Mon, 3 Oct 2022 08:59:50 -0700 Subject: [PATCH 9/9] .. --- src/ApiService/ApiService/Functions/AgentEvents.cs | 2 -- src/ApiService/ApiService/OneFuzzTypes/Enums.cs | 1 - src/ApiService/ApiService/OneFuzzTypes/Model.cs | 2 +- src/ApiService/ApiService/onefuzzlib/NodeOperations.cs | 6 ------ src/ApiService/IntegrationTests/AgentEventsTests.cs | 4 ++-- 5 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/ApiService/ApiService/Functions/AgentEvents.cs b/src/ApiService/ApiService/Functions/AgentEvents.cs index 88d62ba4a3..f3ccfc1deb 100644 --- a/src/ApiService/ApiService/Functions/AgentEvents.cs +++ b/src/ApiService/ApiService/Functions/AgentEvents.cs @@ -97,8 +97,6 @@ private async Async.Task Post(HttpRequestData req) { // Don’t check reimage_requested, as nodes only send 'init' state once. If // they send 'init' with reimage_requested, it's because the node was reimaged // successfully. - // Node has to be in state other than Init, otherwise SetState ignores the change, and therefore - // InitializedAt field is never set node = node with { ReimageRequested = false, InitializedAt = DateTimeOffset.UtcNow }; // discard result: node not used after this point _ = await _context.NodeOperations.SetState(node, ev.State); diff --git a/src/ApiService/ApiService/OneFuzzTypes/Enums.cs b/src/ApiService/ApiService/OneFuzzTypes/Enums.cs index f2919dab5c..0d05ba0150 100644 --- a/src/ApiService/ApiService/OneFuzzTypes/Enums.cs +++ b/src/ApiService/ApiService/OneFuzzTypes/Enums.cs @@ -294,7 +294,6 @@ public enum AgentMode { } public enum NodeState { - New, Init, Free, SettingUp, diff --git a/src/ApiService/ApiService/OneFuzzTypes/Model.cs b/src/ApiService/ApiService/OneFuzzTypes/Model.cs index 78b71a79a0..a1104d6781 100644 --- a/src/ApiService/ApiService/OneFuzzTypes/Model.cs +++ b/src/ApiService/ApiService/OneFuzzTypes/Model.cs @@ -98,7 +98,7 @@ public record Node string Version, DateTimeOffset? Heartbeat = null, DateTimeOffset? InitializedAt = null, - NodeState State = NodeState.New, + NodeState State = NodeState.Init, Guid? ScalesetId = null, bool ReimageRequested = false, diff --git a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs index e02636fdcd..dc4c8dbb29 100644 --- a/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs +++ b/src/ApiService/ApiService/onefuzzlib/NodeOperations.cs @@ -59,7 +59,6 @@ IAsyncEnumerable SearchStates(Guid? poolId = default, Async.Task SetShutdown(Node node); // state transitions: - Async.Task New(Node node); Async.Task Init(Node node); Async.Task Free(Node node); Async.Task SettingUp(Node node); @@ -610,11 +609,6 @@ private async Task StopIfComplete(Node node, bool done = false) { return true; } - public Task New(Node node) { - // nothing to do - return Async.Task.FromResult(node); - } - public Task Init(Node node) { // nothing to do return Async.Task.FromResult(node); diff --git a/src/ApiService/IntegrationTests/AgentEventsTests.cs b/src/ApiService/IntegrationTests/AgentEventsTests.cs index 811d0753cc..e07fa1c5d0 100644 --- a/src/ApiService/IntegrationTests/AgentEventsTests.cs +++ b/src/ApiService/IntegrationTests/AgentEventsTests.cs @@ -279,7 +279,7 @@ await Async.Task.WhenAll( Async.Task.Run(async () => { // should still be in init state: var node = await Context.NodeOperations.SearchAll().SingleAsync(); - Assert.Equal(NodeState.New, node.State); + Assert.Equal(NodeState.Init, node.State); }), Async.Task.Run(async () => { // the node should be told to stop: @@ -308,7 +308,7 @@ await Async.Task.WhenAll( Async.Task.Run(async () => { // the node should still be in init state: var node = await Context.NodeOperations.SearchAll().SingleAsync(); - Assert.Equal(NodeState.New, node.State); + Assert.Equal(NodeState.Init, node.State); }), Async.Task.Run(async () => { // the node should be told to stop: