Unity-Technologies · vincentpierre · Feb 19, 2020 · Feb 13, 2020 · Feb 14, 2020 · Feb 14, 2020
diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Several classes were changed from public to internal visibility. (#3390)
  - Academy.RegisterSideChannel and UnregisterSideChannel methods were added. (#3391)
  - A tutorial on adding custom SideChannels was added (#3391)
+ - The stepping logic for the Agent and the Academy has been simplified (#3448)
  - Update Barracuda to 0.6.0-preview
 
 ### Bugfixes

diff --git a/com.unity.ml-agents/Runtime/Academy.cs b/com.unity.ml-agents/Runtime/Academy.cs
@@ -113,6 +113,10 @@ public bool IsCommunicatorOn
         // Signals to all the listeners that the academy is being destroyed
         internal event Action DestroyAction;
 
+        // Signals the Agent that a new step is about to start, so it can increment its
+        // step count. The Agent is marked Done in AgentStep once it reaches its maxStep.
+        internal event Action AgentIncrementStep;
+
         // Signals to all the agents at each environment step along with the
         // Academy's maxStepReached, done and stepCount values. The agents rely
         // on this event to update their own values of max step reached and done
@@ -418,6 +422,9 @@ public void EnvironmentStep()
 
             AgentSetStatus?.Invoke(m_StepCount);
 
+            m_StepCount += 1;
+            m_TotalStepCount += 1;
+            AgentIncrementStep?.Invoke();
 
             using (TimerStack.Instance.Scoped("AgentSendState"))
             {
@@ -433,9 +440,6 @@ public void EnvironmentStep()
             {
                 AgentAct?.Invoke();
             }
-
-            m_StepCount += 1;
-            m_TotalStepCount += 1;
         }
 
         /// <summary>

diff --git a/com.unity.ml-agents/Runtime/Agent.cs b/com.unity.ml-agents/Runtime/Agent.cs
@@ -238,6 +238,7 @@ public void LazyInitialize()
             m_Action = new AgentAction();
             sensors = new List<ISensor>();
 
+            Academy.Instance.AgentIncrementStep += AgentIncrementStep;
             Academy.Instance.AgentSendState += SendInfo;
             Academy.Instance.DecideAction += DecideAction;
             Academy.Instance.AgentAct += AgentStep;
@@ -256,6 +257,7 @@ void OnDisable()
             // We don't want to even try, because this will lazily create a new Academy!
             if (Academy.IsInitialized)
             {
+                Academy.Instance.AgentIncrementStep -= AgentIncrementStep;
                 Academy.Instance.AgentSendState -= SendInfo;
                 Academy.Instance.DecideAction -= DecideAction;
                 Academy.Instance.AgentAct -= AgentStep;
@@ -685,24 +687,25 @@ void SendInfo()
             }
         }
 
+        void AgentIncrementStep()
+        {
+            m_StepCount += 1;
+        }
+
         /// Used by the brain to make the agent perform a step.
         void AgentStep()
         {
-            if ((m_StepCount >= maxStep) && (maxStep > 0))
-            {
-                NotifyAgentDone(true);
-                _AgentReset();
-            }
-            else
-            {
-                m_StepCount += 1;
-            }
-
             if ((m_RequestAction) && (m_Brain != null))
             {
                 m_RequestAction = false;
                 AgentAction(m_Action.vectorActions);
             }
+
+            if ((m_StepCount >= maxStep) && (maxStep > 0))
+            {
+                NotifyAgentDone(true);
+                _AgentReset();
+            }
         }
 
         void DecideAction()

diff --git a/com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs b/com.unity.ml-agents/Tests/Editor/MLAgentsEditModeTest.cs
@@ -67,6 +67,7 @@ public override void AgentAction(float[] vectorAction)
 
         public override void AgentReset()
         {
+
             agentResetCalls += 1;
             collectObservationsCallsSinceLastReset = 0;
             agentActionCallsSinceLastReset = 0;
@@ -485,19 +486,16 @@ public void TestCumulativeReward()
             agent1.LazyInitialize();
             agent2.SetPolicy(new TestPolicy());
 
-            var j = 0;
-            for (var i = 0; i < 500; i++)
+            var expectedAgent1ActionSinceReset = 0;
+
+            for (var i = 0; i < 50; i++)
             {
-                if (i % 21 == 0)
-                {
-                    j = 0;
-                }
-                else
-                {
-                    j++;
+                expectedAgent1ActionSinceReset += 1;
+                if (expectedAgent1ActionSinceReset == agent1.maxStep || i == 0){
+                    expectedAgent1ActionSinceReset = 0;
                 }
                 agent2.RequestAction();
-                Assert.LessOrEqual(Mathf.Abs(j * 10.1f - agent1.GetCumulativeReward()), 0.05f);
+                Assert.LessOrEqual(Mathf.Abs(expectedAgent1ActionSinceReset * 10.1f - agent1.GetCumulativeReward()), 0.05f);
                 Assert.LessOrEqual(Mathf.Abs(i * 0.1f - agent2.GetCumulativeReward()), 0.05f);
 
                 agent1.AddReward(10f);
@@ -517,26 +515,46 @@ public void TestMaxStepsReset()
             decisionRequester.DecisionPeriod = 1;
             decisionRequester.Awake();
 
-            var maxStep = 6;
+            const int maxStep = 6;
             agent1.maxStep = maxStep;
             agent1.LazyInitialize();
 
+            var expectedAgentStepCount = 0;
+            var expectedResets= 0;
+            var expectedAgentAction = 0;
+            var expectedAgentActionSinceReset = 0;
+            var expectedCollectObsCalls = 0;
+            var expectedCollectObsCallsSinceReset = 0;
+
             for (var i = 0; i < 15; i++)
             {
-                // We expect resets to occur when there are maxSteps actions since the last reset (and on the first step)
-                var expectReset = agent1.agentActionCallsSinceLastReset == maxStep || (i == 0);
-                var previousNumResets = agent1.agentResetCalls;
-
-                aca.EnvironmentStep();
-
-                if (expectReset)
+                // Agent should observe and act on each Academy step
+                expectedAgentAction += 1;
+                expectedAgentActionSinceReset += 1;
+                expectedCollectObsCalls += 1;
+                expectedCollectObsCallsSinceReset += 1;
+                expectedAgentStepCount += 1;
+
+                // Expect a reset on the first step, and whenever this step brings the agent's step count to maxStep
+                if (agent1.GetStepCount() == maxStep - 1 || (i == 0))
                 {
-                    Assert.AreEqual(previousNumResets + 1, agent1.agentResetCalls);
+                    expectedResets +=1;
                 }
-                else
+
+                if (agent1.GetStepCount() == maxStep - 1)
                 {
-                    Assert.AreEqual(previousNumResets, agent1.agentResetCalls);
+                    expectedAgentActionSinceReset = 0;
+                    expectedCollectObsCallsSinceReset = 0;
+                    expectedAgentStepCount = 0;
                 }
+                aca.EnvironmentStep();
+
+                Assert.AreEqual(expectedAgentStepCount, agent1.GetStepCount());
+                Assert.AreEqual(expectedResets, agent1.agentResetCalls);
+                Assert.AreEqual(expectedAgentAction, agent1.agentActionCalls);
+                Assert.AreEqual(expectedAgentActionSinceReset, agent1.agentActionCallsSinceLastReset);
+                Assert.AreEqual(expectedCollectObsCalls, agent1.collectObservationsCalls);
+                Assert.AreEqual(expectedCollectObsCallsSinceReset, agent1.collectObservationsCallsSinceLastReset);
             }
         }
     }