Unity-Technologies · ervteng · Mar 27, 2020 · Mar 25, 2020 · Mar 25, 2020 · Mar 25, 2020
diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Renamed 'Generalization' feature to 'Environment Parameter Randomization'.
  - Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
  - The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used.
+ - Fixed the reported entropy values for continuous actions. (#3684)
  - Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
  - Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
  - Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)

diff --git a/ml-agents/mlagents/trainers/distributions.py b/ml-agents/mlagents/trainers/distributions.py
@@ -160,7 +160,7 @@ def _create_entropy(
         self, encoded: "GaussianDistribution.MuSigmaTensors"
     ) -> tf.Tensor:
         single_dim_entropy = 0.5 * tf.reduce_mean(
-            tf.log(2 * np.pi * np.e) + tf.square(encoded.log_sigma)
+            tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
         )
         # Make entropy the right shape
         return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy

diff --git a/ml-agents/mlagents/trainers/sac/optimizer.py b/ml-agents/mlagents/trainers/sac/optimizer.py
@@ -155,7 +155,6 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
             "q1_loss": self.q1_loss,
             "q2_loss": self.q2_loss,
             "entropy_coef": self.ent_coef,
-            "entropy": self.policy.entropy,
             "update_batch": self.update_batch_policy,
             "update_value": self.update_batch_value,
             "update_entropy": self.update_batch_entropy,

diff --git a/ml-agents/mlagents/trainers/tests/test_distributions.py b/ml-agents/mlagents/trainers/tests/test_distributions.py
@@ -53,7 +53,7 @@ def dummy_config():
 
 def test_gaussian_distribution():
     with tf.Graph().as_default():
-        logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
+        logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32)
         distribution = GaussianDistribution(
             logits,
             act_size=VECTOR_ACTION_SPACE,
@@ -71,6 +71,14 @@ def test_gaussian_distribution():
                     assert out.shape[1] == VECTOR_ACTION_SPACE[0]
                 output = sess.run([distribution.total_log_probs])
                 assert output[0].shape[0] == 1
+            # Test entropy is correct
+            log_std_tensor = tf.get_default_graph().get_tensor_by_name(
+                "log_std/BiasAdd:0"
+            )
+            feed_dict = {log_std_tensor: [[1.0, 1.0]]}
+            entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
+            # Entropy with log_std of 1.0 is 0.5 * (ln(2*pi*e) + 2 * 1.0) ~= 2.42
+            assert pytest.approx(entropy[0], 0.01) == 2.42
 
 
 def test_tanh_distribution():

diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -235,7 +235,7 @@ def test_visual_advanced_ppo(vis_encode_type, num_visual):
 def test_recurrent_ppo(use_discrete):
     env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
     override_vals = {
-        "max_steps": 4000,
+        "max_steps": 5000,
         "batch_size": 64,
         "buffer_size": 128,
         "learning_rate": 1e-3,