diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index 7de458e867..367b7d1999 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Renamed 'Generalization' feature to 'Environment Parameter Randomization'.
 - Fixed an issue where specifying `vis_encode_type` was required only for SAC. (#3677)
 - The way that UnityEnvironment decides the port was changed. If no port is specified, the behavior will depend on the `file_name` parameter. If it is `None`, 5004 (the editor port) will be used; otherwise 5005 (the base environment port) will be used.
+- Fixed the reported entropy values for continuous actions (#3684)
 - Fixed an issue where switching models using `SetModel()` during training would use an excessive amount of memory. (#3664)
 - Environment subprocesses now close immediately on timeout or wrong API version. (#3679)
 - Fixed an issue in the gym wrapper that would raise an exception if an Agent called EndEpisode multiple times in the same step. (#3700)
diff --git a/ml-agents/mlagents/trainers/distributions.py b/ml-agents/mlagents/trainers/distributions.py
index ebbb4cbc90..294bad11cb 100644
--- a/ml-agents/mlagents/trainers/distributions.py
+++ b/ml-agents/mlagents/trainers/distributions.py
@@ -160,7 +160,7 @@ def _create_entropy(
         self, encoded: "GaussianDistribution.MuSigmaTensors"
     ) -> tf.Tensor:
         single_dim_entropy = 0.5 * tf.reduce_mean(
-            tf.log(2 * np.pi * np.e) + tf.square(encoded.log_sigma)
+            tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
         )
         # Make entropy the right shape
         return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy
diff --git a/ml-agents/mlagents/trainers/sac/optimizer.py b/ml-agents/mlagents/trainers/sac/optimizer.py
index 963595fb28..0265fe1599 100644
--- a/ml-agents/mlagents/trainers/sac/optimizer.py
+++ b/ml-agents/mlagents/trainers/sac/optimizer.py
@@ -155,7 +155,6 @@ def __init__(self, policy: TFPolicy, trainer_params: Dict[str, Any]):
             "q1_loss": self.q1_loss,
             "q2_loss": self.q2_loss,
             "entropy_coef": self.ent_coef,
-            "entropy": self.policy.entropy,
             "update_batch": self.update_batch_policy,
             "update_value": self.update_batch_value,
             "update_entropy": self.update_batch_entropy,
diff --git a/ml-agents/mlagents/trainers/tests/test_distributions.py b/ml-agents/mlagents/trainers/tests/test_distributions.py
index 751894bd3f..c27047fd69 100644
--- a/ml-agents/mlagents/trainers/tests/test_distributions.py
+++ b/ml-agents/mlagents/trainers/tests/test_distributions.py
@@ -53,7 +53,7 @@ def dummy_config():
 
 def test_gaussian_distribution():
     with tf.Graph().as_default():
-        logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
+        logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32)
         distribution = GaussianDistribution(
             logits,
             act_size=VECTOR_ACTION_SPACE,
@@ -71,6 +71,14 @@ def test_gaussian_distribution():
         assert out.shape[1] == VECTOR_ACTION_SPACE[0]
         output = sess.run([distribution.total_log_probs])
         assert output[0].shape[0] == 1
+        # Test entropy is correct
+        log_std_tensor = tf.get_default_graph().get_tensor_by_name(
+            "log_std/BiasAdd:0"
+        )
+        feed_dict = {log_std_tensor: [[1.0, 1.0]]}
+        entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
+        # Entropy with log_std of 1.0 should be 2.42
+        assert pytest.approx(entropy[0], 0.01) == 2.42
 
 
 def test_tanh_distribution():
diff --git a/ml-agents/mlagents/trainers/tests/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
index dfd89ae589..1873dce123 100644
--- a/ml-agents/mlagents/trainers/tests/test_simple_rl.py
+++ b/ml-agents/mlagents/trainers/tests/test_simple_rl.py
@@ -235,7 +235,7 @@ def test_visual_advanced_ppo(vis_encode_type, num_visual):
 def test_recurrent_ppo(use_discrete):
     env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
     override_vals = {
-        "max_steps": 4000,
+        "max_steps": 5000,
         "batch_size": 64,
         "buffer_size": 128,
         "learning_rate": 1e-3,
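
Note on the entropy fix: the differential entropy of a univariate Gaussian is 0.5 * ln(2 * pi * e * sigma^2) = 0.5 * (ln(2 * pi * e) + 2 * ln(sigma)), so the previous tf.square(encoded.log_sigma) term overstated the entropy and is replaced with 2 * encoded.log_sigma. A minimal sketch (plain NumPy, independent of the ml-agents code) that reproduces the 2.42 value asserted in the new test:

    import numpy as np

    def gaussian_entropy(log_sigma: float) -> float:
        # Closed-form differential entropy of N(mu, sigma^2),
        # written in the same form as the patched TensorFlow code.
        return 0.5 * (np.log(2 * np.pi * np.e) + 2 * log_sigma)

    print(gaussian_entropy(1.0))  # ~2.4189, within pytest.approx(2.42, 0.01)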