Fix for critic normalization bug #5595

Merged 6 commits on Nov 2, 2021
2 changes: 2 additions & 0 deletions com.unity.ml-agents/CHANGELOG.md
@@ -28,7 +28,9 @@ and this project adheres to
1. env_params.max_lifetime_restarts (--max-lifetime-restarts) [default=10]
2. env_params.restarts_rate_limit_n (--restarts-rate-limit-n) [default=1]
3. env_params.restarts_rate_limit_period_s (--restarts-rate-limit-period-s) [default=60]

### Bug Fixes
- Fixed a bug where the critics were not being normalized during training. (#5595)
- Fixed the bug where curriculum learning would crash because of the incorrect run_options parsing. (#5586)

#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
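The changelog entry above summarizes the bug: incoming trajectories were used to update the policy's observation-normalization statistics, but not the critic's, so the critic kept evaluating observations against its initial (stale) statistics. The sketch below illustrates that failure mode in isolation; it is a minimal stand-in, not ML-Agents' actual normalizer — the class name and interface are hypothetical.

```python
import numpy as np


class RunningNormalizer:
    """Hypothetical running mean/variance normalizer, similar in spirit to the
    statistics the trainers update via update_normalization()."""

    def __init__(self, size: int):
        self.count = 1e-4
        self.mean = np.zeros(size)
        self.var = np.ones(size)

    def update(self, batch: np.ndarray) -> None:
        # Fold a batch of observations into the running mean/variance.
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

    def normalize(self, obs: np.ndarray) -> np.ndarray:
        return (obs - self.mean) / np.sqrt(self.var + 1e-8)


# If only the policy's normalizer is updated, the critic keeps stale statistics
# and sees observations on a completely different scale than the policy does.
policy_norm, critic_norm = RunningNormalizer(1), RunningNormalizer(1)
obs = np.random.normal(loc=100.0, scale=10.0, size=(64, 1))
policy_norm.update(obs)      # what the trainers already did for the policy
# critic_norm.update(obs)    # the call this PR adds for the critic
print(policy_norm.normalize(obs).mean())  # ~0: statistics track the data
print(critic_norm.normalize(obs).mean())  # ~100: critic still assumes mean 0, std 1
```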
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/poca/trainer.py
@@ -73,6 +73,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)
            self.optimizer.critic.update_normalization(agent_buffer_trajectory)

        # Get all value estimates
        (
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -74,6 +74,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)
            self.optimizer.critic.update_normalization(agent_buffer_trajectory)

        # Get all value estimates
        (
1 change: 1 addition & 0 deletions ml-agents/mlagents/trainers/sac/trainer.py
@@ -140,6 +140,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
        # Update the normalization
        if self.is_training:
            self.policy.update_normalization(agent_buffer_trajectory)
            self.optimizer.critic.update_normalization(agent_buffer_trajectory)

        # Evaluate all reward functions for reporting purposes
        self.collected_rewards["environment"][agent_id] += np.sum(
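All three trainers (POCA, PPO, SAC) receive the same one-line fix inside `_process_trajectory`: alongside `self.policy.update_normalization(...)`, they now call `self.optimizer.critic.update_normalization(...)` on the same trajectory buffer. A rough sketch of the delegation that call relies on is below; `SketchCritic`, its attributes, and the dict-shaped buffer are assumptions for illustration, not the real `ValueNetwork` internals.

```python
import numpy as np


class _SimpleNormalizer:
    """Minimal running-mean tracker; a stand-in for the real normalizer module."""

    def __init__(self, size: int):
        self.count = 0
        self.mean = np.zeros(size)

    def update(self, batch: np.ndarray) -> None:
        # Incrementally fold each observation into the running mean.
        for row in batch:
            self.count += 1
            self.mean += (row - self.mean) / self.count


class SketchCritic:
    """Stand-in for the critic: it owns a normalizer and exposes
    update_normalization(), which the trainers now call once per trajectory."""

    def __init__(self, obs_size: int):
        self._normalizer = _SimpleNormalizer(obs_size)

    def update_normalization(self, agent_buffer_trajectory) -> None:
        # agent_buffer_trajectory stands in for ML-Agents' AgentBuffer;
        # here it is just a dict with a "vector_obs" entry.
        self._normalizer.update(np.asarray(agent_buffer_trajectory["vector_obs"]))


# Minimal usage mirroring the added trainer line:
critic = SketchCritic(obs_size=3)
critic.update_normalization({"vector_obs": np.random.randn(10, 3)})
print(critic._normalizer.mean)
```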
164 changes: 164 additions & 0 deletions ml-agents/mlagents/trainers/tests/test_trainers.py
@@ -0,0 +1,164 @@
from unittest.mock import patch

import pytest

from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.settings import RunOptions
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.dummy_config import (
    create_observation_specs_with_shapes,
    ppo_dummy_config,
    poca_dummy_config,
    sac_dummy_config,
)
from mlagents.trainers.tests.mock_brain import make_fake_trajectory
from mlagents.trainers.trainer import TrainerFactory


@pytest.fixture
def ppo_config():
    return RunOptions(behaviors={"test_brain": ppo_dummy_config()})


@pytest.fixture
def sac_config():
    return RunOptions(behaviors={"test_brain": sac_dummy_config()})


@pytest.fixture
def poca_config():
    return RunOptions(behaviors={"test_brain": poca_dummy_config()})


def test_ppo_trainer_update_normalization(ppo_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = ppo_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    ppo_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = ppo_trainer.create_policy(parsed_behavior_id0, mock_specs)
    ppo_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    ppo_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        ppo_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()


def test_sac_trainer_update_normalization(sac_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = sac_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    sac_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = sac_trainer.create_policy(parsed_behavior_id0, mock_specs)
    sac_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    sac_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        sac_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()


def test_poca_trainer_update_normalization(poca_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = poca_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    poca_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = poca_trainer.create_policy(parsed_behavior_id0, mock_specs)
    poca_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    poca_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.poca.optimizer_torch.TorchPOCAOptimizer.POCAValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        poca_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()
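
The three tests share one pattern: build a trainer through `TrainerFactory`, feed it a fake trajectory, and assert that `advance()` triggers `update_normalization` exactly once on both the policy and the critic. The POCA test patches `POCAValueNetwork` rather than `ValueNetwork` because POCA's optimizer defines its own centralized critic class. To run just these tests locally, something like the following should work — a sketch, assuming pytest is installed and the command is run from the repository root:

```python
# Hypothetical local invocation; the -k filter matches the shared
# "update_normalization" substring in the three new test names.
import pytest

pytest.main(
    ["ml-agents/mlagents/trainers/tests/test_trainers.py", "-k", "update_normalization", "-q"]
)
```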