Added Reward Providers for Torch #4280

Merged (14 commits) on Aug 7, 2020
2 changes: 1 addition & 1 deletion ml-agents/mlagents/trainers/buffer.py
@@ -48,7 +48,7 @@ def extend(self, data: np.ndarray) -> None:
Adds a list of np.arrays to the end of the list of np.arrays.
:param data: The np.array list to append.
"""
-        self += list(np.array(data))
+        self += list(np.array(data, dtype=np.float32))

def set(self, data):
"""
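A note on this one-line buffer.py change: without an explicit dtype, NumPy builds float64 arrays from Python floats, which then surface as float64 tensors on the torch side, while the torch networks in this PR work in float32. A standalone illustration of the difference (not part of the PR):

import numpy as np
import torch

data = [0.1, 0.2, 0.3]
default_arr = np.array(data)                    # dtype defaults to float64
float32_arr = np.array(data, dtype=np.float32)  # matches torch's default float dtype

print(torch.as_tensor(default_arr).dtype)   # torch.float64
print(torch.as_tensor(float32_arr).dtype)   # torch.float32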
26 changes: 10 additions & 16 deletions ml-agents/mlagents/trainers/optimizer/torch_optimizer.py
@@ -5,12 +5,11 @@

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.components.bc.module import BCModule
-from mlagents.trainers.components.reward_signals.extrinsic.signal import (
-    ExtrinsicRewardSignal,
-)
+from mlagents.trainers.torch.components.reward_providers import create_reward_provider

from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.optimizer import Optimizer
-from mlagents.trainers.settings import TrainerSettings, RewardSignalType
+from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.trajectory import SplitObservations
from mlagents.trainers.torch.utils import ModelUtils

@@ -37,16 +36,11 @@ def create_reward_signals(self, reward_signal_configs):
Create reward signals
:param reward_signal_configs: Reward signal config.
"""
-        extrinsic_signal = ExtrinsicRewardSignal(
-            self.policy, reward_signal_configs[RewardSignalType.EXTRINSIC]
-        )
-        self.reward_signals = {RewardSignalType.EXTRINSIC.value: extrinsic_signal}
-        # Create reward signals
-        # for reward_signal, config in reward_signal_configs.items():
-        #     self.reward_signals[reward_signal] = create_reward_signal(
-        #         self.policy, reward_signal, config
-        #     )
-        #     self.update_dict.update(self.reward_signals[reward_signal].update_dict)
+        for reward_signal, settings in reward_signal_configs.items():
+            # Name reward signals by string in case we have duplicates later
+            self.reward_signals[reward_signal.value] = create_reward_provider(
+                reward_signal, self.policy.behavior_spec, settings
+            )

def get_value_estimates(
self, decision_requests: DecisionSteps, idx: int, done: bool
@@ -72,7 +66,7 @@
# If we're done, reassign all of the value estimates that need terminal states.
if done:
for k in value_estimates:
-                if self.reward_signals[k].use_terminal_states:
+                if not self.reward_signals[k].ignore_done:
value_estimates[k] = 0.0

return value_estimates
@@ -111,7 +105,7 @@ def get_trajectory_value_estimates(

if done:
for k in next_value_estimate:
-                if self.reward_signals[k].use_terminal_states:
+                if not self.reward_signals[k].ignore_done:
next_value_estimate[k] = 0.0

return value_estimates, next_value_estimate
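The call sites above only assume a small surface from the new reward providers: a create_reward_provider factory plus objects exposing name, strength, gamma, ignore_done, evaluate() and update(). The base class itself is not shown in this diff; the sketch below restates that assumed interface with guessed signatures, purely as orientation for the changes that follow.

from abc import ABC, abstractmethod
from typing import Dict

import numpy as np

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.settings import RewardSignalSettings
from mlagents_envs.base_env import BehaviorSpec


class BaseRewardProvider(ABC):
    """Assumed interface; member names are taken from how the trainers use the providers."""

    def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None:
        self._policy_specs = specs
        self.gamma = settings.gamma        # discount for this reward stream (used by SAC's gammas list)
        self.strength = settings.strength  # scale applied by the trainer to evaluate()'s output
        self.ignore_done = False           # True -> value bootstrapping ignores terminal states

    @property
    def name(self) -> str:
        # e.g. "Extrinsic" or "Curiosity"; trainers build stats keys such as
        # f"Policy/{name.capitalize()} Value Estimate" from it
        return self.__class__.__name__.replace("RewardProvider", "")

    @abstractmethod
    def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray:
        """Return one unscaled reward per transition in the mini batch."""

    @abstractmethod
    def update(self, mini_batch: AgentBuffer) -> Dict[str, float]:
        """Update any learned model and return stats for reporting."""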
3 changes: 3 additions & 0 deletions ml-agents/mlagents/trainers/ppo/optimizer_torch.py
@@ -149,4 +149,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
"Losses/Value Loss": value_loss.detach().cpu().numpy(),
}

+        for reward_provider in self.reward_signals.values():
+            update_stats.update(reward_provider.update(batch))

return update_stats
25 changes: 19 additions & 6 deletions ml-agents/mlagents/trainers/ppo/trainer.py
@@ -21,6 +21,7 @@
TestingConfiguration,
FrameworkType,
)
+from mlagents.trainers.components.reward_signals import RewardSignal

try:
from mlagents.trainers.policy.torch_policy import TorchPolicy
@@ -91,18 +92,30 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:

for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
-            self._stats_reporter.add_stat(
-                self.optimizer.reward_signals[name].value_name, np.mean(v)
-            )
+            if isinstance(self.optimizer.reward_signals[name], RewardSignal):
+                self._stats_reporter.add_stat(
+                    self.optimizer.reward_signals[name].value_name, np.mean(v)
+                )
+            else:
+                self._stats_reporter.add_stat(
+                    f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
+                    np.mean(v),
+                )

# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
-            evaluate_result = reward_signal.evaluate_batch(
-                agent_buffer_trajectory
-            ).scaled_reward
+            if isinstance(reward_signal, RewardSignal):
+                evaluate_result = reward_signal.evaluate_batch(
+                    agent_buffer_trajectory
+                ).scaled_reward
+            else:
+                evaluate_result = (
+                    reward_signal.evaluate(agent_buffer_trajectory)
+                    * reward_signal.strength
+                )
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
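Both the PPO and SAC trainers now branch on isinstance(signal, RewardSignal) so that the old TF reward signals and the new torch providers can coexist during the migration. The two branches boil down to the pattern below (a restatement of the diff, not new PR code):

import numpy as np

from mlagents.trainers.buffer import AgentBuffer
from mlagents.trainers.components.reward_signals import RewardSignal


def evaluate_rewards(reward_signal, agent_buffer_trajectory: AgentBuffer) -> np.ndarray:
    if isinstance(reward_signal, RewardSignal):
        # Old TF path: evaluate_batch returns a result with the strength already applied.
        return reward_signal.evaluate_batch(agent_buffer_trajectory).scaled_reward
    # New torch path: evaluate returns raw rewards; the caller applies the strength.
    return reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength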
5 changes: 4 additions & 1 deletion ml-agents/mlagents/trainers/sac/optimizer_torch.py
@@ -89,7 +89,7 @@ def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
# Use to reduce "survivor bonus" when using Curiosity or GAIL.
self.gammas = [_val.gamma for _val in trainer_params.reward_signals.values()]
self.use_dones_in_backup = {
-            name: int(self.reward_signals[name].use_terminal_states)
+            name: int(not self.reward_signals[name].ignore_done)
for name in self.stream_names
}

@@ -461,6 +461,9 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
.numpy(),
}

+        for signal in self.reward_signals.values():
+            signal.update(batch)

return update_stats

def update_reward_signals(
49 changes: 34 additions & 15 deletions ml-agents/mlagents/trainers/sac/trainer.py
@@ -19,6 +19,7 @@
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType
+from mlagents.trainers.components.reward_signals import RewardSignal

try:
from mlagents.trainers.policy.torch_policy import TorchPolicy
@@ -143,9 +144,15 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
-            evaluate_result = reward_signal.evaluate_batch(
-                agent_buffer_trajectory
-            ).scaled_reward
+            if isinstance(reward_signal, RewardSignal):
+                evaluate_result = reward_signal.evaluate_batch(
+                    agent_buffer_trajectory
+                ).scaled_reward
+            else:
+                evaluate_result = (
+                    reward_signal.evaluate(agent_buffer_trajectory)
+                    * reward_signal.strength
+                )
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

@@ -154,9 +161,15 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
)
for name, v in value_estimates.items():
-            self._stats_reporter.add_stat(
-                self.optimizer.reward_signals[name].value_name, np.mean(v)
-            )
+            if isinstance(self.optimizer.reward_signals[name], RewardSignal):
+                self._stats_reporter.add_stat(
+                    self.optimizer.reward_signals[name].value_name, np.mean(v)
+                )
+            else:
+                self._stats_reporter.add_stat(
+                    f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
+                    np.mean(v),
+                )

# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
@@ -277,9 +290,14 @@ def _update_sac_policy(self) -> bool:
)
# Get rewards for each reward
for name, signal in self.optimizer.reward_signals.items():
sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
sampled_minibatch
).scaled_reward
if isinstance(signal, RewardSignal):
sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
sampled_minibatch
).scaled_reward
else:
sampled_minibatch[f"{name}_rewards"] = (
signal.evaluate(sampled_minibatch) * signal.strength
)

update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
for stat_name, value in update_stats.items():
@@ -326,12 +344,13 @@ def _update_reward_signals(self) -> None:
reward_signal_minibatches = {}
for name, signal in self.optimizer.reward_signals.items():
logger.debug(f"Updating {name} at step {self.step}")
-            # Some signals don't need a minibatch to be sampled - so we don't!
-            if signal.update_dict:
-                reward_signal_minibatches[name] = buffer.sample_mini_batch(
-                    self.hyperparameters.batch_size,
-                    sequence_length=self.policy.sequence_length,
-                )
+            if isinstance(signal, RewardSignal):
+                # Some signals don't need a minibatch to be sampled - so we don't!
+                if signal.update_dict:
+                    reward_signal_minibatches[name] = buffer.sample_mini_batch(
+                        self.hyperparameters.batch_size,
+                        sequence_length=self.policy.sequence_length,
+                    )
update_stats = self.optimizer.update_reward_signals(
reward_signal_minibatches, n_sequences
)
New file: tests for the curiosity reward provider
@@ -0,0 +1,111 @@
import numpy as np
import pytest
import torch
from mlagents.trainers.torch.components.reward_providers import (
CuriosityRewardProvider,
create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents.trainers.settings import CuriositySettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,
)

SEED = [42]


@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
curiosity_settings = CuriositySettings(32, 0.01)
curiosity_settings.strength = 0.1
curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
assert curiosity_rp.strength == 0.1
assert curiosity_rp.name == "Curiosity"


@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,), (64, 66, 3), (84, 86, 1)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,), (64, 66, 1)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
],
)
def test_factory(behavior_spec: BehaviorSpec) -> None:
curiosity_settings = CuriositySettings(32, 0.01)
curiosity_rp = create_reward_provider(
RewardSignalType.CURIOSITY, behavior_spec, curiosity_settings
)
assert curiosity_rp.name == "Curiosity"


@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,), (64, 66, 3), (24, 26, 1)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
],
)
def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None:
np.random.seed(seed)
torch.manual_seed(seed)
curiosity_settings = CuriositySettings(32, 0.01)
curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
buffer = create_agent_buffer(behavior_spec, 5)
curiosity_rp.update(buffer)
reward_old = curiosity_rp.evaluate(buffer)[0]
for _ in range(10):
curiosity_rp.update(buffer)
reward_new = curiosity_rp.evaluate(buffer)[0]
assert reward_new < reward_old
reward_old = reward_new


@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize(
"behavior_spec", [BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5)]
)
def test_continuous_action_prediction(behavior_spec: BehaviorSpec, seed: int) -> None:
np.random.seed(seed)
torch.manual_seed(seed)
curiosity_settings = CuriositySettings(32, 0.1)
curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
buffer = create_agent_buffer(behavior_spec, 5)
for _ in range(200):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_action(buffer)[0].detach()
target = buffer["actions"][0]
error = float(torch.mean((prediction - target) ** 2))
assert error < 0.001


@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,), (64, 66, 3)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2,)),
],
)
def test_next_state_prediction(behavior_spec: BehaviorSpec, seed: int) -> None:
np.random.seed(seed)
torch.manual_seed(seed)
curiosity_settings = CuriositySettings(32, 0.1)
curiosity_rp = CuriosityRewardProvider(behavior_spec, curiosity_settings)
buffer = create_agent_buffer(behavior_spec, 5)
for _ in range(100):
curiosity_rp.update(buffer)
prediction = curiosity_rp._network.predict_next_state(buffer)[0]
target = curiosity_rp._network.get_next_state(buffer)[0]
error = float(torch.mean((prediction - target) ** 2).detach())
assert error < 0.001
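The tests above and below import create_agent_buffer from a small utils module that is not included in this excerpt. As a rough sketch (buffer key names and shapes here are guesses based on how the buffer is read in the tests and trainers, not the PR's actual helper), it presumably fills an AgentBuffer with random transitions matching the BehaviorSpec:

import numpy as np

from mlagents.trainers.buffer import AgentBuffer
from mlagents_envs.base_env import BehaviorSpec


def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    buffer = AgentBuffer()
    for _ in range(number):
        # One random observation per sensor in the spec (key names are guesses).
        for i, shape in enumerate(behavior_spec.observation_shapes):
            obs = np.random.normal(size=shape).astype(np.float32)
            buffer[f"obs_{i}"].append(obs)
            buffer[f"next_obs_{i}"].append(obs)
        # Random action of the right type and size.
        if behavior_spec.is_action_continuous():
            action = np.random.normal(size=behavior_spec.action_size).astype(np.float32)
        else:
            action = np.array(
                [np.random.randint(branch) for branch in behavior_spec.discrete_action_branches],
                dtype=np.float32,
            )
        buffer["actions"].append(action)
        buffer["environment_rewards"].append(reward)
        buffer["done"].append(False)
    return buffer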
New file: tests for the extrinsic reward provider
@@ -0,0 +1,56 @@
import pytest
from mlagents.trainers.torch.components.reward_providers import (
ExtrinsicRewardProvider,
create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionType
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
create_agent_buffer,
)


@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
settings = RewardSignalSettings()
settings.gamma = 0.2
extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
assert extrinsic_rp.gamma == 0.2
assert extrinsic_rp.name == "Extrinsic"


@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
],
)
def test_factory(behavior_spec: BehaviorSpec) -> None:
settings = RewardSignalSettings()
extrinsic_rp = create_reward_provider(
RewardSignalType.EXTRINSIC, behavior_spec, settings
)
assert extrinsic_rp.name == "Extrinsic"


@pytest.mark.parametrize("reward", [2.0, 3.0, 4.0])
@pytest.mark.parametrize(
"behavior_spec",
[
BehaviorSpec([(10,)], ActionType.CONTINUOUS, 5),
BehaviorSpec([(10,)], ActionType.DISCRETE, (2, 3)),
],
)
def test_reward(behavior_spec: BehaviorSpec, reward: float) -> None:
buffer = create_agent_buffer(behavior_spec, 1000, reward)
settings = RewardSignalSettings()
extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
generated_rewards = extrinsic_rp.evaluate(buffer)
assert (generated_rewards == reward).all()
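test_reward pins down the expected behaviour of the extrinsic provider: evaluate should hand back exactly the environment rewards stored in the buffer, leaving scaling by strength to the trainer. Functionally that amounts to a pass-through along these lines (a sketch of the behaviour the test asserts, not necessarily the class's literal implementation):

import numpy as np

from mlagents.trainers.buffer import AgentBuffer


def extrinsic_evaluate(mini_batch: AgentBuffer) -> np.ndarray:
    # The extrinsic "reward model" is simply the reward recorded from the environment.
    return np.array(mini_batch["environment_rewards"], dtype=np.float32)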