From 24056395cb01e775b1af31cdddaad3dd61a668f7 Mon Sep 17 00:00:00 2001 From: CloudChen <73453791+Cloud-Pku@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:55:14 +0800 Subject: [PATCH] feature(cy): add dreamerV3 + MiniGrid code (#725) * support flat obs, discrete action; add minigrid config * fix one bug * fix eval bug * modify minigrid wrapper * modify something * fix onething * polish & style check * polish the code * code polish * polish code --- ding/model/template/vac.py | 4 +- ding/policy/mbpolicy/dreamer.py | 25 ++- ding/torch_utils/network/dreamer.py | 6 +- ding/world_model/dreamer.py | 78 ++++++--- ding/world_model/model/networks.py | 18 +- .../cartpole_balance_dreamer_config.py | 2 + .../cheetah_run/cheetah_run_dreamer_config.py | 2 + .../walker_walk/walker_walk_dreamer_config.py | 3 +- .../config/minigrid_dreamer_config.py | 96 ++++++++++ dizoo/minigrid/envs/minigrid_env.py | 15 +- dizoo/minigrid/envs/minigrid_wrapper.py | 164 +++++++++++++++++- 11 files changed, 367 insertions(+), 46 deletions(-) create mode 100644 dizoo/minigrid/config/minigrid_dreamer_config.py diff --git a/ding/model/template/vac.py b/ding/model/template/vac.py index 29363d3570..47d5cb1bd6 100644 --- a/ding/model/template/vac.py +++ b/ding/model/template/vac.py @@ -366,7 +366,6 @@ class DREAMERVAC(nn.Module): def __init__( self, - obs_shape: Union[int, SequenceType], action_shape: Union[int, SequenceType, EasyDict], dyn_stoch=32, dyn_deter=512, @@ -391,9 +390,8 @@ def __init__( - action_shape (:obj:`Union[int, SequenceType]`): Action space shape, such as 6 or [2, 3, 3]. """ super(DREAMERVAC, self).__init__() - obs_shape: int = squeeze(obs_shape) action_shape = squeeze(action_shape) - self.obs_shape, self.action_shape = obs_shape, action_shape + self.action_shape = action_shape if dyn_discrete: feat_size = dyn_stoch * dyn_discrete + dyn_deter diff --git a/ding/policy/mbpolicy/dreamer.py b/ding/policy/mbpolicy/dreamer.py index 43d3b88619..35287c1bb6 100644 --- a/ding/policy/mbpolicy/dreamer.py +++ b/ding/policy/mbpolicy/dreamer.py @@ -234,8 +234,11 @@ def _forward_collect(self, data: dict, world_model, envstep, reset=None, state=N latent[key][i] *= mask[i] for i in range(len(action)): action[i] *= mask[i] - - data = data - 0.5 + assert world_model.obs_type == 'vector' or world_model.obs_type == 'RGB', \ + "action type must be vector or RGB" + # normalize RGB image input + if world_model.obs_type == 'RGB': + data = data - 0.5 embed = world_model.encoder(data) latent, _ = world_model.dynamics.obs_step(latent, action, embed, self._cfg.collect.collect_dyn_sample) feat = world_model.dynamics.get_feat(latent) @@ -247,11 +250,18 @@ def _forward_collect(self, data: dict, world_model, envstep, reset=None, state=N action = action.detach() state = (latent, action) + assert world_model.action_type == 'discrete' or world_model.action_type == 'continuous', \ + "action type must be continuous or discrete" + if world_model.action_type == 'discrete': + action = torch.where(action == 1)[1] output = {"action": action, "logprob": logprob, "state": state} if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) + if world_model.action_type == 'discrete': + for l in range(len(output)): + output[l]['action'] = output[l]['action'].squeeze(0) return {i: d for i, d in zip(data_id, output)} def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: @@ -272,7 +282,7 @@ def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple # TODO(zp) random_collect 
just have action #'logprob': model_output['logprob'], 'reward': timestep.reward, - 'discount': timestep.info['discount'], + 'discount': 1. - timestep.done, # timestep.info['discount'], 'done': timestep.done, } return transition @@ -309,7 +319,9 @@ def _forward_eval(self, data: dict, world_model, reset=None, state=None) -> dict for i in range(len(action)): action[i] *= mask[i] - data = data - 0.5 + # normalize RGB image input + if world_model.obs_type == 'RGB': + data = data - 0.5 embed = world_model.encoder(data) latent, _ = world_model.dynamics.obs_step(latent, action, embed, self._cfg.collect.collect_dyn_sample) feat = world_model.dynamics.get_feat(latent) @@ -321,11 +333,16 @@ def _forward_eval(self, data: dict, world_model, reset=None, state=None) -> dict action = action.detach() state = (latent, action) + if world_model.action_type == 'discrete': + action = torch.where(action == 1)[1] output = {"action": action, "logprob": logprob, "state": state} if self._cuda: output = to_device(output, 'cpu') output = default_decollate(output) + if world_model.action_type == 'discrete': + for l in range(len(output)): + output[l]['action'] = output[l]['action'].squeeze(0) return {i: d for i, d in zip(data_id, output)} def _monitor_vars_learn(self) -> List[str]: diff --git a/ding/torch_utils/network/dreamer.py b/ding/torch_utils/network/dreamer.py index f7c1597e54..b7ae67b57c 100644 --- a/ding/torch_utils/network/dreamer.py +++ b/ding/torch_utils/network/dreamer.py @@ -178,7 +178,7 @@ def forward(self, features): elif self._dist == "binary": return Bernoulli(torchd.independent.Independent(torchd.bernoulli.Bernoulli(logits=mean), len(self._shape))) elif self._dist == "twohot_symlog": - return TwoHotDistSymlog(logits=mean, device=self._device) + return TwoHotDistSymlog(logits=mean, low=-1., high=1., device=self._device) raise NotImplementedError(self._dist) @@ -475,8 +475,8 @@ def log_prob(self, x): above = torch.clip(above, 0, len(self.buckets) - 1) equal = (below == above) - dist_to_below = torch.where(equal, 1, torch.abs(self.buckets[below] - x)) - dist_to_above = torch.where(equal, 1, torch.abs(self.buckets[above] - x)) + dist_to_below = torch.where(equal, torch.tensor(1).to(x), torch.abs(self.buckets[below] - x)) + dist_to_above = torch.where(equal, torch.tensor(1).to(x), torch.abs(self.buckets[above] - x)) total = dist_to_below + dist_to_above weight_below = dist_to_above / total weight_above = dist_to_below / total diff --git a/ding/world_model/dreamer.py b/ding/world_model/dreamer.py index eafe257454..ceac6fa082 100644 --- a/ding/world_model/dreamer.py +++ b/ding/world_model/dreamer.py @@ -5,10 +5,10 @@ from ding.utils import WORLD_MODEL_REGISTRY, lists_to_dicts from ding.utils.data import default_collate -from ding.model import ConvEncoder +from ding.model import ConvEncoder, FCEncoder from ding.world_model.base_world_model import WorldModel from ding.world_model.model.networks import RSSM, ConvDecoder -from ding.torch_utils import to_device +from ding.torch_utils import to_device, one_hot from ding.torch_utils.network.dreamer import DenseHead @@ -37,6 +37,7 @@ class DREAMERWorldModel(WorldModel, nn.Module): norm='LayerNorm', grad_heads=['image', 'reward', 'discount'], units=512, + image_dec_layers=2, reward_layers=2, discount_layers=2, value_layers=2, @@ -71,26 +72,33 @@ def __init__(self, cfg, env, tb_logger): self._cfg.act = nn.modules.activation.SiLU # nn.SiLU self._cfg.norm = nn.modules.normalization.LayerNorm # nn.LayerNorm self.state_size = self._cfg.state_size + self.obs_type = 
self._cfg.obs_type self.action_size = self._cfg.action_size + self.action_type = self._cfg.action_type self.reward_size = self._cfg.reward_size self.hidden_size = self._cfg.hidden_size self.batch_size = self._cfg.batch_size + if self.obs_type == 'vector': + self.encoder = FCEncoder(self.state_size, self._cfg.encoder_hidden_size_list, activation=torch.nn.SiLU()) + self.embed_size = self._cfg.encoder_hidden_size_list[-1] + elif self.obs_type == 'RGB': + self.encoder = ConvEncoder( + self.state_size, + hidden_size_list=[32, 64, 128, 256, 4096], # to last layer 128? + activation=torch.nn.SiLU(), + kernel_size=self._cfg.encoder_kernels, + layer_norm=True + ) + self.embed_size = ( + (self.state_size[1] // 2 ** (len(self._cfg.encoder_kernels))) ** 2 * self._cfg.cnn_depth * + 2 ** (len(self._cfg.encoder_kernels) - 1) + ) - self.encoder = ConvEncoder( - self.state_size, - hidden_size_list=[32, 64, 128, 256, 4096], # to last layer 128? - activation=torch.nn.SiLU(), - kernel_size=self._cfg.encoder_kernels, - layer_norm=True - ) - self.embed_size = ( - (self.state_size[1] // 2 ** (len(self._cfg.encoder_kernels))) ** 2 * self._cfg.cnn_depth * - 2 ** (len(self._cfg.encoder_kernels) - 1) - ) self.dynamics = RSSM( self._cfg.dyn_stoch, self._cfg.dyn_deter, self._cfg.dyn_hidden, + self._cfg.action_type, self._cfg.dyn_input_layers, self._cfg.dyn_output_layers, self._cfg.dyn_rec_depth, @@ -113,14 +121,28 @@ def __init__(self, cfg, env, tb_logger): feat_size = self._cfg.dyn_stoch * self._cfg.dyn_discrete + self._cfg.dyn_deter else: feat_size = self._cfg.dyn_stoch + self._cfg.dyn_deter - self.heads["image"] = ConvDecoder( - feat_size, # pytorch version - self._cfg.cnn_depth, - self._cfg.act, - self._cfg.norm, - self.state_size, - self._cfg.decoder_kernels, - ) + + if isinstance(self.state_size, int): + self.heads['image'] = DenseHead( + feat_size, + (self.state_size, ), + self._cfg.image_dec_layers, + self._cfg.units, + 'SiLU', # self._cfg.act + 'LN', # self._cfg.norm + dist='binary', + outscale=0.0, + device=self._cfg.device, + ) + elif len(self.state_size) == 3: + self.heads["image"] = ConvDecoder( + feat_size, # pytorch version + self._cfg.cnn_depth, + self._cfg.act, + self._cfg.norm, + self.state_size, + self._cfg.decoder_kernels, + ) self.heads["reward"] = DenseHead( feat_size, # dyn_stoch * dyn_discrete + dyn_deter (255, ), @@ -172,9 +194,15 @@ def train(self, env_buffer, envstep, train_iter, batch_size, batch_length): data = {k: torch.stack(data[k], dim=1) for k in data} # -> {dict_key: Tensor([B, T, any_dims])} data['discount'] = data.get('discount', 1.0 - data['done'].float()) - data['discount'] *= 0.997 data['weight'] = data.get('weight', None) - data['image'] = data['obs'] - 0.5 + if self.obs_type == 'RGB': + data['image'] = data['obs'] - 0.5 + else: + data['image'] = data['obs'] + if self.action_type == 'continuous': + data['action'] *= (1.0 / torch.clip(torch.abs(data['action']), min=1.0)) + else: + data['action'] = one_hot(data['action'], self.action_size) data = to_device(data, self._cfg.device) if len(data['reward'].shape) == 2: data['reward'] = data['reward'].unsqueeze(-1) @@ -185,9 +213,9 @@ def train(self, env_buffer, envstep, train_iter, batch_size, batch_length): self.requires_grad_(requires_grad=True) - image = data['image'].reshape([-1] + list(data['image'].shape[-3:])) + image = data['image'].reshape([-1] + list(data['image'].shape[2:])) embed = self.encoder(image) - embed = embed.reshape(list(data['image'].shape[:-3]) + [embed.shape[-1]]) + embed = 
embed.reshape(list(data['image'].shape[:2]) + [embed.shape[-1]]) post, prior = self.dynamics.observe(embed, data["action"]) kl_loss, kl_value, loss_lhs, loss_rhs = self.dynamics.kl_loss( diff --git a/ding/world_model/model/networks.py b/ding/world_model/model/networks.py index 091fa4f827..2e47cc6604 100644 --- a/ding/world_model/model/networks.py +++ b/ding/world_model/model/networks.py @@ -1,11 +1,12 @@ import math import numpy as np +from typing import Optional, Dict, Union, List import torch from torch import nn import torch.nn.functional as F from torch import distributions as torchd - +from ding.utils import SequenceType from ding.torch_utils.network.dreamer import weight_init, uniform_weight_init, static_scan, \ OneHotDist, ContDist, SymlogDist, DreamerLayerNorm @@ -17,6 +18,7 @@ def __init__( stoch=30, deter=200, hidden=200, + action_type=None, layers_input=1, layers_output=1, rec_depth=1, @@ -38,6 +40,7 @@ def __init__( self._stoch = stoch self._deter = deter self._hidden = hidden + self._action_type = action_type self._min_std = min_std self._layers_input = layers_input self._layers_output = layers_output @@ -179,7 +182,8 @@ def get_dist(self, state, dtype=None): def obs_step(self, prev_state, prev_action, embed, sample=True): # if shared is True, prior and post both use same networks(inp_layers, _img_out_layers, _ims_stat_layer) # otherwise, post use different network(_obs_out_layers) with prior[deter] and embed as inputs - prev_action *= (1.0 / torch.clip(torch.abs(prev_action), min=1.0)).detach() + if self._action_type == 'continuous': + prev_action *= (1.0 / torch.clip(torch.abs(prev_action), min=1.0)).detach() prior = self.img_step(prev_state, prev_action, None, sample) if self._shared: post = self.img_step(prev_state, prev_action, embed, sample) @@ -202,7 +206,8 @@ def obs_step(self, prev_state, prev_action, embed, sample=True): # this is used for making future image def img_step(self, prev_state, prev_action, embed=None, sample=True): # (batch, stoch, discrete_num) - prev_action *= (1.0 / torch.clip(torch.abs(prev_action), min=1.0)).detach() + if self._action_type == 'continuous': + prev_action *= (1.0 / torch.clip(torch.abs(prev_action), min=1.0)).detach() prev_stoch = prev_state["stoch"] if self._discrete: shape = list(prev_stoch.shape[:-2]) + [self._stoch * self._discrete] @@ -282,8 +287,9 @@ def kl_loss(self, post, prior, forward, free, lscale, rscale): dist(sg(lhs)) if self._discrete else dist(sg(lhs))._dist, dist(rhs) if self._discrete else dist(rhs)._dist, ) - loss_lhs = torch.clip(torch.mean(value_lhs), min=free) - loss_rhs = torch.clip(torch.mean(value_rhs), min=free) + # free bits + loss_lhs = torch.mean(torch.clip(value_lhs, min=free)) + loss_rhs = torch.mean(torch.clip(value_rhs, min=free)) loss = lscale * loss_lhs + rscale * loss_rhs return loss, value, loss_lhs, loss_rhs @@ -357,7 +363,7 @@ def calc_same_pad(self, k, s, d): outpad = pad * 2 - val return pad, outpad - def __call__(self, features, dtype=None): + def __call__(self, features): x = self._linear_layer(features) # feature:[batch, time, stoch*discrete + deter] x = x.reshape([-1, 4, 4, self._embed_size // 16]) x = x.permute(0, 3, 1, 2) diff --git a/dizoo/dmc2gym/config/cartpole_balance/cartpole_balance_dreamer_config.py b/dizoo/dmc2gym/config/cartpole_balance/cartpole_balance_dreamer_config.py index 66f7c7e2a4..623cfaacf1 100644 --- a/dizoo/dmc2gym/config/cartpole_balance/cartpole_balance_dreamer_config.py +++ b/dizoo/dmc2gym/config/cartpole_balance/cartpole_balance_dreamer_config.py @@ -60,7 +60,9 @@ 
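The kl_loss change above moves the free-bits clamp inside the mean: instead of clamping the averaged KL at the free threshold, every element is clamped first and then averaged, so latent dimensions whose KL is already below the threshold stop contributing gradient while the remaining ones still do. A standalone numerical sketch of the difference (example values, not code from the patch):

    import torch

    kl = torch.tensor([0.2, 3.0])   # hypothetical per-element KL values
    free = 1.0                      # free-bits threshold

    old = torch.clip(torch.mean(kl), min=free)   # clamp the mean: 1.6
    new = torch.mean(torch.clip(kl, min=free))   # clamp per element, then average: 2.0
    print(old.item(), new.item())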
cuda=cuda, model=dict( state_size=(3, 64, 64), # has to be specified + obs_type='RGB', action_size=1, # has to be specified + action_type='continuous', reward_size=1, batch_size=16, ), diff --git a/dizoo/dmc2gym/config/cheetah_run/cheetah_run_dreamer_config.py b/dizoo/dmc2gym/config/cheetah_run/cheetah_run_dreamer_config.py index 32a43463e7..22b6ae911b 100644 --- a/dizoo/dmc2gym/config/cheetah_run/cheetah_run_dreamer_config.py +++ b/dizoo/dmc2gym/config/cheetah_run/cheetah_run_dreamer_config.py @@ -60,7 +60,9 @@ cuda=cuda, model=dict( state_size=(3, 64, 64), # has to be specified + obs_type='RGB', action_size=6, # has to be specified + action_type='continuous', reward_size=1, batch_size=16, ), diff --git a/dizoo/dmc2gym/config/walker_walk/walker_walk_dreamer_config.py b/dizoo/dmc2gym/config/walker_walk/walker_walk_dreamer_config.py index 16e76eac39..da7f9e0edb 100644 --- a/dizoo/dmc2gym/config/walker_walk/walker_walk_dreamer_config.py +++ b/dizoo/dmc2gym/config/walker_walk/walker_walk_dreamer_config.py @@ -28,7 +28,6 @@ # it is better to put random_collect_size in policy.other random_collect_size=2500, model=dict( - obs_shape=(3, 64, 64), action_shape=6, actor_dist='normal', ), @@ -60,7 +59,9 @@ cuda=cuda, model=dict( state_size=(3, 64, 64), # has to be specified + obs_type='RGB', action_size=6, # has to be specified + action_type='continuous', reward_size=1, batch_size=16, ), diff --git a/dizoo/minigrid/config/minigrid_dreamer_config.py b/dizoo/minigrid/config/minigrid_dreamer_config.py new file mode 100644 index 0000000000..410f803d96 --- /dev/null +++ b/dizoo/minigrid/config/minigrid_dreamer_config.py @@ -0,0 +1,96 @@ +from easydict import EasyDict + +from ding.entry import serial_pipeline_dreamer + +cuda = False +collector_env_num = 8 +evaluator_env_num = 5 +minigrid_dreamer_config = dict( + exp_name='minigrid_dreamer_empty', + env=dict( + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + # typical MiniGrid env id: + # {'MiniGrid-Empty-8x8-v0', 'MiniGrid-FourRooms-v0', 'MiniGrid-DoorKey-8x8-v0','MiniGrid-DoorKey-16x16-v0'}, + # please refer to https://github.com/Farama-Foundation/MiniGrid for details. 
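These dmc2gym configs now declare action_type='continuous', which keeps the action-rescaling path in DREAMERWorldModel.train and RSSM.obs_step/img_step active; the discrete MiniGrid config that follows skips it and one-hot encodes actions instead. A standalone sketch of what that rescaling does (example values, not from the patch):

    import torch

    def rescale_action(action: torch.Tensor) -> torch.Tensor:
        # Same expression the patch gates on action_type == 'continuous':
        # components with |a| > 1 are pulled back onto the unit box,
        # components already inside [-1, 1] are left untouched.
        return action * (1.0 / torch.clip(torch.abs(action), min=1.0))

    print(rescale_action(torch.tensor([-2.0, 0.3, 1.5])))   # tensor([-1.0000,  0.3000,  1.0000])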
+ env_id='MiniGrid-Empty-8x8-v0', + # env_id='MiniGrid-AKTDT-7x7-1-v0', + max_step=100, + stop_value=20, # run fixed env_steps + # stop_value=0.96, + flat_obs=True, + full_obs=True, + onehot_obs=True, + move_bonus=True, + ), + policy=dict( + cuda=cuda, + # it is better to put random_collect_size in policy.other + random_collect_size=2500, + model=dict( + action_shape=7, + # encoder_hidden_size_list=[256, 128, 64, 64], + # critic_head_hidden_size=64, + # actor_head_hidden_size=64, + actor_dist='onehot', + ), + learn=dict( + lambda_=0.95, + learning_rate=3e-5, + batch_size=16, + batch_length=64, + imag_sample=True, + discount=0.997, + reward_EMA=True, + ), + collect=dict( + n_sample=1, + unroll_len=1, + action_size=7, # has to be specified + collect_dyn_sample=True, + ), + eval=dict(evaluator=dict(eval_freq=5000, )), + other=dict( + # environment buffer + replay_buffer=dict(replay_buffer_size=500000, periodic_thruput_seconds=60), + ), + ), + world_model=dict( + pretrain=100, + train_freq=2, + cuda=cuda, + model=dict( + state_size=1344, + obs_type = 'vector', + action_size=7, + action_type='discrete', + encoder_hidden_size_list=[256, 128, 64, 64], + reward_size=1, + batch_size=16, + ), + ), +) + +minigrid_dreamer_config = EasyDict(minigrid_dreamer_config) + +minigrid_create_config = dict( + env=dict( + type='minigrid', + import_names=['dizoo.minigrid.envs.minigrid_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='dreamer', + import_names=['ding.policy.mbpolicy.dreamer'], + ), + replay_buffer=dict(type='sequence', ), + world_model=dict( + type='dreamer', + import_names=['ding.world_model.dreamer'], + ), +) +minigrid_create_config = EasyDict(minigrid_create_config) + +if __name__ == '__main__': + serial_pipeline_dreamer((minigrid_dreamer_config, minigrid_create_config), seed=0, max_env_step=500000) diff --git a/dizoo/minigrid/envs/minigrid_env.py b/dizoo/minigrid/envs/minigrid_env.py index 12bd64cae0..923967394e 100644 --- a/dizoo/minigrid/envs/minigrid_env.py +++ b/dizoo/minigrid/envs/minigrid_env.py @@ -9,8 +9,8 @@ import numpy as np from matplotlib import animation import matplotlib.pyplot as plt -from minigrid.wrappers import FlatObsWrapper, RGBImgPartialObsWrapper, ImgObsWrapper -from .minigrid_wrapper import ViewSizeWrapper +from minigrid.wrappers import FullyObsWrapper +from .minigrid_wrapper import ViewSizeWrapper, MoveBonus, OneHotObsWrapper, FlatObsWrapper from ding.envs import ObsPlusPrevActRewWrapper from ding.envs import BaseEnv, BaseEnvTimestep @@ -36,6 +36,9 @@ def __init__(self, cfg: dict) -> None: self._init_flag = False self._env_id = cfg.env_id self._flat_obs = cfg.flat_obs + self._full_obs = cfg.full_obs + self._onehot_obs = cfg.onehot_obs + self._move_bonus = cfg.move_bonus self._save_replay = False self._max_step = cfg.max_step @@ -52,6 +55,12 @@ def reset(self) -> np.ndarray: self._env = ViewSizeWrapper(self._env, agent_view_size=5) if self._env_id == 'MiniGrid-AKTDT-7x7-1-v0': self._env = ViewSizeWrapper(self._env, agent_view_size=3) + if self._full_obs: + self._env = FullyObsWrapper(self._env) + if self._onehot_obs: + self._env = OneHotObsWrapper(self._env) + if self._move_bonus: + self._env = MoveBonus(self._env) if self._flat_obs: self._env = FlatObsWrapper(self._env) # self._env = RGBImgPartialObsWrapper(self._env) @@ -60,7 +69,7 @@ def reset(self) -> np.ndarray: self._env = ObsPlusPrevActRewWrapper(self._env) self._init_flag = True if self._flat_obs: - self._observation_space = gym.spaces.Box(0, 1, shape=(2835, ), dtype=np.float32) + 
self._observation_space = gym.spaces.Box(0, 1, shape=self._env.observation_space.shape, dtype=np.float32) else: self._observation_space = self._env.observation_space # to be compatiable with subprocess env manager diff --git a/dizoo/minigrid/envs/minigrid_wrapper.py b/dizoo/minigrid/envs/minigrid_wrapper.py index 09a14c9c81..72683d159d 100644 --- a/dizoo/minigrid/envs/minigrid_wrapper.py +++ b/dizoo/minigrid/envs/minigrid_wrapper.py @@ -1,6 +1,10 @@ import gymnasium as gym from gymnasium import spaces -from gymnasium.core import ObservationWrapper +from gymnasium.core import ObservationWrapper, Wrapper +import numpy as np +import operator +from functools import reduce +from minigrid.core.constants import COLOR_TO_IDX, OBJECT_TO_IDX, STATE_TO_IDX class ViewSizeWrapper(ObservationWrapper): @@ -32,3 +36,161 @@ def observation(self, obs): # print('vis_mask:' + vis_mask) image = grid.encode(vis_mask) return {**obs, "image": image} + + +class MoveBonus(Wrapper): + """ + Adds an movement bonus based on which positions + are visited on the grid. + + Example: + >>> import gymnasium as gym + >>> from minigrid.wrappers import PositionBonus + >>> env = gym.make("MiniGrid-Empty-5x5-v0") + >>> _, _ = env.reset(seed=0) + >>> _, reward, _, _, _ = env.step(1) + >>> print(reward) + 0 + >>> _, reward, _, _, _ = env.step(1) + >>> print(reward) + 0 + >>> env_bonus = MoveBonus(env) + >>> obs, _ = env_bonus.reset(seed=0) + >>> obs, reward, terminated, truncated, info = env_bonus.step(1) + >>> print(reward) + 1.0 + >>> obs, reward, terminated, truncated, info = env_bonus.step(1) + >>> print(reward) + 0.7071067811865475 + """ + + def __init__(self, env): + """A wrapper that adds an exploration bonus to less visited positions. + + Args: + env: The environment to apply the wrapper + """ + super().__init__(env) + self.goal_pos = (self.width - 2, self.height - 2) + self.scale = np.sqrt(self.width ** 2 + self.height ** 2) + + def step(self, action): + """Steps through the environment with `action`.""" + + cur_dis = np.sqrt( + (self.goal_pos[0] - self.env.agent_pos[0]) ** 2 + (self.goal_pos[1] - self.env.agent_pos[1]) ** 2 + ) + obs, reward, terminated, truncated, info = self.env.step(action) + tmp_dis = np.sqrt( + (self.goal_pos[0] - self.env.agent_pos[0]) ** 2 + (self.goal_pos[1] - self.env.agent_pos[1]) ** 2 + ) + + move_bonus = (cur_dis - tmp_dis) / self.scale + reward += move_bonus + + return obs, reward, terminated, truncated, info + + +class OneHotObsWrapper(ObservationWrapper): + """ + Wrapper to get a one-hot encoding of a partially observable + agent view as observation. 
+ + Example: + >>> import gymnasium as gym + >>> from minigrid.wrappers import OneHotPartialObsWrapper + >>> env = gym.make("MiniGrid-Empty-5x5-v0") + >>> obs, _ = env.reset() + >>> obs["image"][0, :, :] + array([[2, 5, 0], + [2, 5, 0], + [2, 5, 0], + [2, 5, 0], + [2, 5, 0], + [2, 5, 0], + [2, 5, 0]], dtype=uint8) + >>> env = OneHotPartialObsWrapper(env) + >>> obs, _ = env.reset() + >>> obs["image"][0, :, :] + array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]], + dtype=uint8) + """ + + def __init__(self, env): + """A wrapper that makes the image observation a one-hot encoding of a partially observable agent view. + + Args: + env: The environment to apply the wrapper + """ + super().__init__(env) + + obs_shape = env.observation_space["image"].shape + + # Number of bits per cell + num_bits = len(OBJECT_TO_IDX) + len(COLOR_TO_IDX) + len(STATE_TO_IDX) + 1 + + new_image_space = spaces.Box(low=0, high=1, shape=(obs_shape[0], obs_shape[1], num_bits), dtype="float32") + self.observation_space = spaces.Dict({**self.observation_space.spaces, "image": new_image_space}) + + def observation(self, obs): + img = obs["image"] + out = np.zeros(self.observation_space.spaces["image"].shape, dtype="float32") + + for i in range(img.shape[0]): + for j in range(img.shape[1]): + type = img[i, j, 0] + color = img[i, j, 1] + state = img[i, j, 2] + + out[i, j, type] = 1 + out[i, j, len(OBJECT_TO_IDX) + color] = 1 + out[i, j, len(OBJECT_TO_IDX) + len(COLOR_TO_IDX) + state] = 1 + + return {**obs, "image": out} + + +class FlatObsWrapper(ObservationWrapper): + """ + Encode mission strings using a one-hot scheme, + and combine these with observed images into one flat array. + + This wrapper is not applicable to BabyAI environments, given that these have their own language component. + + Example: + >>> import gymnasium as gym + >>> import matplotlib.pyplot as plt + >>> from minigrid.wrappers import FlatObsWrapper + >>> env = gym.make("MiniGrid-LavaCrossingS11N5-v0") + >>> env_obs = FlatObsWrapper(env) + >>> obs, _ = env_obs.reset() + >>> obs.shape + (2835,) + """ + + def __init__(self, env): + super().__init__(env) + + imgSpace = env.observation_space.spaces["image"] + imgSize = reduce(operator.mul, imgSpace.shape, 1) + + self.observation_space = spaces.Box( + low=0, + high=255, + shape=(imgSize, ), + dtype="float32", + ) + + self.cachedStr: str = None + + def observation(self, obs): + img = obs["image"] + + img = img.flatten() + + return img
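MoveBonus above is a distance-based shaping term rather than a visit-count bonus: each step is rewarded by how much the Euclidean distance between the agent and the goal cell (taken to be (width - 2, height - 2)) shrinks, normalized by the grid diagonal, so actions that only turn the agent in place add nothing. A standalone sketch of the bonus on a default 8x8 grid (positions are made up for illustration):

    import numpy as np

    width, height = 8, 8
    goal_pos = (width - 2, height - 2)            # same convention as the wrapper
    scale = np.sqrt(width ** 2 + height ** 2)     # grid diagonal used as the normalizer

    def move_bonus(pos_before, pos_after):
        dist = lambda p: np.sqrt((goal_pos[0] - p[0]) ** 2 + (goal_pos[1] - p[1]) ** 2)
        return (dist(pos_before) - dist(pos_after)) / scale

    print(move_bonus((1, 1), (2, 1)))   # one step toward the goal -> ~0.059
    print(move_bonus((2, 1), (2, 1)))   # turning in place -> 0.0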
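The wrapper stack this patch enables for MiniGrid (FullyObsWrapper -> OneHotObsWrapper -> MoveBonus -> FlatObsWrapper) is also what determines state_size=1344 in minigrid_dreamer_config: assuming MiniGrid's default constant tables (11 object types, 6 colors, 3 states), OneHotObsWrapper emits 21 bits per cell, and the fully observed 8x8 grid flattens to 8 * 8 * 21 = 1344 values fed to the FCEncoder. A rough bookkeeping sketch under those assumptions:

    # Assumes the default minigrid constant tables; the exact sizes come from
    # len(OBJECT_TO_IDX), len(COLOR_TO_IDX) and len(STATE_TO_IDX) at runtime.
    grid_w, grid_h = 8, 8               # MiniGrid-Empty-8x8-v0 under FullyObsWrapper
    num_bits = 11 + 6 + 3 + 1           # object + color + state one-hot bits (+1 unused)
    flat_obs_size = grid_w * grid_h * num_bits
    print(flat_obs_size)                # 1344, matching model.state_size in the config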