diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index 5914b18d98..83fab06048 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -15,13 +15,28 @@ class DingEnvWrapper(BaseEnv): + """ + Overview: + This is a wrapper for the BaseEnv class, used to provide a consistent environment interface. + Interfaces: + __init__, reset, step, close, seed, random_action, _wrap_env, __repr__, create_collector_env_cfg, + create_evaluator_env_cfg, enable_save_replay, observation_space, action_space, reward_space, clone + """ def __init__(self, env: gym.Env = None, cfg: dict = None, seed_api: bool = True, caller: str = 'collector') -> None: """ - You can pass in either an env instance, or a config to create an env instance: - - An env instance: Parameter `env` must not be `None`, but should be the instance. - Do not support subprocess env manager; Thus usually used in simple env. - - A config to create an env instance: Parameter `cfg` dict must contain `env_id`. + Overview: + Initialize the DingEnvWrapper. Either an environment instance or a config to create the environment + instance should be passed in: + - An environment instance: The `env` parameter must not be `None`, but should be the instance. + It does not support subprocess environment manager. Thus, it is usually used in simple environments. + - A config to create an environment instance: The `cfg` parameter must contain `env_id`. + Arguments: + - env (:obj:`gym.Env`): An environment instance to be wrapped. + - cfg (:obj:`dict`): The configuration dictionary to create an environment instance. + - seed_api (:obj:`bool`): Whether to use seed API. Defaults to True. + - caller (:obj:`str`): A string representing the caller of this method, including ``collector`` or + ``evaluator``. Different caller may need different wrappers. Default is 'collector'. """ self._env = None self._raw_env = env @@ -59,7 +74,13 @@ def __init__(self, env: gym.Env = None, cfg: dict = None, seed_api: bool = True, self._replay_path = None # override - def reset(self) -> None: + def reset(self) -> np.ndarray: + """ + Overview: + Resets the state of the environment. If the environment is not initialized, it will be created first. + Returns: + - obs (:obj:`Dict`): The new observation after reset. + """ if not self._init_flag: self._env = gym.make(self._cfg.env_id) self._wrap_env(self._caller) @@ -108,6 +129,12 @@ def reset(self) -> None: # override def close(self) -> None: + """ + Overview: + Clean up the environment by closing and deleting it. + This method should be called when the environment is no longer needed. + Failing to call this method can lead to memory leaks. + """ try: self._env.close() del self._env @@ -116,12 +143,27 @@ def close(self) -> None: # override def seed(self, seed: int, dynamic_seed: bool = True) -> None: + """ + Overview: + Set the seed for the environment. + Arguments: + - seed (:obj:`int`): The seed to set. + - dynamic_seed (:obj:`bool`): Whether to use dynamic seed, default is True. + """ self._seed = seed self._dynamic_seed = dynamic_seed np.random.seed(self._seed) # override def step(self, action: Union[np.int64, np.ndarray]) -> BaseEnvTimestep: + """ + Overview: + Execute the given action in the environment, and return the timestep (observation, reward, done, info). + Arguments: + - action (:obj:`Union[np.int64, np.ndarray]`): The action to execute in the environment. + Returns: + - timestep (:obj:`BaseEnvTimestep`): The timestep after the action execution. 
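For readers of this docstring, a minimal usage sketch of the two construction modes may help. The config keys besides ``env_id`` (``env_wrapper``, ``act_scale``) are inferred from attributes referenced later in this file and are assumptions rather than a verified minimal config.

import gym
from easydict import EasyDict
from ding.envs.env.ding_env_wrapper import DingEnvWrapper

# Mode 1: wrap an existing environment instance directly.
env = DingEnvWrapper(env=gym.make('CartPole-v0'))

# Mode 2: pass a config; the environment is created lazily on the first reset().
cfg = EasyDict(env_id='CartPole-v0', env_wrapper='default', act_scale=False)  # assumed keys
env = DingEnvWrapper(cfg=cfg, caller='collector')

env.seed(0)
obs = env.reset()
timestep = env.step(env.random_action())  # BaseEnvTimestep(obs, reward, done, info)
env.close()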
+ """ action = self._judge_action_type(action) if self._cfg.act_scale: action = affine_transform(action, min_val=self._env.action_space.low, max_val=self._env.action_space.high) @@ -137,6 +179,15 @@ def step(self, action: Union[np.int64, np.ndarray]) -> BaseEnvTimestep: return BaseEnvTimestep(obs, rew, done, info) def _judge_action_type(self, action: Union[np.ndarray, dict]) -> Union[np.ndarray, dict]: + """ + Overview: + Ensure the action taken by the agent is of the correct type. + This method is used to standardize different action types to a common format. + Arguments: + - action (Union[np.ndarray, dict]): The action taken by the agent. + Returns: + - action (Union[np.ndarray, dict]): The formatted action. + """ if isinstance(action, int): return action elif isinstance(action, np.int64): @@ -161,6 +212,12 @@ def _judge_action_type(self, action: Union[np.ndarray, dict]) -> Union[np.ndarra ) def random_action(self) -> np.ndarray: + """ + Overview: + Return a random action from the action space of the environment. + Returns: + - action (:obj:`np.ndarray`): The random action. + """ random_action = self.action_space.sample() if isinstance(random_action, np.ndarray): pass @@ -177,6 +234,13 @@ def random_action(self) -> np.ndarray: return random_action def _wrap_env(self, caller: str = 'collector') -> None: + """ + Overview: + Wrap the environment according to the configuration. + Arguments: + - caller (:obj:`str`): The caller of the environment, including ``collector`` or ``evaluator``. \ + Different caller may need different wrappers. Default is 'collector'. + """ # wrapper_cfgs: Union[str, List] wrapper_cfgs = self._cfg.env_wrapper if isinstance(wrapper_cfgs, str): @@ -191,10 +255,24 @@ def _wrap_env(self, caller: str = 'collector') -> None: self._env = wrapper(self._env) def __repr__(self) -> str: + """ + Overview: + Return the string representation of the instance. + Returns: + - str (:obj:`str`): The string representation of the instance. + """ return "DI-engine Env({}), generated by DingEnvWrapper".format(self._cfg.env_id) @staticmethod def create_collector_env_cfg(cfg: dict) -> List[dict]: + """ + Overview: + Create a list of environment configuration for collectors based on the input configuration. + Arguments: + - cfg (:obj:`dict`): The input configuration dictionary. + Returns: + - env_cfgs (:obj:`List[dict]`): The list of environment configurations for collectors. + """ actor_env_num = cfg.pop('collector_env_num') cfg = copy.deepcopy(cfg) cfg.is_train = True @@ -202,31 +280,81 @@ def create_collector_env_cfg(cfg: dict) -> List[dict]: @staticmethod def create_evaluator_env_cfg(cfg: dict) -> List[dict]: + """ + Overview: + Create a list of environment configuration for evaluators based on the input configuration. + Arguments: + - cfg (:obj:`dict`): The input configuration dictionary. + Returns: + - env_cfgs (:obj:`List[dict]`): The list of environment configurations for evaluators. + """ evaluator_env_num = cfg.pop('evaluator_env_num') cfg = copy.deepcopy(cfg) cfg.is_train = False return [cfg for _ in range(evaluator_env_num)] def enable_save_replay(self, replay_path: Optional[str] = None) -> None: + """ + Overview: + Enable the save replay functionality. The replay will be saved at the specified path. + Arguments: + - replay_path (:obj:`Optional[str]`): The path to save the replay, default is None. 
+ """ if replay_path is None: replay_path = './video' self._replay_path = replay_path @property def observation_space(self) -> gym.spaces.Space: + """ + Overview: + Return the observation space of the wrapped environment. + The observation space represents the range and shape of possible observations + that the environment can provide to the agent. + Note: + If the data type of the observation space is float64, it's converted to float32 + for better compatibility with most machine learning libraries. + Returns: + - observation_space (gym.spaces.Space): The observation space of the environment. + """ if self._observation_space.dtype == np.float64: self._observation_space.dtype = np.float32 return self._observation_space @property def action_space(self) -> gym.spaces.Space: + """ + Overview: + Return the action space of the wrapped environment. + The action space represents the range and shape of possible actions + that the agent can take in the environment. + Returns: + - action_space (gym.spaces.Space): The action space of the environment. + """ return self._action_space @property def reward_space(self) -> gym.spaces.Space: + """ + Overview: + Return the reward space of the wrapped environment. + The reward space represents the range and shape of possible rewards + that the agent can receive as a result of its actions. + Returns: + - reward_space (gym.spaces.Space): The reward space of the environment. + """ return self._reward_space def clone(self, caller: str = 'collector') -> BaseEnv: + """ + Overview: + Clone the current environment wrapper, creating a new environment with the same settings. + Arguments: + - caller (str): A string representing the caller of this method, including ``collector`` or ``evaluator``. \ + Different caller may need different wrappers. Default is 'collector'. + Returns: + - DingEnvWrapper: A new instance of the environment with the same settings. + """ try: spec = copy.deepcopy(self._raw_env.spec) raw_env = CloudPickleWrapper(self._raw_env) diff --git a/ding/envs/env_wrappers/env_wrappers.py b/ding/envs/env_wrappers/env_wrappers.py index 75aa67c73c..76c0880d59 100644 --- a/ding/envs/env_wrappers/env_wrappers.py +++ b/ding/envs/env_wrappers/env_wrappers.py @@ -1,76 +1,93 @@ -# Borrow a lot from openai baselines: -# https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py +""" +This code is adapted from OpenAI Baselines: + https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py + +List of Environment Wrappers: +- NoopResetWrapper: This wrapper facilitates the sampling of initial states by executing a random number of + no-operation actions upon environment reset. +- MaxAndSkipWrapper: Incorporates max pooling across time steps, a method that reduces the temporal dimension by taking + the maximum value over specified time intervals. +- WarpFrameWrapper: Implements frame warping by resizing the images to 84x84, a common preprocessing step in + reinforcement learning on visual data, as described in the DeepMind Nature paper and subsequent works. +- ScaledFloatFrameWrapper: Normalizes observations to a range of 0 to 1, which is a common requirement for neural + network inputs. +- ClipRewardWrapper: Clips the reward to {-1, 0, +1} based on its sign. This simplifies the reward structure and + can make learning more stable in environments with high variance in rewards. +- DelayRewardWrapper: Returns cumulative reward at defined intervals, and at all other times, returns a reward of 0. 
+ This can be useful for sparse reward problems. +- FrameStackWrapper: Stacks the latest 'n' frames as a single observation. This allows the agent to have a sense of + dynamics and motion from the stacked frames. +- ObsTransposeWrapper: Transposes the observation to bring the channel to the first dimension, a common requirement + for convolutional neural networks. +- ObsNormWrapper: Normalizes observations based on a running mean and standard deviation. This can help to standardize + inputs for the agent and speed up learning. +- RewardNormWrapper: Normalizes reward based on a running standard deviation, which can stabilize learning in + environments with high variance in rewards. +- RamWrapper: Wraps a RAM-based environment into an image-like environment. This can be useful for applying + image-based algorithms to RAM-based Atari games. +- EpisodicLifeWrapper: Treats end of life as the end of an episode, but only resets on true game over. This can help + the agent better differentiate between losing a life and losing the game. +- FireResetWrapper: Executes the 'fire' action upon environment reset. This is specific to certain Atari games where + the 'fire' action starts the game. +- GymHybridDictActionWrapper: Transforms the original `gym.spaces.Tuple` action space into a `gym.spaces.Dict`. +- FlatObsWrapper: Flattens image and language observations into a single vector, which can be helpful for input into + certain types of models. +- StaticObsNormWrapper: Provides functionality for normalizing observations according to a static mean and + standard deviation. +- EvalEpisodeReturnWrapper: Evaluates the return over an episode during evaluation, providing a more comprehensive + view of the agent's performance. +- GymToGymnasiumWrapper: Adapts environments from the Gym library to be compatible with the Gymnasium library. +- AllinObsWrapper: Consolidates all information into the observation, useful for environments where the agent's + observation should include additional information such as the current score or time remaining. +""" -from typing import Union, List, Tuple -from easydict import EasyDict -from functools import reduce -from collections import deque import copy import operator +from collections import deque +from functools import reduce +from typing import Union, Any, Tuple, Dict, List + import gym import gymnasium import numpy as np -from torch import float32 +from easydict import EasyDict from ding.torch_utils import to_ndarray from ding.utils import ENV_WRAPPER_REGISTRY, import_module -""" - -Env Wrapper List: - - NoopResetWrapper: Sample initial states by taking random number of no-ops on reset. - - MaxAndSkipWrapper: Max pooling across time steps - - WarpFrameWrapper: Warp frames to 84x84 as done in the Nature paper and later work. - - ScaledFloatFrameWrapper: Normalize observations to 0~1. - - ClipRewardWrapper: Clip the reward to {+1, 0, -1} by its sign. - - DelayRewardWrapper: Return cumulative reward at intervals; At other time, return reward of 0. - - FrameStackWrapper: Stack latest n frames(usually 4 in Atari) as one observation. - - ObsTransposeWrapper: Transpose observation to put channel to first dim. - - ObsNormWrapper: Normalize observations according to running mean and std. - - RewardNormWrapper: Normalize reward according to running std. - - RamWrapper: Wrap ram env into image-like env - - EpisodicLifeWrapper: Make end-of-life == end-of-episode, but only reset on true game over. - - FireResetWrapper: Take fire action at environment reset. 
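To show how the wrappers in the list above are typically composed, here is a sketch of a standard Atari preprocessing chain; the environment id and the ALE/Atari dependency are assumptions.

import gym
from ding.envs.env_wrappers.env_wrappers import (
    NoopResetWrapper, MaxAndSkipWrapper, WarpFrameWrapper,
    ScaledFloatFrameWrapper, ClipRewardWrapper, FrameStackWrapper
)

env = gym.make('PongNoFrameskip-v4')       # assumes gym[atari] / ale-py is installed
env = NoopResetWrapper(env, noop_max=30)   # randomize the initial state with no-ops
env = MaxAndSkipWrapper(env, skip=4)       # frame skipping with max pooling
env = WarpFrameWrapper(env, size=84)       # resize frames to 84x84
env = ScaledFloatFrameWrapper(env)         # scale pixel values into [0, 1]
env = ClipRewardWrapper(env)               # clip rewards to {-1, 0, +1}
env = FrameStackWrapper(env, n_frames=4)   # stack the 4 most recent frames
obs = env.reset()                          # leading dimension of obs is n_frames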
- - GymHybridDictActionWrapper: Transform the original ``gym.spaces.Tuple`` action space to ``gym.spaces.Dict``. - - FlatObsWrapper: Flatten image and language observation into a vector. - - StaticObsNormWrapper - - EvalEpisodeReturnWrapper - - GymToGymnasiumWrapper - - AllinObsWrapper -""" @ENV_WRAPPER_REGISTRY.register('noop_reset') class NoopResetWrapper(gym.Wrapper): """ Overview: - Sample initial states by taking random number of no-ops on reset. \ - No-op is assumed to be action 0. - Interface: - ``__init__``, ``reset``, ``new_shape`` + Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. + Interfaces: + __init__, reset Properties: - env (:obj:`gym.Env`): the environment to wrap. - noop_max (:obj:`int`): the maximum value of no-ops to run. """ - def __init__(self, env, noop_max=30): + def __init__(self, env: gym.Env, noop_max: int = 30): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature. + Initialize the NoopResetWrapper. Arguments: - env (:obj:`gym.Env`): the environment to wrap. - - noop_max (:obj:`int`): the maximum value of no-ops to run. + - noop_max (:obj:`int`): the maximum value of no-ops to run. Defaults to 30. """ super().__init__(env) self.noop_max = noop_max self.noop_action = 0 assert env.unwrapped.get_action_meanings()[0] == 'NOOP' - def reset(self): + def reset(self) -> np.ndarray: """ Overview: - Resets the state of the environment and returns an initial observation. + Resets the state of the environment and returns an initial observation, + after taking a random number of no-ops. Returns: - - observation (:obj:`Any`): the initial observation. + - observation (:obj:`Any`): The initial observation after no-ops. """ self.env.reset() noops = np.random.randint(1, self.noop_max + 1) @@ -85,41 +102,39 @@ def reset(self): class MaxAndSkipWrapper(gym.Wrapper): """ Overview: - Return only every `skip`-th frame (frameskipping) using most \ - recent raw observations (for max pooling across time steps) - Interface: - ``__init__``, ``step``, ``new_shape`` + Wraps the environment to return only every ``skip``-th frame (frameskipping) \ + using most recent raw observations (for max pooling across time steps). + Interfaces: + __init__, step Properties: - - env (:obj:`gym.Env`): the environment to wrap. - - skip (:obj:`int`): number of `skip`-th frame. + - env (:obj:`gym.Env`): The environment to wrap. + - skip (:obj:`int`): Number of ``skip``-th frame. Defaults to 4. """ - def __init__(self, env, skip=4): + def __init__(self, env: gym.Env, skip: int = 4): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature. + Initialize the MaxAndSkipWrapper. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. - - skip (:obj:`int`): number of `skip`-th frame. + - env (:obj:`gym.Env`): The environment to wrap. + - skip (:obj:`int`): Number of ``skip``-th frame. Defaults to 4. """ super().__init__(env) self._skip = skip - def step(self, action): + def step(self, action: Union[int, np.ndarray]) -> tuple: """ Overview: - Step the environment with the given action. Repeat action, \ - sum reward, and max over last observations. + Take the given action and repeat it for a specified number of steps. \ + The rewards are summed up and the maximum frame over the last observations is returned. Arguments: - - action (:obj:`Any`): the given action to step with. + - action (:obj:`Any`): The action to repeat. 
Returns: - - max_frame (:obj:`np.array`) : max over last observations - - total_reward (:obj:`Any`) : amount of reward returned after previous action - - done (:obj:`Bool`) : whether the episode has ended, in which case further step() \ - calls will return undefined results - - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful for \ + - max_frame (:obj:`np.array`): Max over last observations + - total_reward (:obj:`Any`): Sum of rewards after previous action. + - done (:obj:`Bool`): Whether the episode has ended. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for \ debugging, and sometimes learning) - """ obs_list, total_reward, done = [], 0., False for i in range(self._skip): @@ -136,21 +151,25 @@ def step(self, action): class WarpFrameWrapper(gym.ObservationWrapper): """ Overview: - Warp frames to 84x84 as done in the Nature paper and later work. - Interface: - ``__init__``, ``observation``, ``new_shape`` + The WarpFrameWrapper class is a gym observation wrapper that resizes + the frame of an environment observation to a specified size (default is 84x84). + This is often used in the preprocessing pipeline of observations in reinforcement learning, + especially for visual observations from Atari environments. + Interfaces: + __init__, observation Properties: - env (:obj:`gym.Env`): the environment to wrap. - - ``size=84``, ``obs_space``, ``self.observation_space`` - + - size (:obj:`int`): the size to which the frames are to be resized. + - observation_space (:obj:`gym.Space`): the observation space of the wrapped environment. """ - def __init__(self, env, size=84): + def __init__(self, env: gym.Env, size: int = 84): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature. + Constructor for WarpFrameWrapper class, initializes the environment and the size. Arguments: - env (:obj:`gym.Env`): the environment to wrap. + - size (:obj:`int`): the size to which the frames are to be resized. Default is 84. """ super().__init__(env) self.size = size @@ -170,14 +189,14 @@ def __init__(self, env, size=84): if len(self.observation_space) == 1: self.observation_space = self.observation_space[0] - def observation(self, frame): + def observation(self, frame: np.ndarray) -> np.ndarray: """ Overview: - Returns the current observation from a frame + Resize the frame (observation) to the desired size. Arguments: - - frame (:obj:`Any`): the frame to get observation from + - frame (:obj:`np.ndarray`): the frame to be resized. Returns: - - observation (:obj:`Any`): Framed observation + - frame (:obj:`np.ndarray`): the resized frame. """ try: import cv2 @@ -186,7 +205,7 @@ def observation(self, frame): import sys logging.warning("Please install opencv-python first.") sys.exit(1) - # deal with channel_first case + # deal with the `channel_first` case if frame.shape[0] < 10: frame = frame.transpose(1, 2, 0) frame = cv2.resize(frame, (self.size, self.size), interpolation=cv2.INTER_AREA) @@ -202,15 +221,15 @@ def observation(self, frame): class ScaledFloatFrameWrapper(gym.ObservationWrapper): """ Overview: - Normalize observations to 0~1. - Interface: - ``__init__``, ``observation``, ``new_shape`` + The ScaledFloatFrameWrapper normalizes observations to between 0 and 1. + Interfaces: + __init__, observation """ - def __init__(self, env): + def __init__(self, env: gym.Env): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. 
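A small numpy/cv2 sketch of the resizing behaviour described for ``WarpFrameWrapper.observation``; the raw frame shape is an assumption for illustration.

import numpy as np
import cv2  # opencv-python, which the wrapper also requires

frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)  # raw Atari-like frame
# channel-first frames (e.g. shape (3, 210, 160)) are transposed to HWC first,
# which is what the `frame.shape[0] < 10` branch above does.
resized = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
print(resized.shape)  # (84, 84, 3)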
+ Initialize the ScaledFloatFrameWrapper, setting the scale and bias for normalization. Arguments: - env (:obj:`gym.Env`): the environment to wrap. """ @@ -221,16 +240,15 @@ def __init__(self, env): self.scale = high - low self.observation_space = gym.spaces.Box(low=0., high=1., shape=env.observation_space.shape, dtype=np.float32) - def observation(self, observation): + def observation(self, observation: np.ndarray) -> np.ndarray: """ Overview: - Returns the scaled observation + Scale the observation to be within the range [0, 1]. Arguments: - - observation(:obj:`Float`): The original observation + - observation (:obj:`np.ndarray`): the original observation. Returns: - - observation (:obj:`Float`): The Scaled Float observation + - scaled_observation (:obj:`np.ndarray`): the scaled observation. """ - return ((observation - self.bias) / self.scale).astype('float32') @@ -238,33 +256,33 @@ def observation(self, observation): class ClipRewardWrapper(gym.RewardWrapper): """ Overview: - Clip the reward to {+1, 0, -1} by its sign. - Interface: - ``__init__``, ``reward``, ``new_shape`` + The ClipRewardWrapper class is a gym reward wrapper that clips the reward to {-1, 0, +1} based on its sign. + This can be used to normalize the scale of the rewards in reinforcement learning algorithms. + Interfaces: + __init__, reward Properties: - env (:obj:`gym.Env`): the environment to wrap. - - ``reward_range`` - + - reward_range (:obj:`Tuple[int, int]`): the range of the reward values after clipping. """ - def __init__(self, env): + def __init__(self, env: gym.Env): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. + Initialize the ClipRewardWrapper class. Arguments: - env (:obj:`gym.Env`): the environment to wrap. """ super().__init__(env) self.reward_range = (-1, 1) - def reward(self, reward): + def reward(self, reward: float) -> float: """ Overview: - Bin reward to {+1, 0, -1} by its sign. Note: np.sign(0) == 0. + Clip the reward to {-1, 0, +1} based on its sign. Note: np.sign(0) == 0. Arguments: - - reward(:obj:`Float`): Raw Reward + - reward (:obj:`float`): the original reward. Returns: - - reward(:obj:`Float`): Clipped Reward + - reward (:obj:`float`): the clipped reward. """ return np.sign(reward) @@ -273,26 +291,49 @@ def reward(self, reward): class ActionRepeatWrapper(gym.Wrapper): """ Overview: - Repeat the action to step with env. - Interface: - ``__init__``, ``step`` + The ActionRepeatWrapper class is a gym wrapper that repeats the same action for a number of steps. + This wrapper is particularly useful in environments where the desired effect is achieved by maintaining + the same action across multiple time steps. For instance, some physical environments like motion control + tasks might require consistent force input to produce a significant state change. + + Using this wrapper can reduce the temporal complexity of the problem, as it allows the agent to perform + multiple actions within a single time step. This can speed up learning, as the agent has fewer decisions + to make within a time step. However, it may also sacrifice some level of decision-making precision, as the + agent cannot change its action across successive time steps. + + Note that the use of the ActionRepeatWrapper may not be suitable for all types of environments. Specifically, + it may not be the best choice for environments where new decisions must be made at each time step, or where + the time sequence of actions has a significant impact on the outcome. 
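For concreteness, the two transformations documented above can be reproduced directly in numpy (the pixel range [0, 255] is an assumption about the wrapped observation space):

import numpy as np

# ScaledFloatFrameWrapper: (obs - low) / (high - low), cast to float32.
frame = np.random.randint(0, 256, size=(84, 84), dtype=np.uint8)
low, high = 0.0, 255.0
scaled = ((frame - low) / (high - low)).astype('float32')  # values in [0, 1]

# ClipRewardWrapper: keep only the sign of the reward (np.sign(0) == 0).
print([np.sign(r) for r in (-7.5, 0.0, 3.0)])  # [-1.0, 0.0, 1.0]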
+ Interfaces: + __init__, step Properties: - env (:obj:`gym.Env`): the environment to wrap. - - ``action_repeat`` - + - action_repeat (:obj:`int`): the number of times to repeat the action. """ - def __init__(self, env, action_repeat=1): + def __init__(self, env: gym.Env, action_repeat: int = 1): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. + Initialize the ActionRepeatWrapper class. Arguments: - env (:obj:`gym.Env`): the environment to wrap. + - action_repeat (:obj:`int`): the number of times to repeat the action. Default is 1. """ super().__init__(env) self.action_repeat = action_repeat - def step(self, action): + def step(self, action: Union[int, np.ndarray]) -> tuple: + """ + Overview: + Take the given action and repeat it for a specified number of steps. The rewards are summed up. + Arguments: + - action (:obj:`Union[int, np.ndarray]`): The action to repeat. + Returns: + - obs (:obj:`np.ndarray`): The observation after repeating the action. + - reward (:obj:`float`): The sum of rewards after repeating the action. + - done (:obj:`bool`): Whether the episode has ended. + - info (:obj:`Dict`): Contains auxiliary diagnostic information. + """ reward = 0 for _ in range(self.action_repeat): obs, rew, done, info = self.env.step(action) @@ -306,31 +347,67 @@ def step(self, action): class DelayRewardWrapper(gym.Wrapper): """ Overview: - Return cumulative reward at intervals; At other time, return reward of 0. - Interface: - ``__init__``, ``reset``, ``step``, ``new_shape`` + The DelayRewardWrapper class is a gym wrapper that delays the reward. It cumulates the reward over a + predefined number of steps and returns the cumulated reward only at the end of this interval. + At other times, it returns a reward of 0. + + This wrapper is particularly useful in environments where the impact of an action is not immediately + observable, but rather delayed over several steps. For instance, in strategic games or planning tasks, + the effect of an action may not be directly noticeable, but it contributes to a sequence of actions that + leads to a reward. In these cases, delaying the reward to match the action-effect delay can make the + learning process more consistent with the problem's nature. + + However, using this wrapper may increase the difficulty of learning, as the agent needs to associate its + actions with delayed outcomes. It also introduces a non-standard reward structure, which could limit the + applicability of certain reinforcement learning algorithms. + + Note that the use of the DelayRewardWrapper may not be suitable for all types of environments. Specifically, + it may not be the best choice for environments where the effect of actions is immediately observable and the + reward should be assigned accordingly. + Interfaces: + __init__, reset, step Properties: - env (:obj:`gym.Env`): the environment to wrap. - - ``reward_range`` + - delay_reward_step (:obj:`int`): the number of steps over which to delay and cumulate the reward. """ - def __init__(self, env, delay_reward_step=0): + def __init__(self, env: gym.Env, delay_reward_step: int = 0): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. + Initialize the DelayRewardWrapper class. Arguments: - env (:obj:`gym.Env`): the environment to wrap. + - delay_reward_step (:obj:`int`): the number of steps over which to delay and cumulate the reward. 
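A usage sketch of the two wrappers discussed above; the base environment id is an assumption.

import gym
from ding.envs.env_wrappers.env_wrappers import ActionRepeatWrapper, DelayRewardWrapper

env = gym.make('CartPole-v0')
env = ActionRepeatWrapper(env, action_repeat=2)      # each step() repeats the action twice
env = DelayRewardWrapper(env, delay_reward_step=5)   # rewards are released every 5 steps

obs = env.reset()
for t in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    # reward is 0 except on every 5th step, where the cumulated sum is returned
    if done:
        break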
""" super().__init__(env) self._delay_reward_step = delay_reward_step - def reset(self): + def reset(self) -> np.ndarray: + """ + Overview: + Resets the state of the environment and resets the delay reward duration and current delay reward. + Returns: + - obs (:obj:`np.ndarray`): the initial observation of the environment. + """ self._delay_reward_duration = 0 self._current_delay_reward = 0. obs = self.env.reset() return obs - def step(self, action): + def step(self, action: Union[int, np.ndarray]) -> tuple: + """ + Overview: + Take the given action and repeat it for a specified number of steps. The rewards are summed up. + If the number of steps equals the delay reward step, return the cumulated reward and reset the + delay reward duration and current delay reward. Otherwise, return a reward of 0. + Arguments: + - action (:obj:`Union[int, np.ndarray]`): the action to take in the step. + Returns: + - obs (:obj:`np.ndarray`): The observation after the step. + - reward (:obj:`float`): The cumulated reward after the delay reward step or 0. + - done (:obj:`bool`): Whether the episode has ended. + - info (:obj:`Dict`): Contains auxiliary diagnostic information. + """ obs, reward, done, info = self.env.step(action) self._current_delay_reward += reward self._delay_reward_duration += 1 @@ -347,27 +424,57 @@ def step(self, action): class EvalEpisodeReturnWrapper(gym.Wrapper): """ Overview: - Accumulate rewards at every timestep, and return at the end of the episode in `info`. - Interface: - ``__init__``, ``reset``, ``step``, ``new_shape`` + A wrapper for a gym environment that accumulates rewards at every timestep, and returns the total reward at the + end of the episode in `info`. This is used for evaluation purposes. + Interfaces: + __init__, reset, step Properties: - env (:obj:`gym.Env`): the environment to wrap. """ - def __init__(self, env): + def __init__(self, env: gym.Env): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. + Initialize the EvalEpisodeReturnWrapper. This involves setting up the environment to wrap. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. """ super().__init__(env) - def reset(self): + def reset(self) -> np.ndarray: + """ + Overview: + Reset the environment and initialize the accumulated reward to zero. + Returns: + - obs (:obj:`np.ndarray`): The initial observation from the environment. + """ self._eval_episode_return = 0. return self.env.reset() - def step(self, action): + def step(self, action: Any) -> tuple: + """ + Overview: + Step the environment with the provided action, accumulate the returned reward, and add the total reward to + `info` if the episode is done. + Arguments: + - action (:obj:`Any`): The action to take in the environment. + Returns: + - obs (:obj:`np.ndarray`): The next observation from the environment. + - reward (:obj:`float`): The reward from taking the action. + - done (:obj:`bool`): Whether the episode is done. + - info (:obj:`Dict[str, Any]`): A dictionary of extra information, which includes 'eval_episode_return' if + the episode is done. + Examples: + >>> env = gym.make("CartPole-v1") + >>> env = EvalEpisodeReturnWrapper(env) + >>> obs = env.reset() + >>> done = False + >>> while not done: + ... action = env.action_space.sample() # Replace with your own policy + ... obs, reward, done, info = env.step(action) + ... if done: + ... 
print("Total episode reward:", info['eval_episode_return']) + """ obs, reward, done, info = self.env.step(action) self._eval_episode_return += reward if done: @@ -378,23 +485,27 @@ def step(self, action): @ENV_WRAPPER_REGISTRY.register('frame_stack') class FrameStackWrapper(gym.Wrapper): """ - Overview: - Stack latest n frames(usually 4 in Atari) as one observation. - Interface: - ``__init__``, ``reset``, ``step``, ``_get_ob``, ``new_shape`` - Properties: - - env (:obj:`gym.Env`): the environment to wrap. - - n_frame (:obj:`int`): the number of frames to stack. - - ``observation_space``, ``frames`` - """ - - def __init__(self, env, n_frames=4): + Overview: + FrameStackWrapper is a gym environment wrapper that stacks the latest n frames (generally 4 in Atari) + as a single observation. It is commonly used in environments where the observation is an image, + and consecutive frames provide useful temporal information for the agent. + Interfaces: + __init__, reset, step, _get_ob + Properties: + - env (:obj:`gym.Env`): The environment to wrap. + - n_frames (:obj:`int`): The number of frames to stack. + - frames (:obj:`collections.deque`): A queue that holds the most recent frames. + - observation_space (:obj:`gym.Space`): The space of the stacked observations. + """ + + def __init__(self, env: gym.Env, n_frames: int = 4) -> None: """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. + Initialize the FrameStackWrapper. This process includes setting up the environment to wrap, + the number of frames to stack, and the observation space. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. - - n_frame (:obj:`int`): the number of frames to stack. + - env (:obj:`gym.Env`): The environment to wrap. + - n_frame (:obj:`int`): The number of frames to stack. """ super().__init__(env) self.n_frames = n_frames @@ -413,42 +524,43 @@ def __init__(self, env, n_frames=4): if len(self.observation_space) == 1: self.observation_space = self.observation_space[0] - def reset(self): + def reset(self) -> np.ndarray: """ Overview: - Resets the state of the environment and append new observation to frames + Reset the environment and initialize frames with the initial observation. Returns: - - ``self._get_ob()``: observation + - init_obs (:obj:`np.ndarray`): The stacked initial observations. """ obs = self.env.reset() for _ in range(self.n_frames): self.frames.append(obs) return self._get_ob() - def step(self, action): + def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict[str, Any]]: """ Overview: - Step the environment with the given action. Repeat action, sum reward, \ - and max over last observations, and append new observation to frames + Perform a step in the environment with the given action, append the returned observation + to frames, and return the stacked observations. Arguments: - - action (:obj:`Any`): the given action to step with. + - action (:obj:`Any`): The action to perform a step with. Returns: - - ``self._get_ob()`` : observation - - reward (:obj:`Any`) : amount of reward returned after previous action - - done (:obj:`Bool`) : whether the episode has ended, in which case further \ - step() calls will return undefined results - - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful \ - for debugging, and sometimes learning) + - self._get_ob() (:obj:`np.ndarray`): The stacked observations. + - reward (:obj:`float`): The amount of reward returned after the previous action. 
+ - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict[str, Any]`): Contains auxiliary diagnostic information (helpful for debugging, + and sometimes learning). """ - obs, reward, done, info = self.env.step(action) self.frames.append(obs) return self._get_ob(), reward, done, info - def _get_ob(self): + def _get_ob(self) -> np.ndarray: """ Overview: - The original wrapper use `LazyFrames` but since we use np buffer, it has no effect + The original wrapper used `LazyFrames`, but since we use an np buffer, it has no effect. + Returns: + - stacked_frames (:obj:`np.ndarray`): The stacked frames. """ return np.stack(self.frames, axis=0) @@ -457,21 +569,23 @@ def _get_ob(self): class ObsTransposeWrapper(gym.ObservationWrapper): """ Overview: - Transpose observation to put channel to first dim. - Interface: - ``__init__``, ``observation``, ``new_shape`` + The ObsTransposeWrapper class is a gym wrapper that transposes the observation to put the channel dimension + first. This can be helpful for certain types of neural networks that expect the channel dimension to be + the first dimension. + Interfaces: + __init__, observation Properties: - - env (:obj:`gym.Env`): the environment to wrap. - - ``observation_space`` + - env (:obj:`gym.Env`): The environment to wrap. + - observation_space (:obj:`gym.spaces.Box`): The transformed observation space. """ - def __init__(self, env): + def __init__(self, env: gym.Env): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; \ - setup the properties. + Initialize the ObsTransposeWrapper class and update the observation space according to the environment's + observation space. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. """ super().__init__(env) obs_space = env.observation_space @@ -493,11 +607,12 @@ def __init__(self, env): def observation(self, obs: Union[tuple, np.ndarray]) -> Union[tuple, np.ndarray]: """ Overview: - Returns the transposed observation + Transpose the observation to put the channel dimension first. If the observation is a tuple, each element + in the tuple is transposed independently. Arguments: - - observation (:obj:`Union[tuple, np.ndarray]`): The original observation + - obs (:obj:`Union[tuple, np.ndarray]`): The original observation. Returns: - - observation (:obj:`Union[tuple, np.ndarray]`): The transposed observation + - obs (:obj:`Union[tuple, np.ndarray]`): The transposed observation. """ if isinstance(obs, tuple): new_obs = [] @@ -512,35 +627,40 @@ def observation(self, obs: Union[tuple, np.ndarray]) -> Union[tuple, np.ndarray] class RunningMeanStd(object): """ Overview: - Wrapper to update new variable, new mean, and new count - Interface: - ``__init__``, ``update``, ``reset``, ``new_shape`` + The RunningMeanStd class is a utility that maintains a running mean and standard deviation calculation over + a stream of data. + Interfaces: + __init__, update, reset, mean, std Properties: - - env (:obj:`gym.Env`): the environment to wrap. - - ``mean``, ``std``, ``_epsilon``, ``_shape``, ``_mean``, ``_var``, ``_count`` + - mean (:obj:`np.ndarray`): The running mean. + - std (:obj:`np.ndarray`): The running standard deviation. + - _epsilon (:obj:`float`): A small number to prevent division by zero when calculating standard deviation. + - _shape (:obj:`tuple`): The shape of the data stream. + - _mean (:obj:`np.ndarray`): The current mean of the data stream. 
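A quick sketch of how ``RunningMeanStd`` accumulates statistics over batches of data:

import numpy as np
from ding.envs.env_wrappers.env_wrappers import RunningMeanStd

rms = RunningMeanStd(shape=(3, ))
for _ in range(10):
    batch = np.random.randn(32, 3)   # a batch of 32 observations with 3 features
    rms.update(batch)
print(rms.mean, rms.std)             # running estimates over all 320 samples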
+        - _var (:obj:`np.ndarray`): The current variance of the data stream.
+        - _count (:obj:`float`): The number of data points processed.
     """

-    def __init__(self, epsilon=1e-4, shape=()):
+    def __init__(self, epsilon: float = 1e-4, shape: tuple = ()):
         """
         Overview:
-            Initialize ``self.`` See ``help(type(self))`` for accurate \
-                signature; setup the properties.
+            Initialize the RunningMeanStd object.
         Arguments:
-            - env (:obj:`gym.Env`): the environment to wrap.
-            - epsilon (:obj:`Float`): the epsilon used for self for the std output
-            - shape (:obj: `np.array`): the np array shape used for the expression \
-                of this wrapper on attibutes of mean and variance
+            - epsilon (:obj:`float`, optional): A small number to prevent division by zero when calculating standard
+                deviation. Default is 1e-4.
+            - shape (:obj:`tuple`, optional): The shape of the data stream. Default is an empty tuple, which
+                corresponds to scalars.
         """
         self._epsilon = epsilon
         self._shape = shape
         self.reset()

-    def update(self, x):
+    def update(self, x: np.array):
         """
         Overview:
-            Update mean, variable, and count
+            Update the running statistics with a new batch of data.
         Arguments:
-            - ``x``: the batch
+            - x (:obj:`np.array`): A batch of data.
         """
         batch_mean = np.mean(x, axis=0)
         batch_var = np.var(x, axis=0)
@@ -572,7 +692,9 @@ def reset(self):
     def mean(self) -> np.ndarray:
         """
         Overview:
-            Property ``mean`` gotten from ``self._mean``
+            Get the current running mean.
+        Returns:
+            The current running mean.
         """
         return self._mean

@@ -580,7 +702,9 @@ def mean(self) -> np.ndarray:
     def std(self) -> np.ndarray:
         """
         Overview:
-            Property ``std`` calculated from ``self._var`` and the epsilon value of ``self._epsilon``
+            Get the current running standard deviation.
+        Returns:
+            The current running standard deviation.
         """
         return np.sqrt(self._var) + self._epsilon

@@ -589,20 +713,21 @@ def std(self) -> np.ndarray:
 class ObsNormWrapper(gym.ObservationWrapper):
     """
     Overview:
-        Normalize observations according to running mean and std.
-    Interface:
-        ``__init__``, ``step``, ``reset``, ``observation``, ``new_shape``
+        The ObsNormWrapper class is a gym observation wrapper that normalizes
+        observations according to running mean and standard deviation (std).
+    Interfaces:
+        __init__, step, reset, observation
     Properties:
         - env (:obj:`gym.Env`): the environment to wrap.
-
-        - ``data_count``, ``clip_range``, ``rms``
+        - data_count (:obj:`int`): the count of data points observed so far.
+        - clip_range (:obj:`Tuple[int, int]`): the range to clip the normalized observation.
+        - rms (:obj:`RunningMeanStd`): running mean and standard deviation of the observations.
     """

-    def __init__(self, env):
+    def __init__(self, env: gym.Env):
         """
         Overview:
-            Initialize ``self.`` See ``help(type(self))`` for accurate signature; \
-                setup the properties according to running mean and std.
+            Initialize the ObsNormWrapper class.
         Arguments:
             - env (:obj:`gym.Env`): the environment to wrap.
         """
@@ -611,38 +736,33 @@ def __init__(self, env):
         self.clip_range = (-3, 3)
         self.rms = RunningMeanStd(shape=env.observation_space.shape)

-    def step(self, action):
+    def step(self, action: Union[int, np.ndarray]):
         """
         Overview:
-            Step the environment with the given action. Repeat action, sum reward, \
-                and update ``data_count``, and also update the ``self.rms`` property \
-                once after integrating with the input ``action``.
+            Take an action in the environment, update the running mean and std,
+            and return the normalized observation.
         Arguments:
-            - action (:obj:`Any`): the given action to step with.
+ - action (:obj:`Union[int, np.ndarray]`): the action to take in the environment. Returns: - - ``self.observation(observation)`` : normalized observation after the \ - input action and updated ``self.rms`` - - reward (:obj:`Any`) : amount of reward returned after previous action - - done (:obj:`Bool`) : whether the episode has ended, in which case further \ - step() calls will return undefined results - - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful \ - for debugging, and sometimes learning) - + - obs (:obj:`np.ndarray`): the normalized observation after the action. + - reward (:obj:`float`): the reward after the action. + - done (:obj:`bool`): whether the episode has ended. + - info (:obj:`Dict`): contains auxiliary diagnostic information. """ self.data_count += 1 observation, reward, done, info = self.env.step(action) self.rms.update(observation) return self.observation(observation), reward, done, info - def observation(self, observation): + def observation(self, observation: np.ndarray) -> np.ndarray: """ Overview: - Get obeservation + Normalize the observation using the current running mean and std. + If less than 30 data points have been observed, return the original observation. Arguments: - - observation (:obj:`Any`): Original observation + - observation (:obj:`np.ndarray`): the original observation. Returns: - - observation (:obj:`Any`): Normalized new observation - + - observation (:obj:`np.ndarray`): the normalized observation. """ if self.data_count > 30: return np.clip((observation - self.rms.mean) / self.rms.std, self.clip_range[0], self.clip_range[1]) @@ -652,12 +772,11 @@ def observation(self, observation): def reset(self, **kwargs): """ Overview: - Resets the state of the environment and reset properties. + Reset the environment and the properties related to the running mean and std. Arguments: - - kwargs (:obj:`Dict`): Reset with this key argumets + - kwargs (:obj:`Dict`): keyword arguments to be passed to the environment's reset function. Returns: - - observation (:obj:`Any`): New observation after reset - + - observation (:obj:`np.ndarray`): the initial observation of the environment. """ self.data_count = 0 self.rms.reset() @@ -669,39 +788,40 @@ def reset(self, **kwargs): class StaticObsNormWrapper(gym.ObservationWrapper): """ Overview: - Normalize observations according to the mean and std in the fixed dataset. - Interface: - ``__init__``, ``observation`` + The StaticObsNormWrapper class is a gym observation wrapper that normalizes + observations according to a precomputed mean and standard deviation (std) from a fixed dataset. + Interfaces: + __init__, observation Properties: - env (:obj:`gym.Env`): the environment to wrap. - - - ``mean``, ``std``, ``clip_range`` + - mean (:obj:`numpy.ndarray`): the mean of the observations in the fixed dataset. + - std (:obj:`numpy.ndarray`): the standard deviation of the observations in the fixed dataset. + - clip_range (:obj:`Tuple[int, int]`): the range to clip the normalized observation. """ - def __init__(self, env, mean, std): + def __init__(self, env: gym.Env, mean: np.ndarray, std: np.ndarray): """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; \ - setup the properties according to dataset mean and std. + Initialize the StaticObsNormWrapper class. Arguments: - env (:obj:`gym.Env`): the environment to wrap. - - mean (:obj:`numpy.ndarray`): the mean of observation in the dataset. - - std (:obj:`numpy.ndarray`): the standard deviation of observation in the dataset. 
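The normalization applied by both ``ObsNormWrapper`` (running statistics, after 30 samples) and ``StaticObsNormWrapper`` (fixed dataset statistics) is the same clipped z-score; a sketch with assumed statistics:

import numpy as np

mean = np.array([0.5, -1.0])   # assumed dataset / running statistics
std = np.array([2.0, 0.5])
clip_range = (-3, 3)

obs = np.array([4.5, -1.5])
normalized = np.clip((obs - mean) / std, clip_range[0], clip_range[1])
print(normalized)              # [ 2. -1.]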
+ - mean (:obj:`numpy.ndarray`): the mean of the observations in the fixed dataset. + - std (:obj:`numpy.ndarray`): the standard deviation of the observations in the fixed dataset. """ super().__init__(env) self.mean = mean self.std = std self.clip_range = (-3, 3) - def observation(self, observation): + def observation(self, observation: np.ndarray) -> np.ndarray: """ Overview: - Get obeservation + Normalize the given observation using the precomputed mean and std. + The normalized observation is then clipped within the specified range. Arguments: - - observation (:obj:`Any`): Original observation + - observation (:obj:`np.ndarray`): the original observation. Returns: - - observation (:obj:`Any`): Normalized new observation - + - observation (:obj:`np.ndarray`): the normalized and clipped observation. """ return np.clip((observation - self.mean) / self.std, self.clip_range[0], self.clip_range[1]) @@ -710,21 +830,24 @@ def observation(self, observation): class RewardNormWrapper(gym.RewardWrapper): """ Overview: - Normalize reward according to running std. - Interface: - ``__init__``, ``step``, ``reward``, ``reset``, ``new_shape`` + This wrapper class normalizes the reward according to running std. It extends the `gym.RewardWrapper`. + Interfaces: + __init__, step, reward, reset Properties: - - env (:obj:`gym.Env`): the environment to wrap. - - ``cum_reward``, ``reward_discount``, ``data_count``, ``rms`` + - env (:obj:`gym.Env`): The environment to wrap. + - cum_reward (:obj:`numpy.ndarray`): The cumulated reward, initialized as zero and updated in `step` method. + - reward_discount (:obj:`float`): The discount factor for reward. + - data_count (:obj:`int`): A counter for data, incremented in each `step` call. + - rms (:obj:`RunningMeanStd`): An instance of RunningMeanStd to compute the running mean and std of reward. """ - def __init__(self, env, reward_discount): + def __init__(self, env: gym.Env, reward_discount: float) -> None: """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; \ - setup the properties according to running mean and std. + Initialize the RewardNormWrapper, setup the properties according to running mean and std. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. + - reward_discount (:obj:`float`): The discount factor for reward. """ super().__init__(env) self.cum_reward = np.zeros((1, ), 'float64') @@ -732,23 +855,21 @@ def __init__(self, env, reward_discount): self.data_count = 0 self.rms = RunningMeanStd(shape=(1, )) - def step(self, action): + def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict]: """ Overview: - Step the environment with the given action. Repeat action, sum reward, \ - and update ``data_count``, and also update the ``self.rms`` and ``self.cum_reward`` \ - properties once after integrating with the input ``action``. + Step the environment with the given action, update properties and return the new observation, reward, + done status and info. Arguments: - - action (:obj:`Any`): the given action to step with. + - action (:obj:`Any`): The action to execute in the environment. 
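In essence, ``RewardNormWrapper`` tracks a discounted cumulative reward, feeds it to a ``RunningMeanStd``, and (after 30 steps) divides each raw reward by that running std. A toy sketch follows; the exact cumulative-reward update rule is elided from this diff and is an assumption here.

import numpy as np
from ding.envs.env_wrappers.env_wrappers import RunningMeanStd

reward_discount = 0.99
cum_reward = np.zeros((1, ), 'float64')
rms = RunningMeanStd(shape=(1, ))

for raw_reward in np.random.randn(100):
    cum_reward = reward_discount * cum_reward + raw_reward  # assumed update rule
    rms.update(cum_reward)
normalized = float(1.0 / rms.std)   # what a raw reward of 1.0 becomes after warm-up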
Returns: - - observation : normalized observation after the input action and updated ``self.rms`` - - ``self.reward(reward)`` : amount of reward returned after previous action \ - (normalized) and update ``self.cum_reward`` - - done (:obj:`Bool`) : whether the episode has ended, in which case further \ - step() calls will return undefined results - - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful for \ - debugging, and sometimes learning) - + - observation (:obj:`np.ndarray`): Normalized observation after executing the action and updated `self.rms`. + - reward (:obj:`float`): Amount of reward returned after the action execution (normalized) and updated + `self.cum_reward`. + - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and sometimes + learning). """ self.data_count += 1 observation, reward, done, info = self.env.step(action) @@ -757,14 +878,14 @@ def step(self, action): self.rms.update(self.cum_reward) return observation, self.reward(reward), done, info - def reward(self, reward): + def reward(self, reward: float) -> float: """ Overview: - Normalize reward if ``data_count`` is more than 30 + Normalize reward if `data_count` is more than 30. Arguments: - - reward(:obj:`Float`): Raw Reward + - reward (:obj:`float`): The raw reward. Returns: - - reward(:obj:`Float`): Normalized Reward + - reward (:obj:`float`): Normalized reward. """ if self.data_count > 30: return float(reward / self.rms.std) @@ -789,21 +910,21 @@ def reset(self, **kwargs): class RamWrapper(gym.Wrapper): """ Overview: - Wrap ram env into image-like env - Interface: - ``__init__``, ``reset``, ``step``, ``new_shape`` + This wrapper class wraps a RAM environment into an image-like environment. It extends the `gym.Wrapper`. + Interfaces: + __init__, reset, step Properties: - - env (:obj:`gym.Env`): the environment to wrap. - - n_frame (:obj:`int`): the number of frames to stack. - - ``observation_space`` + - env (:obj:`gym.Env`): The environment to wrap. + - observation_space (:obj:`gym.spaces.Box`): The observation space of the wrapped environment. """ - def __init__(self, env, render=False): + def __init__(self, env: gym.Env, render: bool = False) -> None: """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; + Initialize the RamWrapper and set up the observation space to wrap the RAM environment. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. + - render (:obj:`bool`): Whether to render the environment, default is False. """ super().__init__(env) shape = env.observation_space.shape + (1, 1) @@ -814,34 +935,30 @@ def __init__(self, env, render=False): dtype=np.float32 ) - def reset(self): + def reset(self) -> np.ndarray: """ Overview: - Resets the state of the environment and reset properties. - + Resets the state of the environment and returns a reshaped observation. Returns: - - observation (:obj:`Any`): New observation after reset and reshaped - + - observation (:obj:`np.ndarray`): New observation after reset and reshaped. """ obs = self.env.reset() return obs.reshape(128, 1, 1).astype(np.float32) - def step(self, action): + def step(self, action: Any) -> Tuple[np.ndarray, Any, bool, Dict]: """ Overview: - Step the environment with the given action. Repeat action, sum reward and \ - reshape the observation. 
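A sketch of the observation transform described for ``RamWrapper``: a 128-byte Atari RAM vector becomes an image-like tensor.

import numpy as np

ram_obs = np.random.randint(0, 256, size=(128, ), dtype=np.uint8)  # raw RAM observation
image_like = ram_obs.reshape(128, 1, 1).astype(np.float32)         # as done in reset()/step()
print(image_like.shape)  # (128, 1, 1)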
+ Execute one step within the environment with the given action. Repeat action, sum reward and reshape the + observation. Arguments: - - action (:obj:`Any`): the given action to step with. + - action (:obj:`Any`): The action to take in the environment. Returns: - - ``obs.reshape(128, 1, 1).astype(np.float32)`` : reshaped observation after \ - step with type restriction. - - reward (:obj:`Any`) : amount of reward returned after previous action - - done (:obj:`Bool`) : whether the episode has ended, in which case further \ - step() calls will return undefined results - - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful for \ - debugging, and sometimes learning) - + - observation (:obj:`np.ndarray`): Reshaped observation after step with type restriction. + - reward (:obj:`Any`): Amount of reward returned after previous action. + - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and sometimes + learning). """ obs, reward, done, info = self.env.step(action) return obs.reshape(128, 1, 1).astype(np.float32), reward, done, info @@ -851,45 +968,41 @@ def step(self, action): class EpisodicLifeWrapper(gym.Wrapper): """ Overview: - Make end-of-life == end-of-episode, but only reset on true game over. It helps \ - the value estimation. - Interface: - ``__init__``, ``step``, ``reset``, ``observation``, ``new_shape`` + This wrapper makes end-of-life equivalent to end-of-episode, but only resets on + true game over. This helps in better value estimation. + Interfaces: + __init__, step, reset Properties: - - env (:obj:`gym.Env`): the environment to wrap. - - - ``lives``, ``was_real_done`` + - env (:obj:`gym.Env`): The environment to wrap. + - lives (:obj:`int`): The current number of lives. + - was_real_done (:obj:`bool`): Whether the last episode was ended due to game over. """ - def __init__(self, env): + def __init__(self, env: gym.Env) -> None: """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature; set \ - lives to 0 at set done. + Initialize the EpisodicLifeWrapper, setting lives to 0 and was_real_done to True. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. """ super().__init__(env) self.lives = 0 self.was_real_done = True - def step(self, action): + def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict]: """ Overview: - Step the environment with the given action. Repeat action, sum reward; set \ - ``self.was_real_done`` as done, and step according to lives i.e. check \ - current lives, make loss of life terminal, then update lives to \ - handle bonus lives. + Execute the given action in the environment, update properties based on the new + state and return the new observation, reward, done status and info. Arguments: - - action (:obj:`Any`): the given action to step with. + - action (:obj:`Any`): The action to execute in the environment. 
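A usage sketch of ``EpisodicLifeWrapper``; the Atari environment id and ALE dependency are assumptions.

import gym
from ding.envs.env_wrappers.env_wrappers import EpisodicLifeWrapper

env = EpisodicLifeWrapper(gym.make('BreakoutNoFrameskip-v4'))
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
# `done` becomes True whenever a life is lost, but env.reset() only performs a
# full game reset when the underlying episode (env.was_real_done) actually ended.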
Returns: - - obs (:obj:`Any`): normalized observation after the input action and updated ``self.rms`` - - reward (:obj:`Any`) : amount of reward returned after previous action - - done (:obj:`Bool`) : whether the episode has ended, in which case further step() \ - calls will return undefined results - - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful for debugging,\ - and sometimes learning) - + - observation (:obj:`np.ndarray`): Normalized observation after the action execution and updated `self.rms`. + - reward (:obj:`float`): Amount of reward returned after the action execution. + - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and + sometimes learning). """ obs, reward, done, info = self.env.step(action) self.was_real_done = done @@ -904,16 +1017,15 @@ def step(self, action): self.lives = lives return obs, reward, done, info - def reset(self): + def reset(self) -> np.ndarray: """ Overview: - Calls the Gym environment reset, only when lives are exhausted. This way all states are \ - still reachable even though lives are episodic, and the learner need not know about \ - any of this behind-the-scenes. + Resets the state of the environment and updates the number of lives, only when + lives are exhausted. This way all states are still reachable even though lives + are episodic, and the learner need not know about any of this behind-the-scenes. Returns: - - obs (:obj:`Any`): New observation after reset with no-op step to advance from terminal/lost \ - life state in case of not ``self.was_real_done``. - + - observation (:obj:`np.ndarray`): New observation after reset with no-op step to advance from + terminal/lost life state. """ if self.was_real_done: obs = self.env.reset() @@ -928,29 +1040,32 @@ def reset(self): class FireResetWrapper(gym.Wrapper): """ Overview: - Take fire action at environment reset. + This wrapper takes a fire action at environment reset. Related discussion: https://github.com/openai/baselines/issues/240 - Interface: - ``__init__``, ``reset``, ``new_shape`` + Interfaces: + __init__, reset Properties: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. """ - def __init__(self, env): + def __init__(self, env: gym.Env) -> None: """ Overview: - Initialize ``self.`` See ``help(type(self))`` for accurate signature. + Initialize the FireResetWrapper. Assume that the second action of the environment + is 'FIRE' and there are at least three actions. Arguments: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. """ super().__init__(env) assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 - def reset(self): + def reset(self) -> np.ndarray: """ Overview: - Resets the state of the environment and reset properties i.e. reset with action 1 + Resets the state of the environment and executes a fire action, i.e. reset with action 1. + Returns: + - observation (:obj:`np.ndarray`): New observation after reset and fire action. """ self.env.reset() return self.env.step(1)[0] @@ -960,20 +1075,20 @@ def reset(self): class GymHybridDictActionWrapper(gym.ActionWrapper): """ Overview: - Transform Gym-Hybrid's original ``gym.spaces.Tuple`` action space to ``gym.spaces.Dict``. 
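For completeness, the effect of ``FireResetWrapper`` in one line (again assuming an ALE game whose second action is FIRE):

import gym
from ding.envs.env_wrappers.env_wrappers import FireResetWrapper

env = FireResetWrapper(gym.make('BreakoutNoFrameskip-v4'))
obs = env.reset()   # internally calls env.step(1), so the game starts immediately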
-    Interface:
-        ``__init__``, ``action``
+        Transform Gym-Hybrid's original ``gym.spaces.Tuple`` action space to ``gym.spaces.Dict``.
+    Interfaces:
+        __init__, step
     Properties:
-        - env (:obj:`gym.Env`): the environment to wrap.
-        - ``self.action_space``
+        - env (:obj:`gym.Env`): The environment to wrap.
+        - action_space (:obj:`gym.spaces.Dict`): The new action space.
     """

-    def __init__(self, env):
+    def __init__(self, env: gym.Env) -> None:
         """
         Overview:
-            Initialize ``self.`` See ``help(type(self))`` for accurate signature.
+            Initialize the GymHybridDictActionWrapper, setting up the new action space.
         Arguments:
-            - env (:obj:`gym.Env`): the environment to wrap.
+            - env (:obj:`gym.Env`): The environment to wrap.
         """
         super().__init__(env)
         self.action_space = gym.spaces.Dict(
@@ -990,7 +1105,22 @@ def __init__(self, env):
             }
         )

-    def step(self, action):
+    def step(self, action: Dict) -> Tuple[Dict, float, bool, Dict]:
+        """
+        Overview:
+            Execute the given action in the environment, transform the action from Dict to Tuple,
+            and return the new observation, reward, done status and info.
+        Arguments:
+            - action (:obj:`Dict`): The action to execute in the environment, structured as a dictionary.
+        Returns:
+            - observation (:obj:`Dict`): The new observation after the action execution.
+            - reward (:obj:`float`): Amount of reward returned after the action execution.
+            - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return
+                undefined results.
+            - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and
+                sometimes learning).
+        """
         # # From Dict to Tuple
         # action_type = action[0]
         # if action_type == 0:
@@ -1012,22 +1142,22 @@ def step(self, action):

 class ObsPlusPrevActRewWrapper(gym.Wrapper):
     """
     Overview:
-        This wrapper is used in policy NGU.
-        Set a dict {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic}
-        as the new wrapped observation,
-        which including the current obs, previous action and previous reward.
-    Interface:
-        ``__init__``, ``reset``, ``step``
+        This wrapper is used in policy NGU. It sets a dict as the new wrapped observation,
+        which includes the current observation, previous action and previous reward.
+    Interfaces:
+        __init__, reset, step
     Properties:
-        - env (:obj:`gym.Env`): the environment to wrap.
+        - env (:obj:`gym.Env`): The environment to wrap.
+        - prev_action (:obj:`int`): The previous action.
+        - prev_reward_extrinsic (:obj:`float`): The previous reward.
     """

-    def __init__(self, env):
+    def __init__(self, env: gym.Env) -> None:
         """
         Overview:
-            Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties.
+            Initialize the ObsPlusPrevActRewWrapper, setting up the previous action and reward.
         Arguments:
-            - env (:obj:`gym.Env`): the environment to wrap.
+            - env (:obj:`gym.Env`): The environment to wrap.
         """
         super().__init__(env)
         self.observation_space = gym.spaces.Dict(
@@ -1042,35 +1172,35 @@ def __init__(self, env):
         self.prev_action = -1  # null action
         self.prev_reward_extrinsic = 0  # null reward

-    def reset(self):
+    def reset(self) -> Dict:
         """
         Overview:
-            Resets the state of the environment.
+            Resets the state of the environment, and returns the wrapped observation.
Returns: - - obs (:obj:`Dict`) : the wrapped observation, which including the current obs, \ + - observation (:obj:`Dict`): The wrapped observation, which includes the current observation, previous action and previous reward. """ obs = self.env.reset() obs = {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} return obs - def step(self, action): + def step(self, action: Any) -> Tuple[Dict, float, bool, Dict]: """ Overview: - Step the environment with the given action. - Save the previous action and reward to be used in next new obs + Execute the given action in the environment, save the previous action and reward + to be used in the next observation, and return the new observation, reward, + done status and info. Arguments: - - action (:obj:`Any`): the given action to step with. + - action (:obj:`Any`): The action to execute in the environment. Returns: - - obs (:obj:`Dict`) : the wrapped observation, which including the current obs, \ + - observation (:obj:`Dict`): The wrapped observation, which includes the current observation, previous action and previous reward. - - reward (:obj:`Any`) : amount of reward returned after previous action - - done (:obj:`Bool`) : whether the episode has ended, in which case further \ - step() calls will return undefined results - - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful \ - for debugging, and sometimes learning) + - reward (:obj:`float`): Amount of reward returned after the action execution. + - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and sometimes + learning). """ - obs, reward, done, info = self.env.step(action) obs = {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} self.prev_action = action @@ -1079,8 +1209,21 @@ def step(self, action): class TransposeWrapper(gym.Wrapper): + """ + Overview: + This class is used to transpose the observation space of the environment. + + Interfaces: + __init__, _process_obs, step, reset + """ - def __init__(self, env): + def __init__(self, env: gym.Env) -> None: + """ + Overview: + Initialize the TransposeWrapper, setting up the new observation space. + Arguments: + - env (:obj:`gym.Env`): The environment to wrap. + """ super().__init__(env) old_space = copy.deepcopy(env.observation_space) new_shape = (old_space.shape[-1], *old_space.shape[:-1]) @@ -1088,31 +1231,92 @@ def __init__(self, env): low=old_space.low.min(), high=old_space.high.max(), shape=new_shape, dtype=old_space.dtype ) - def _process_obs(self, obs): + def _process_obs(self, obs: np.ndarray) -> np.ndarray: + """ + Overview: + Transpose the observation into the format (channels, height, width). + Arguments: + - obs (:obj:`np.ndarray`): The observation to transform. + Returns: + - obs (:obj:`np.ndarray`): The transposed observation. + """ obs = to_ndarray(obs) obs = np.transpose(obs, (2, 0, 1)) return obs - def step(self, action): + def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict]: + """ + Overview: + Execute the given action in the environment, process the observation and return + the new observation, reward, done status, and info. + Arguments: + - action (:obj:`Any`): The action to execute in the environment. + Returns: + - observation (:obj:`np.ndarray`): The processed observation after the action execution. 
+ - reward (:obj:`float`): Amount of reward returned after the action execution. + - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and sometimes + learning). + """ obs, reward, done, info = self.env.step(action) return self._process_obs(obs), reward, done, info - def reset(self): + def reset(self) -> np.ndarray: + """ + Overview: + Resets the state of the environment and returns the processed observation. + Returns: + - observation (:obj:`np.ndarray`): The processed observation after reset. + """ obs = self.env.reset() return self._process_obs(obs) class TimeLimitWrapper(gym.Wrapper): + """ + Overview: + This class is used to enforce a time limit on the environment. + Interfaces: + __init__, reset, step + """ - def __init__(self, env, max_limit): + def __init__(self, env: gym.Env, max_limit: int) -> None: + """ + Overview: + Initialize the TimeLimitWrapper, setting up the maximum limit of time steps. + Arguments: + - env (:obj:`gym.Env`): The environment to wrap. + - max_limit (:obj:`int`): The maximum limit of time steps. + """ super().__init__(env) self.max_limit = max_limit - def reset(self): + def reset(self) -> np.ndarray: + """ + Overview: + Resets the state of the environment and the time counter. + Returns: + - observation (:obj:`np.ndarray`): The new observation after reset. + """ self.time_count = 0 return self.env.reset() - def step(self, action): + def step(self, action: Any) -> Tuple[np.ndarray, float, bool, Dict]: + """ + Overview: + Execute the given action in the environment, update the time counter, and + return the new observation, reward, done status and info. + Arguments: + - action (:obj:`Any`): The action to execute in the environment. + Returns: + - observation (:obj:`np.ndarray`): The new observation after the action execution. + - reward (:obj:`float`): Amount of reward returned after the action execution. + - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and sometimes + learning). + """ obs, reward, done, info = self.env.step(action) self.time_count += 1 if self.time_count >= self.max_limit: @@ -1126,10 +1330,21 @@ def step(self, action): class FlatObsWrapper(gym.Wrapper): """ - Note: only suitable for these envs like minigrid. + Overview: + This class is used to flatten the observation space of the environment. + Note: only suitable for environments like minigrid. + Interfaces: + __init__, observation, reset, step """ - def __init__(self, env, maxStrLen=96): + def __init__(self, env: gym.Env, maxStrLen: int = 96) -> None: + """ + Overview: + Initialize the FlatObsWrapper, setup the new observation space. + Arguments: + - env (:obj:`gym.Env`): The environment to wrap. + - maxStrLen (:obj:`int`): The maximum length of mission string, default is 96. + """ super().__init__(env) self.maxStrLen = maxStrLen @@ -1147,7 +1362,16 @@ def __init__(self, env, maxStrLen=96): self.cachedStr: str = None - def observation(self, obs): + def observation(self, obs: Union[np.ndarray, Tuple]) -> np.ndarray: + """ + Overview: + Process the observation, convert the mission into one-hot encoding and concatenate + it with the image data. + Arguments: + - obs (:obj:`Union[np.ndarray, Tuple]`): The raw observation to process. 
+ Returns: + - obs (:obj:`np.ndarray`): The processed observation. + """ if isinstance(obs, tuple): # for compatibility of gymnasium obs = obs[0] image = obs["image"] @@ -1179,27 +1403,70 @@ def observation(self, obs): return obs - def reset(self, *args, **kwargs): + def reset(self, *args, **kwargs) -> np.ndarray: + """ + Overview: + Resets the state of the environment and returns the processed observation. + Returns: + - observation (:obj:`np.ndarray`): The processed observation after reset. + """ obs = self.env.reset(*args, **kwargs) return self.observation(obs) - def step(self, *args, **kwargs): + def step(self, *args, **kwargs) -> Tuple[np.ndarray, float, bool, Dict]: + """ + Overview: + Execute the given action in the environment, and return the processed observation, + reward, done status, and info. + Returns: + - observation (:obj:`np.ndarray`): The processed observation after the action execution. + - reward (:obj:`float`): Amount of reward returned after the action execution. + - done (:obj:`bool`): Whether the episode has ended, in which case further step() calls will return + undefined results. + - info (:obj:`Dict`): Contains auxiliary diagnostic information (helpful for debugging, and sometimes + learning). + """ o, r, d, i = self.env.step(*args, **kwargs) o = self.observation(o) return o, r, d, i class GymToGymnasiumWrapper(gym.Wrapper): + """ + Overview: + This class is used to wrap a gymnasium environment to a gym environment. + Interfaces: + __init__, seed, reset + """ - def __init__(self, env): + def __init__(self, env: gymnasium.Env) -> None: + """ + Overview: + Initialize the GymToGymnasiumWrapper. + Arguments: + - env (:obj:`gymnasium.Env`): The gymnasium environment to wrap. + """ assert isinstance(env, gymnasium.Env), type(env) super().__init__(env) self._seed = None - def seed(self, seed): + def seed(self, seed: int) -> None: + """ + Overview: + Set the seed for the environment. + Arguments: + - seed (:obj:`int`): The seed to set. + """ self._seed = seed - def reset(self): + def reset(self) -> np.ndarray: + """ + Overview: + Resets the state of the environment and returns the new observation. If a seed + was set, use it in the reset. + Returns: + - observation (:obj:`np.ndarray`): The new observation after reset. + """ if self.seed is not None: return self.env.reset(seed=self._seed) else: @@ -1210,20 +1477,31 @@ def reset(self): class AllinObsWrapper(gym.Wrapper): """ Overview: - This wrapper is used in policy DT. - Set a dict {'obs': obs, 'reward': reward} - as the new wrapped observation, - which including the current obs, previous reward. - Interface: - ``__init__``, ``reset``, ``step``, ``seed`` + This wrapper is used in policy ``Decision Transformer``, which is proposed in paper + https://arxiv.org/abs/2106.01345. It sets a dict {'obs': obs, 'reward': reward} + as the new wrapped observation, which includes the current observation and previous reward. + Interfaces: + __init__, reset, step, seed Properties: - - env (:obj:`gym.Env`): the environment to wrap. + - env (:obj:`gym.Env`): The environment to wrap. """ - def __init__(self, env): + def __init__(self, env: gym.Env) -> None: + """ + Overview: + Initialize the AllinObsWrapper. + Arguments: + - env (:obj:`gym.Env`): The environment to wrap. + """ super().__init__(env) - def reset(self): + def reset(self) -> Dict: + """ + Overview: + Resets the state of the environment and returns the new observation. 
+        Returns:
+            - observation (:obj:`Dict`): The new observation after reset, which includes the current observation
+                and a zero-initialized reward.
+        """
         ret = {'obs': self.env.reset(), 'reward': np.array([0])}
         self._observation_space = gym.spaces.Dict(
             {
@@ -1233,24 +1511,45 @@ def reset(self):
         )
         return ret

-    def step(self, action):
+    def step(self, action: Any):
+        """
+        Overview:
+            Execute the given action in the environment, and return the new observation,
+            reward, done status, and info.
+        Arguments:
+            - action (:obj:`Any`): The action to execute in the environment.
+        Returns:
+            - timestep (:obj:`BaseEnvTimestep`): The timestep after the action execution.
+        """
         obs, reward, done, info = self.env.step(action)
         obs = {'obs': obs, 'reward': reward}
         from ding.envs import BaseEnvTimestep
         return BaseEnvTimestep(obs, reward, done, info)

     def seed(self, seed: int, dynamic_seed: bool = True) -> None:
+        """
+        Overview:
+            Set the seed for the environment.
+        Arguments:
+            - seed (:obj:`int`): The seed to set.
+            - dynamic_seed (:obj:`bool`): Whether to use dynamic seed, default is True.
+        """
         self.env.seed(seed, dynamic_seed)


-def update_shape(obs_shape, act_shape, rew_shape, wrapper_names):
+def update_shape(obs_shape: Any, act_shape: Any, rew_shape: Any, wrapper_names: List[str]) -> Tuple[Any, Any, Any]:
     """
     Overview:
-        Get new shape of observation, acton, and reward given the wrapper.
+        Get new shapes of observation, action, and reward given the wrappers.
     Arguments:
-        obs_shape (:obj:`Any`), act_shape (:obj:`Any`), rew_shape (:obj:`Any`), wrapper_names (:obj:`Any`)
+        - obs_shape (:obj:`Any`): The original shape of observation.
+        - act_shape (:obj:`Any`): The original shape of action.
+        - rew_shape (:obj:`Any`): The original shape of reward.
+        - wrapper_names (:obj:`List[str]`): The names of the wrappers.
     Returns:
-        obs_shape (:obj:`Any`), act_shape (:obj:`Any`), rew_shape (:obj:`Any`)
+        - obs_shape (:obj:`Any`): The new shape of observation.
+        - act_shape (:obj:`Any`): The new shape of action.
+        - rew_shape (:obj:`Any`): The new shape of reward.
     """
     for wrapper_name in wrapper_names:
         if wrapper_name:
@@ -1261,16 +1560,15 @@ def update_shape(obs_shape, act_shape, rew_shape, wrapper_names):
     return obs_shape, act_shape, rew_shape


-def create_env_wrapper(env: gym.Env, env_wrapper_cfg: dict) -> gym.Wrapper:
+def create_env_wrapper(env: gym.Env, env_wrapper_cfg: EasyDict) -> gym.Wrapper:
     """
     Overview:
-        Create an env wrapper according to env_wrapper_cfg and env instance.
+        Create an environment wrapper according to the environment wrapper configuration and the environment instance.
     Arguments:
-        - env (:obj:`gym.Env`): An env instance to be wrapped.
-        - env_wrapper_cfg (:obj:`EasyDict`): Env wrapper config.
-    ArgumentsKeys:
-        - `env_wrapper_cfg`'s necessary: `type`
-        - `env_wrapper_cfg`'s optional: `import_names`, `kwargs`
+        - env (:obj:`gym.Env`): The environment instance to be wrapped.
+        - env_wrapper_cfg (:obj:`EasyDict`): The configuration for the environment wrapper. It must contain the
+            key ``type``, and may optionally contain ``import_names`` and ``kwargs``.
+    Returns:
+        - env (:obj:`gym.Wrapper`): The wrapped environment instance.
     """
     env_wrapper_cfg = copy.deepcopy(env_wrapper_cfg)
     if 'import_names' in env_wrapper_cfg:
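Taken together, the wrappers documented above are ordinary ``gym.Wrapper`` subclasses: they can be stacked by hand around a raw environment, or instantiated one at a time from a config via ``create_env_wrapper``. The snippet below is a minimal usage sketch of both routes. It assumes a gym installation with the Atari extra and ``BreakoutNoFrameskip-v4`` available, that the wrappers are importable from ``ding.envs.env_wrappers``, and it uses an illustrative registry name (``'episodic_life'``) for the config route; the exact import path and registered names may differ between DI-engine versions.

    # Usage sketch: compose the wrappers above around a raw Atari environment.
    # Assumptions (not taken from this diff): import path ding.envs.env_wrappers,
    # the Breakout ROM is installed, and 'episodic_life' is a registered wrapper name.
    import gym
    from easydict import EasyDict

    from ding.envs.env_wrappers import (
        EpisodicLifeWrapper, FireResetWrapper, TimeLimitWrapper, create_env_wrapper
    )

    # Route 1: stack the wrappers manually. Loss of life ends the episode, a FIRE
    # action is taken on every reset, and episodes are capped at 10000 steps.
    env = gym.make('BreakoutNoFrameskip-v4')
    env = EpisodicLifeWrapper(env)
    env = FireResetWrapper(env)
    env = TimeLimitWrapper(env, max_limit=10000)

    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
    env.close()

    # Route 2: build a single wrapper from a config. The `type` key is required;
    # `import_names` and `kwargs` are optional (see create_env_wrapper above).
    wrapped = create_env_wrapper(gym.make('BreakoutNoFrameskip-v4'), EasyDict(type='episodic_life'))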